Files
swe-latn-100mb-after-ppt-sh…/checkpoint-24000/trainer_state.json
ModelHub XC 380cd5d987 初始化项目,由ModelHub XC社区提供模型
Model: fpadovani/swe-latn-100mb-after-ppt-shuff-dyck-100mb-ckpt500_seed3407
Source: Original Platform
2026-06-30 03:57:20 +08:00

48123 lines
1.3 MiB

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0162990968283974,
"eval_steps": 3000,
"global_step": 24000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 4.791903686523438,
"epoch": 0.0004200798151648813,
"grad_norm": 14.6875,
"learning_rate": 2e-06,
"loss": 14.3853,
"mean_token_accuracy": 0.0001464128843508661,
"num_tokens": 8348.0,
"step": 5
},
{
"entropy": 4.770248889923096,
"epoch": 0.0008401596303297626,
"grad_norm": 13.5,
"learning_rate": 4.5e-06,
"loss": 14.3427,
"mean_token_accuracy": 0.00011325028026476502,
"num_tokens": 17465.0,
"step": 10
},
{
"entropy": 4.853783750534058,
"epoch": 0.001260239445494644,
"grad_norm": 19.875,
"learning_rate": 7e-06,
"loss": 14.1261,
"mean_token_accuracy": 0.00010341261513531208,
"num_tokens": 26627.0,
"step": 15
},
{
"entropy": 5.086610746383667,
"epoch": 0.0016803192606595252,
"grad_norm": 28.375,
"learning_rate": 9.5e-06,
"loss": 13.5157,
"mean_token_accuracy": 0.0,
"num_tokens": 36069.0,
"step": 20
},
{
"entropy": 7.3999251365661625,
"epoch": 0.002100399075824407,
"grad_norm": 12.0625,
"learning_rate": 1.2e-05,
"loss": 11.7927,
"mean_token_accuracy": 0.0,
"num_tokens": 44967.0,
"step": 25
},
{
"entropy": 10.45841064453125,
"epoch": 0.002520478890989288,
"grad_norm": 3.015625,
"learning_rate": 1.4500000000000002e-05,
"loss": 10.7852,
"mean_token_accuracy": 9.009009227156639e-05,
"num_tokens": 55132.0,
"step": 30
},
{
"entropy": 10.697011375427246,
"epoch": 0.0029405587061541692,
"grad_norm": 2.75,
"learning_rate": 1.7000000000000003e-05,
"loss": 10.5553,
"mean_token_accuracy": 0.006346415734151378,
"num_tokens": 65141.0,
"step": 35
},
{
"entropy": 10.697507858276367,
"epoch": 0.0033606385213190504,
"grad_norm": 2.40625,
"learning_rate": 1.95e-05,
"loss": 10.2781,
"mean_token_accuracy": 0.039602359384298326,
"num_tokens": 74007.0,
"step": 40
},
{
"entropy": 10.668922233581544,
"epoch": 0.003780718336483932,
"grad_norm": 2.296875,
"learning_rate": 2.2e-05,
"loss": 9.9863,
"mean_token_accuracy": 0.04357385709881782,
"num_tokens": 83736.0,
"step": 45
},
{
"entropy": 10.582208251953125,
"epoch": 0.004200798151648814,
"grad_norm": 2.09375,
"learning_rate": 2.4500000000000003e-05,
"loss": 9.7817,
"mean_token_accuracy": 0.044408387318253514,
"num_tokens": 92525.0,
"step": 50
},
{
"entropy": 10.561583709716796,
"epoch": 0.004620877966813695,
"grad_norm": 2.109375,
"learning_rate": 2.7e-05,
"loss": 9.6616,
"mean_token_accuracy": 0.042681990377604964,
"num_tokens": 102015.0,
"step": 55
},
{
"entropy": 10.598667430877686,
"epoch": 0.005040957781978576,
"grad_norm": 1.8671875,
"learning_rate": 2.95e-05,
"loss": 9.6152,
"mean_token_accuracy": 0.03954915180802345,
"num_tokens": 110887.0,
"step": 60
},
{
"entropy": 10.602967929840087,
"epoch": 0.005461037597143457,
"grad_norm": 2.0,
"learning_rate": 3.2e-05,
"loss": 9.5219,
"mean_token_accuracy": 0.04232911877334118,
"num_tokens": 120442.0,
"step": 65
},
{
"entropy": 10.583982849121094,
"epoch": 0.0058811174123083385,
"grad_norm": 2.0625,
"learning_rate": 3.4500000000000005e-05,
"loss": 9.4106,
"mean_token_accuracy": 0.041194649040699007,
"num_tokens": 129297.0,
"step": 70
},
{
"entropy": 10.561800956726074,
"epoch": 0.00630119722747322,
"grad_norm": 1.921875,
"learning_rate": 3.7e-05,
"loss": 9.3205,
"mean_token_accuracy": 0.04238409399986267,
"num_tokens": 138305.0,
"step": 75
},
{
"entropy": 10.55169849395752,
"epoch": 0.006721277042638101,
"grad_norm": 2.03125,
"learning_rate": 3.95e-05,
"loss": 9.2587,
"mean_token_accuracy": 0.04415187537670136,
"num_tokens": 147640.0,
"step": 80
},
{
"entropy": 10.53596887588501,
"epoch": 0.007141356857802983,
"grad_norm": 1.9140625,
"learning_rate": 4.2000000000000004e-05,
"loss": 9.156,
"mean_token_accuracy": 0.05057476349174976,
"num_tokens": 157633.0,
"step": 85
},
{
"entropy": 10.498776817321778,
"epoch": 0.007561436672967864,
"grad_norm": 1.6796875,
"learning_rate": 4.45e-05,
"loss": 9.1055,
"mean_token_accuracy": 0.043933939374983313,
"num_tokens": 167984.0,
"step": 90
},
{
"entropy": 10.410062026977538,
"epoch": 0.007981516488132745,
"grad_norm": 1.84375,
"learning_rate": 4.7000000000000004e-05,
"loss": 8.9796,
"mean_token_accuracy": 0.058869444206357,
"num_tokens": 176984.0,
"step": 95
},
{
"entropy": 10.331822967529297,
"epoch": 0.008401596303297627,
"grad_norm": 2.265625,
"learning_rate": 4.9500000000000004e-05,
"loss": 8.8274,
"mean_token_accuracy": 0.05482011772692204,
"num_tokens": 185931.0,
"step": 100
},
{
"entropy": 10.239299774169922,
"epoch": 0.008821676118462508,
"grad_norm": 2.265625,
"learning_rate": 5.2e-05,
"loss": 8.7979,
"mean_token_accuracy": 0.05091267079114914,
"num_tokens": 195065.0,
"step": 105
},
{
"entropy": 10.274462223052979,
"epoch": 0.00924175593362739,
"grad_norm": 1.609375,
"learning_rate": 5.45e-05,
"loss": 8.653,
"mean_token_accuracy": 0.053716998919844626,
"num_tokens": 203687.0,
"step": 110
},
{
"entropy": 10.133169841766357,
"epoch": 0.00966183574879227,
"grad_norm": 1.4453125,
"learning_rate": 5.7e-05,
"loss": 8.5503,
"mean_token_accuracy": 0.057077201455831526,
"num_tokens": 212847.0,
"step": 115
},
{
"entropy": 10.040187549591064,
"epoch": 0.010081915563957152,
"grad_norm": 1.6796875,
"learning_rate": 5.9499999999999996e-05,
"loss": 8.4092,
"mean_token_accuracy": 0.05596162416040897,
"num_tokens": 222593.0,
"step": 120
},
{
"entropy": 9.901103496551514,
"epoch": 0.010501995379122032,
"grad_norm": 1.4140625,
"learning_rate": 6.2e-05,
"loss": 8.2565,
"mean_token_accuracy": 0.05481334701180458,
"num_tokens": 231174.0,
"step": 125
},
{
"entropy": 9.70338020324707,
"epoch": 0.010922075194286915,
"grad_norm": 1.234375,
"learning_rate": 6.450000000000001e-05,
"loss": 8.1158,
"mean_token_accuracy": 0.058756759762763976,
"num_tokens": 239833.0,
"step": 130
},
{
"entropy": 9.529671382904052,
"epoch": 0.011342155009451797,
"grad_norm": 1.1640625,
"learning_rate": 6.7e-05,
"loss": 8.1187,
"mean_token_accuracy": 0.05617605000734329,
"num_tokens": 248794.0,
"step": 135
},
{
"entropy": 9.257420444488526,
"epoch": 0.011762234824616677,
"grad_norm": 1.09375,
"learning_rate": 6.950000000000001e-05,
"loss": 8.0561,
"mean_token_accuracy": 0.056496378034353256,
"num_tokens": 257123.0,
"step": 140
},
{
"entropy": 9.085350036621094,
"epoch": 0.012182314639781559,
"grad_norm": 1.0,
"learning_rate": 7.2e-05,
"loss": 7.7656,
"mean_token_accuracy": 0.05573978051543236,
"num_tokens": 266088.0,
"step": 145
},
{
"entropy": 8.806443977355958,
"epoch": 0.01260239445494644,
"grad_norm": 0.81640625,
"learning_rate": 7.45e-05,
"loss": 7.9249,
"mean_token_accuracy": 0.05259395204484463,
"num_tokens": 276074.0,
"step": 150
},
{
"entropy": 8.64165735244751,
"epoch": 0.013022474270111321,
"grad_norm": 0.82421875,
"learning_rate": 7.7e-05,
"loss": 7.8106,
"mean_token_accuracy": 0.0555482916533947,
"num_tokens": 285280.0,
"step": 155
},
{
"entropy": 8.47070608139038,
"epoch": 0.013442554085276202,
"grad_norm": 0.80859375,
"learning_rate": 7.950000000000001e-05,
"loss": 7.8078,
"mean_token_accuracy": 0.05328563526272774,
"num_tokens": 296115.0,
"step": 160
},
{
"entropy": 8.336323738098145,
"epoch": 0.013862633900441084,
"grad_norm": 0.83984375,
"learning_rate": 8.2e-05,
"loss": 7.6742,
"mean_token_accuracy": 0.055481255799531934,
"num_tokens": 305483.0,
"step": 165
},
{
"entropy": 8.235133361816406,
"epoch": 0.014282713715605966,
"grad_norm": 0.82421875,
"learning_rate": 8.450000000000001e-05,
"loss": 7.6978,
"mean_token_accuracy": 0.05532576628029347,
"num_tokens": 314000.0,
"step": 170
},
{
"entropy": 8.127510929107666,
"epoch": 0.014702793530770846,
"grad_norm": 0.96484375,
"learning_rate": 8.7e-05,
"loss": 7.6891,
"mean_token_accuracy": 0.05845912620425224,
"num_tokens": 323667.0,
"step": 175
},
{
"entropy": 8.085121822357177,
"epoch": 0.015122873345935728,
"grad_norm": 1.3359375,
"learning_rate": 8.95e-05,
"loss": 7.6917,
"mean_token_accuracy": 0.05741722546517849,
"num_tokens": 332695.0,
"step": 180
},
{
"entropy": 8.051828241348266,
"epoch": 0.015542953161100609,
"grad_norm": 1.1640625,
"learning_rate": 9.2e-05,
"loss": 7.5384,
"mean_token_accuracy": 0.059001700952649117,
"num_tokens": 342428.0,
"step": 185
},
{
"entropy": 8.029651880264282,
"epoch": 0.01596303297626549,
"grad_norm": 0.76953125,
"learning_rate": 9.45e-05,
"loss": 7.6438,
"mean_token_accuracy": 0.05830682367086411,
"num_tokens": 353587.0,
"step": 190
},
{
"entropy": 8.02134084701538,
"epoch": 0.01638311279143037,
"grad_norm": 1.2265625,
"learning_rate": 9.7e-05,
"loss": 7.5846,
"mean_token_accuracy": 0.06251729987561702,
"num_tokens": 362997.0,
"step": 195
},
{
"entropy": 7.912688684463501,
"epoch": 0.016803192606595255,
"grad_norm": 1.0703125,
"learning_rate": 9.95e-05,
"loss": 7.6088,
"mean_token_accuracy": 0.06746274717152119,
"num_tokens": 372346.0,
"step": 200
},
{
"entropy": 8.052432775497437,
"epoch": 0.017223272421760135,
"grad_norm": 1.234375,
"learning_rate": 0.000102,
"loss": 7.4944,
"mean_token_accuracy": 0.06457214206457138,
"num_tokens": 381575.0,
"step": 205
},
{
"entropy": 7.941053867340088,
"epoch": 0.017643352236925015,
"grad_norm": 1.1171875,
"learning_rate": 0.00010449999999999999,
"loss": 7.561,
"mean_token_accuracy": 0.06631600968539715,
"num_tokens": 390706.0,
"step": 210
},
{
"entropy": 7.876999855041504,
"epoch": 0.018063432052089896,
"grad_norm": 0.9140625,
"learning_rate": 0.000107,
"loss": 7.5529,
"mean_token_accuracy": 0.0665743712335825,
"num_tokens": 400000.0,
"step": 215
},
{
"entropy": 7.916263389587402,
"epoch": 0.01848351186725478,
"grad_norm": 0.9765625,
"learning_rate": 0.0001095,
"loss": 7.5184,
"mean_token_accuracy": 0.06940894797444344,
"num_tokens": 409447.0,
"step": 220
},
{
"entropy": 7.911146783828736,
"epoch": 0.01890359168241966,
"grad_norm": 1.0859375,
"learning_rate": 0.000112,
"loss": 7.4837,
"mean_token_accuracy": 0.06915329694747925,
"num_tokens": 418417.0,
"step": 225
},
{
"entropy": 7.839998531341553,
"epoch": 0.01932367149758454,
"grad_norm": 1.2265625,
"learning_rate": 0.0001145,
"loss": 7.4502,
"mean_token_accuracy": 0.06963084377348423,
"num_tokens": 427619.0,
"step": 230
},
{
"entropy": 7.8366899490356445,
"epoch": 0.019743751312749424,
"grad_norm": 1.15625,
"learning_rate": 0.00011700000000000001,
"loss": 7.5047,
"mean_token_accuracy": 0.06452232897281647,
"num_tokens": 437931.0,
"step": 235
},
{
"entropy": 7.8866432189941404,
"epoch": 0.020163831127914304,
"grad_norm": 1.0234375,
"learning_rate": 0.00011949999999999999,
"loss": 7.546,
"mean_token_accuracy": 0.06933673955500126,
"num_tokens": 447595.0,
"step": 240
},
{
"entropy": 7.882558917999267,
"epoch": 0.020583910943079185,
"grad_norm": 1.03125,
"learning_rate": 0.000122,
"loss": 7.4174,
"mean_token_accuracy": 0.06523367092013359,
"num_tokens": 457062.0,
"step": 245
},
{
"entropy": 7.839541244506836,
"epoch": 0.021003990758244065,
"grad_norm": 1.6796875,
"learning_rate": 0.0001245,
"loss": 7.5086,
"mean_token_accuracy": 0.06696470156311989,
"num_tokens": 466191.0,
"step": 250
},
{
"entropy": 7.8005016326904295,
"epoch": 0.02142407057340895,
"grad_norm": 1.2265625,
"learning_rate": 0.000127,
"loss": 7.451,
"mean_token_accuracy": 0.06885578371584415,
"num_tokens": 475693.0,
"step": 255
},
{
"entropy": 7.889404010772705,
"epoch": 0.02184415038857383,
"grad_norm": 1.0625,
"learning_rate": 0.0001295,
"loss": 7.4804,
"mean_token_accuracy": 0.06973730027675629,
"num_tokens": 485173.0,
"step": 260
},
{
"entropy": 7.815570974349976,
"epoch": 0.02226423020373871,
"grad_norm": 1.1640625,
"learning_rate": 0.000132,
"loss": 7.3829,
"mean_token_accuracy": 0.07314893454313279,
"num_tokens": 493985.0,
"step": 265
},
{
"entropy": 7.816596174240113,
"epoch": 0.022684310018903593,
"grad_norm": 1.5546875,
"learning_rate": 0.00013450000000000002,
"loss": 7.4224,
"mean_token_accuracy": 0.07099288031458854,
"num_tokens": 502837.0,
"step": 270
},
{
"entropy": 7.785638856887817,
"epoch": 0.023104389834068473,
"grad_norm": 1.1796875,
"learning_rate": 0.00013700000000000002,
"loss": 7.3589,
"mean_token_accuracy": 0.07525993324816227,
"num_tokens": 511503.0,
"step": 275
},
{
"entropy": 7.709826803207397,
"epoch": 0.023524469649233354,
"grad_norm": 1.0,
"learning_rate": 0.0001395,
"loss": 7.5429,
"mean_token_accuracy": 0.07307531610131264,
"num_tokens": 521499.0,
"step": 280
},
{
"entropy": 7.790460062026978,
"epoch": 0.023944549464398234,
"grad_norm": 1.234375,
"learning_rate": 0.00014199999999999998,
"loss": 7.3077,
"mean_token_accuracy": 0.07697631418704987,
"num_tokens": 530067.0,
"step": 285
},
{
"entropy": 7.686764717102051,
"epoch": 0.024364629279563118,
"grad_norm": 1.03125,
"learning_rate": 0.0001445,
"loss": 7.3034,
"mean_token_accuracy": 0.0773412711918354,
"num_tokens": 538559.0,
"step": 290
},
{
"entropy": 7.772013425827026,
"epoch": 0.024784709094728,
"grad_norm": 1.15625,
"learning_rate": 0.000147,
"loss": 7.4929,
"mean_token_accuracy": 0.07045175209641456,
"num_tokens": 547288.0,
"step": 295
},
{
"entropy": 7.7699915885925295,
"epoch": 0.02520478890989288,
"grad_norm": 0.91796875,
"learning_rate": 0.0001495,
"loss": 7.3747,
"mean_token_accuracy": 0.07021207436919212,
"num_tokens": 557269.0,
"step": 300
},
{
"entropy": 7.7625007152557375,
"epoch": 0.025624868725057762,
"grad_norm": 1.0625,
"learning_rate": 0.000152,
"loss": 7.3902,
"mean_token_accuracy": 0.07074716240167618,
"num_tokens": 567280.0,
"step": 305
},
{
"entropy": 7.732419967651367,
"epoch": 0.026044948540222643,
"grad_norm": 1.296875,
"learning_rate": 0.00015450000000000001,
"loss": 7.2546,
"mean_token_accuracy": 0.07402584217488765,
"num_tokens": 576609.0,
"step": 310
},
{
"entropy": 7.543032264709472,
"epoch": 0.026465028355387523,
"grad_norm": 1.3203125,
"learning_rate": 0.000157,
"loss": 7.1466,
"mean_token_accuracy": 0.08402636647224426,
"num_tokens": 586053.0,
"step": 315
},
{
"entropy": 7.557563781738281,
"epoch": 0.026885108170552403,
"grad_norm": 1.5703125,
"learning_rate": 0.0001595,
"loss": 7.353,
"mean_token_accuracy": 0.07976382523775101,
"num_tokens": 594649.0,
"step": 320
},
{
"entropy": 7.672474765777588,
"epoch": 0.027305187985717287,
"grad_norm": 1.3828125,
"learning_rate": 0.000162,
"loss": 7.299,
"mean_token_accuracy": 0.0721965666860342,
"num_tokens": 603445.0,
"step": 325
},
{
"entropy": 7.649927425384521,
"epoch": 0.027725267800882167,
"grad_norm": 1.21875,
"learning_rate": 0.00016450000000000001,
"loss": 7.3816,
"mean_token_accuracy": 0.07200038619339466,
"num_tokens": 613611.0,
"step": 330
},
{
"entropy": 7.817860317230225,
"epoch": 0.028145347616047048,
"grad_norm": 1.8828125,
"learning_rate": 0.00016700000000000002,
"loss": 7.5608,
"mean_token_accuracy": 0.07428344339132309,
"num_tokens": 623024.0,
"step": 335
},
{
"entropy": 7.615134525299072,
"epoch": 0.02856542743121193,
"grad_norm": 1.140625,
"learning_rate": 0.00016950000000000003,
"loss": 7.241,
"mean_token_accuracy": 0.08163552284240723,
"num_tokens": 631624.0,
"step": 340
},
{
"entropy": 7.6425103664398195,
"epoch": 0.028985507246376812,
"grad_norm": 1.25,
"learning_rate": 0.00017199999999999998,
"loss": 7.1878,
"mean_token_accuracy": 0.08015607595443726,
"num_tokens": 640473.0,
"step": 345
},
{
"entropy": 7.680424308776855,
"epoch": 0.029405587061541692,
"grad_norm": 1.171875,
"learning_rate": 0.00017449999999999999,
"loss": 7.3678,
"mean_token_accuracy": 0.07545013800263405,
"num_tokens": 649692.0,
"step": 350
},
{
"entropy": 7.654160070419311,
"epoch": 0.029825666876706573,
"grad_norm": 1.640625,
"learning_rate": 0.000177,
"loss": 7.319,
"mean_token_accuracy": 0.07563115619122981,
"num_tokens": 658236.0,
"step": 355
},
{
"entropy": 7.521349573135376,
"epoch": 0.030245746691871456,
"grad_norm": 0.96875,
"learning_rate": 0.0001795,
"loss": 7.1117,
"mean_token_accuracy": 0.08184256851673126,
"num_tokens": 667175.0,
"step": 360
},
{
"entropy": 7.665167379379272,
"epoch": 0.030665826507036337,
"grad_norm": 1.015625,
"learning_rate": 0.000182,
"loss": 7.3978,
"mean_token_accuracy": 0.07722308114171028,
"num_tokens": 676456.0,
"step": 365
},
{
"entropy": 7.747646379470825,
"epoch": 0.031085906322201217,
"grad_norm": 1.0078125,
"learning_rate": 0.0001845,
"loss": 7.3621,
"mean_token_accuracy": 0.07609389498829841,
"num_tokens": 686881.0,
"step": 370
},
{
"entropy": 7.486468362808227,
"epoch": 0.0315059861373661,
"grad_norm": 0.921875,
"learning_rate": 0.000187,
"loss": 7.1472,
"mean_token_accuracy": 0.08116972967982292,
"num_tokens": 696045.0,
"step": 375
},
{
"entropy": 7.507930612564087,
"epoch": 0.03192606595253098,
"grad_norm": 1.3046875,
"learning_rate": 0.0001895,
"loss": 7.1262,
"mean_token_accuracy": 0.08457142487168312,
"num_tokens": 704729.0,
"step": 380
},
{
"entropy": 7.4088475704193115,
"epoch": 0.032346145767695865,
"grad_norm": 0.92578125,
"learning_rate": 0.000192,
"loss": 7.1832,
"mean_token_accuracy": 0.07680457159876823,
"num_tokens": 714331.0,
"step": 385
},
{
"entropy": 7.597209930419922,
"epoch": 0.03276622558286074,
"grad_norm": 1.25,
"learning_rate": 0.0001945,
"loss": 7.139,
"mean_token_accuracy": 0.08005330711603165,
"num_tokens": 722788.0,
"step": 390
},
{
"entropy": 7.544012260437012,
"epoch": 0.033186305398025626,
"grad_norm": 1.28125,
"learning_rate": 0.00019700000000000002,
"loss": 7.2038,
"mean_token_accuracy": 0.08137390315532685,
"num_tokens": 731417.0,
"step": 395
},
{
"entropy": 7.463483619689941,
"epoch": 0.03360638521319051,
"grad_norm": 1.0703125,
"learning_rate": 0.00019950000000000002,
"loss": 7.1774,
"mean_token_accuracy": 0.08162600994110107,
"num_tokens": 741034.0,
"step": 400
},
{
"entropy": 7.472280406951905,
"epoch": 0.034026465028355386,
"grad_norm": 1.15625,
"learning_rate": 0.000202,
"loss": 7.1863,
"mean_token_accuracy": 0.08051239103078842,
"num_tokens": 749596.0,
"step": 405
},
{
"entropy": 7.539561700820923,
"epoch": 0.03444654484352027,
"grad_norm": 0.97265625,
"learning_rate": 0.00020449999999999998,
"loss": 7.1293,
"mean_token_accuracy": 0.08054611459374428,
"num_tokens": 758931.0,
"step": 410
},
{
"entropy": 7.323099613189697,
"epoch": 0.03486662465868515,
"grad_norm": 0.953125,
"learning_rate": 0.000207,
"loss": 7.0321,
"mean_token_accuracy": 0.08574115931987762,
"num_tokens": 767534.0,
"step": 415
},
{
"entropy": 7.436507081985473,
"epoch": 0.03528670447385003,
"grad_norm": 1.390625,
"learning_rate": 0.0002095,
"loss": 7.1087,
"mean_token_accuracy": 0.0788518838584423,
"num_tokens": 776456.0,
"step": 420
},
{
"entropy": 7.4387647151947025,
"epoch": 0.035706784289014915,
"grad_norm": 1.03125,
"learning_rate": 0.000212,
"loss": 7.1659,
"mean_token_accuracy": 0.07921701893210412,
"num_tokens": 786172.0,
"step": 425
},
{
"entropy": 7.382072401046753,
"epoch": 0.03612686410417979,
"grad_norm": 1.1171875,
"learning_rate": 0.0002145,
"loss": 7.045,
"mean_token_accuracy": 0.08349293395876885,
"num_tokens": 795081.0,
"step": 430
},
{
"entropy": 7.354331922531128,
"epoch": 0.036546943919344675,
"grad_norm": 1.390625,
"learning_rate": 0.00021700000000000002,
"loss": 7.0961,
"mean_token_accuracy": 0.07894284576177597,
"num_tokens": 804259.0,
"step": 435
},
{
"entropy": 7.416359519958496,
"epoch": 0.03696702373450956,
"grad_norm": 1.25,
"learning_rate": 0.0002195,
"loss": 7.1173,
"mean_token_accuracy": 0.08117417171597481,
"num_tokens": 813463.0,
"step": 440
},
{
"entropy": 7.3510298252105715,
"epoch": 0.037387103549674436,
"grad_norm": 1.171875,
"learning_rate": 0.000222,
"loss": 7.0365,
"mean_token_accuracy": 0.08668158128857613,
"num_tokens": 823029.0,
"step": 445
},
{
"entropy": 7.395490074157715,
"epoch": 0.03780718336483932,
"grad_norm": 1.1484375,
"learning_rate": 0.0002245,
"loss": 7.0882,
"mean_token_accuracy": 0.07999400310218334,
"num_tokens": 832902.0,
"step": 450
},
{
"entropy": 7.301269912719727,
"epoch": 0.0382272631800042,
"grad_norm": 0.99609375,
"learning_rate": 0.00022700000000000002,
"loss": 7.0386,
"mean_token_accuracy": 0.08621552512049675,
"num_tokens": 842162.0,
"step": 455
},
{
"entropy": 7.3866761207580565,
"epoch": 0.03864734299516908,
"grad_norm": 1.1171875,
"learning_rate": 0.00022950000000000002,
"loss": 7.0749,
"mean_token_accuracy": 0.08230168521404266,
"num_tokens": 852328.0,
"step": 460
},
{
"entropy": 7.298245000839233,
"epoch": 0.039067422810333964,
"grad_norm": 1.03125,
"learning_rate": 0.00023200000000000003,
"loss": 7.0332,
"mean_token_accuracy": 0.08720984831452369,
"num_tokens": 860929.0,
"step": 465
},
{
"entropy": 7.339401197433472,
"epoch": 0.03948750262549885,
"grad_norm": 1.3046875,
"learning_rate": 0.00023449999999999998,
"loss": 7.1147,
"mean_token_accuracy": 0.07953860089182854,
"num_tokens": 869144.0,
"step": 470
},
{
"entropy": 7.407509994506836,
"epoch": 0.039907582440663725,
"grad_norm": 1.1328125,
"learning_rate": 0.000237,
"loss": 7.0553,
"mean_token_accuracy": 0.08522386401891709,
"num_tokens": 877447.0,
"step": 475
},
{
"entropy": 7.326080799102783,
"epoch": 0.04032766225582861,
"grad_norm": 1.1875,
"learning_rate": 0.0002395,
"loss": 7.0219,
"mean_token_accuracy": 0.07903883457183838,
"num_tokens": 887020.0,
"step": 480
},
{
"entropy": 7.214662790298462,
"epoch": 0.040747742070993485,
"grad_norm": 1.1953125,
"learning_rate": 0.000242,
"loss": 7.0658,
"mean_token_accuracy": 0.08200340047478676,
"num_tokens": 895937.0,
"step": 485
},
{
"entropy": 7.282938385009766,
"epoch": 0.04116782188615837,
"grad_norm": 1.109375,
"learning_rate": 0.0002445,
"loss": 7.0681,
"mean_token_accuracy": 0.07768266201019287,
"num_tokens": 905446.0,
"step": 490
},
{
"entropy": 7.278123235702514,
"epoch": 0.04158790170132325,
"grad_norm": 1.234375,
"learning_rate": 0.000247,
"loss": 6.9754,
"mean_token_accuracy": 0.08815655037760735,
"num_tokens": 914547.0,
"step": 495
},
{
"entropy": 7.246780204772949,
"epoch": 0.04200798151648813,
"grad_norm": 1.1171875,
"learning_rate": 0.0002495,
"loss": 6.971,
"mean_token_accuracy": 0.08642620444297791,
"num_tokens": 922900.0,
"step": 500
},
{
"entropy": 7.250076103210449,
"epoch": 0.042428061331653014,
"grad_norm": 1.046875,
"learning_rate": 0.000252,
"loss": 6.9895,
"mean_token_accuracy": 0.09132884815335274,
"num_tokens": 930876.0,
"step": 505
},
{
"entropy": 7.281206130981445,
"epoch": 0.0428481411468179,
"grad_norm": 1.0703125,
"learning_rate": 0.0002545,
"loss": 7.0298,
"mean_token_accuracy": 0.08785640895366668,
"num_tokens": 939871.0,
"step": 510
},
{
"entropy": 7.217110443115234,
"epoch": 0.043268220961982774,
"grad_norm": 1.1640625,
"learning_rate": 0.000257,
"loss": 7.0255,
"mean_token_accuracy": 0.08548255637288094,
"num_tokens": 948673.0,
"step": 515
},
{
"entropy": 7.166579723358154,
"epoch": 0.04368830077714766,
"grad_norm": 1.296875,
"learning_rate": 0.0002595,
"loss": 6.9755,
"mean_token_accuracy": 0.08237149193882942,
"num_tokens": 957603.0,
"step": 520
},
{
"entropy": 7.258489608764648,
"epoch": 0.04410838059231254,
"grad_norm": 1.1015625,
"learning_rate": 0.000262,
"loss": 7.0438,
"mean_token_accuracy": 0.08337677642703056,
"num_tokens": 967731.0,
"step": 525
},
{
"entropy": 7.215053987503052,
"epoch": 0.04452846040747742,
"grad_norm": 1.34375,
"learning_rate": 0.00026450000000000003,
"loss": 7.0538,
"mean_token_accuracy": 0.08425389714539051,
"num_tokens": 977427.0,
"step": 530
},
{
"entropy": 7.323162364959717,
"epoch": 0.0449485402226423,
"grad_norm": 1.4296875,
"learning_rate": 0.00026700000000000004,
"loss": 7.0242,
"mean_token_accuracy": 0.0810987412929535,
"num_tokens": 986758.0,
"step": 535
},
{
"entropy": 7.260653781890869,
"epoch": 0.045368620037807186,
"grad_norm": 1.265625,
"learning_rate": 0.00026950000000000005,
"loss": 7.0153,
"mean_token_accuracy": 0.0921817146241665,
"num_tokens": 996377.0,
"step": 540
},
{
"entropy": 7.14100399017334,
"epoch": 0.04578869985297206,
"grad_norm": 1.171875,
"learning_rate": 0.00027200000000000005,
"loss": 7.0719,
"mean_token_accuracy": 0.07598314173519612,
"num_tokens": 1006483.0,
"step": 545
},
{
"entropy": 7.1814124584198,
"epoch": 0.04620877966813695,
"grad_norm": 0.99609375,
"learning_rate": 0.0002745,
"loss": 6.9586,
"mean_token_accuracy": 0.08432785160839558,
"num_tokens": 1016132.0,
"step": 550
},
{
"entropy": 7.212322998046875,
"epoch": 0.04662885948330183,
"grad_norm": 1.0625,
"learning_rate": 0.000277,
"loss": 6.9099,
"mean_token_accuracy": 0.08570380732417107,
"num_tokens": 1024970.0,
"step": 555
},
{
"entropy": 7.235566568374634,
"epoch": 0.04704893929846671,
"grad_norm": 1.0703125,
"learning_rate": 0.0002795,
"loss": 6.986,
"mean_token_accuracy": 0.08736011460423469,
"num_tokens": 1034335.0,
"step": 560
},
{
"entropy": 7.172399663925171,
"epoch": 0.04746901911363159,
"grad_norm": 1.0859375,
"learning_rate": 0.00028199999999999997,
"loss": 7.0432,
"mean_token_accuracy": 0.09397755041718484,
"num_tokens": 1043954.0,
"step": 565
},
{
"entropy": 7.211180973052978,
"epoch": 0.04788909892879647,
"grad_norm": 1.0625,
"learning_rate": 0.0002845,
"loss": 6.9855,
"mean_token_accuracy": 0.08458798602223397,
"num_tokens": 1053554.0,
"step": 570
},
{
"entropy": 7.182736825942993,
"epoch": 0.04830917874396135,
"grad_norm": 1.1875,
"learning_rate": 0.000287,
"loss": 6.9441,
"mean_token_accuracy": 0.0878808081150055,
"num_tokens": 1062008.0,
"step": 575
},
{
"entropy": 7.098301124572754,
"epoch": 0.048729258559126236,
"grad_norm": 1.25,
"learning_rate": 0.0002895,
"loss": 7.0042,
"mean_token_accuracy": 0.09225907325744628,
"num_tokens": 1070740.0,
"step": 580
},
{
"entropy": 7.182776641845703,
"epoch": 0.04914933837429111,
"grad_norm": 1.328125,
"learning_rate": 0.000292,
"loss": 6.9919,
"mean_token_accuracy": 0.08668612986803055,
"num_tokens": 1079681.0,
"step": 585
},
{
"entropy": 7.184729337692261,
"epoch": 0.049569418189456,
"grad_norm": 1.3046875,
"learning_rate": 0.0002945,
"loss": 6.8713,
"mean_token_accuracy": 0.08976615592837334,
"num_tokens": 1088979.0,
"step": 590
},
{
"entropy": 7.06590256690979,
"epoch": 0.04998949800462088,
"grad_norm": 1.28125,
"learning_rate": 0.000297,
"loss": 6.8795,
"mean_token_accuracy": 0.09069397076964378,
"num_tokens": 1097870.0,
"step": 595
},
{
"entropy": 7.103166151046753,
"epoch": 0.05040957781978576,
"grad_norm": 1.1875,
"learning_rate": 0.0002995,
"loss": 6.9879,
"mean_token_accuracy": 0.08688322901725769,
"num_tokens": 1107948.0,
"step": 600
},
{
"entropy": 7.1195960521698,
"epoch": 0.05082965763495064,
"grad_norm": 1.125,
"learning_rate": 0.000302,
"loss": 6.9085,
"mean_token_accuracy": 0.08996571898460388,
"num_tokens": 1117032.0,
"step": 605
},
{
"entropy": 7.0409345626831055,
"epoch": 0.051249737450115525,
"grad_norm": 1.2578125,
"learning_rate": 0.0003045,
"loss": 6.8922,
"mean_token_accuracy": 0.09091109931468963,
"num_tokens": 1127834.0,
"step": 610
},
{
"entropy": 7.202378034591675,
"epoch": 0.0516698172652804,
"grad_norm": 1.359375,
"learning_rate": 0.000307,
"loss": 6.9702,
"mean_token_accuracy": 0.10021311864256859,
"num_tokens": 1137382.0,
"step": 615
},
{
"entropy": 6.999694728851319,
"epoch": 0.052089897080445285,
"grad_norm": 1.0703125,
"learning_rate": 0.0003095,
"loss": 6.8129,
"mean_token_accuracy": 0.09540090411901474,
"num_tokens": 1146095.0,
"step": 620
},
{
"entropy": 7.04736361503601,
"epoch": 0.05250997689561017,
"grad_norm": 1.0703125,
"learning_rate": 0.000312,
"loss": 6.8486,
"mean_token_accuracy": 0.09502546936273575,
"num_tokens": 1154981.0,
"step": 625
},
{
"entropy": 6.99720253944397,
"epoch": 0.052930056710775046,
"grad_norm": 1.140625,
"learning_rate": 0.0003145,
"loss": 6.8621,
"mean_token_accuracy": 0.09437942430377007,
"num_tokens": 1164939.0,
"step": 630
},
{
"entropy": 7.151091146469116,
"epoch": 0.05335013652593993,
"grad_norm": 1.3125,
"learning_rate": 0.000317,
"loss": 6.9914,
"mean_token_accuracy": 0.0866759791970253,
"num_tokens": 1174991.0,
"step": 635
},
{
"entropy": 7.180017423629761,
"epoch": 0.05377021634110481,
"grad_norm": 1.0234375,
"learning_rate": 0.0003195,
"loss": 7.0331,
"mean_token_accuracy": 0.0841725155711174,
"num_tokens": 1184885.0,
"step": 640
},
{
"entropy": 6.973786115646362,
"epoch": 0.05419029615626969,
"grad_norm": 1.2421875,
"learning_rate": 0.000322,
"loss": 6.9191,
"mean_token_accuracy": 0.08975687026977539,
"num_tokens": 1193637.0,
"step": 645
},
{
"entropy": 6.9996246814727785,
"epoch": 0.054610375971434574,
"grad_norm": 1.25,
"learning_rate": 0.00032450000000000003,
"loss": 6.7105,
"mean_token_accuracy": 0.09813873320817948,
"num_tokens": 1202188.0,
"step": 650
},
{
"entropy": 7.099790334701538,
"epoch": 0.05503045578659945,
"grad_norm": 1.125,
"learning_rate": 0.00032700000000000003,
"loss": 6.8407,
"mean_token_accuracy": 0.08720196485519409,
"num_tokens": 1210768.0,
"step": 655
},
{
"entropy": 7.041568231582642,
"epoch": 0.055450535601764335,
"grad_norm": 1.1875,
"learning_rate": 0.00032950000000000004,
"loss": 6.8421,
"mean_token_accuracy": 0.0917449563741684,
"num_tokens": 1219819.0,
"step": 660
},
{
"entropy": 7.046403980255127,
"epoch": 0.05587061541692922,
"grad_norm": 0.921875,
"learning_rate": 0.00033200000000000005,
"loss": 6.8979,
"mean_token_accuracy": 0.08456902354955673,
"num_tokens": 1229703.0,
"step": 665
},
{
"entropy": 7.123572778701782,
"epoch": 0.056290695232094096,
"grad_norm": 1.234375,
"learning_rate": 0.00033450000000000005,
"loss": 6.9292,
"mean_token_accuracy": 0.08853036314249038,
"num_tokens": 1238942.0,
"step": 670
},
{
"entropy": 7.159795522689819,
"epoch": 0.05671077504725898,
"grad_norm": 1.25,
"learning_rate": 0.000337,
"loss": 6.9738,
"mean_token_accuracy": 0.08909042924642563,
"num_tokens": 1248943.0,
"step": 675
},
{
"entropy": 6.958240079879761,
"epoch": 0.05713085486242386,
"grad_norm": 1.0703125,
"learning_rate": 0.0003395,
"loss": 6.8634,
"mean_token_accuracy": 0.09144520461559295,
"num_tokens": 1257761.0,
"step": 680
},
{
"entropy": 6.8688782215118405,
"epoch": 0.05755093467758874,
"grad_norm": 1.140625,
"learning_rate": 0.000342,
"loss": 6.7949,
"mean_token_accuracy": 0.08892206028103829,
"num_tokens": 1267216.0,
"step": 685
},
{
"entropy": 7.068194055557251,
"epoch": 0.057971014492753624,
"grad_norm": 1.125,
"learning_rate": 0.00034449999999999997,
"loss": 6.8935,
"mean_token_accuracy": 0.0898799903690815,
"num_tokens": 1277210.0,
"step": 690
},
{
"entropy": 7.016180753707886,
"epoch": 0.05839109430791851,
"grad_norm": 1.0546875,
"learning_rate": 0.000347,
"loss": 6.818,
"mean_token_accuracy": 0.08777436465024949,
"num_tokens": 1285310.0,
"step": 695
},
{
"entropy": 6.991688251495361,
"epoch": 0.058811174123083385,
"grad_norm": 1.265625,
"learning_rate": 0.0003495,
"loss": 6.83,
"mean_token_accuracy": 0.09071314185857773,
"num_tokens": 1294421.0,
"step": 700
},
{
"entropy": 6.878597545623779,
"epoch": 0.05923125393824827,
"grad_norm": 1.1953125,
"learning_rate": 0.000352,
"loss": 6.6618,
"mean_token_accuracy": 0.09866252094507218,
"num_tokens": 1303281.0,
"step": 705
},
{
"entropy": 6.936507320404052,
"epoch": 0.059651333753413145,
"grad_norm": 1.1953125,
"learning_rate": 0.0003545,
"loss": 6.824,
"mean_token_accuracy": 0.0997501090168953,
"num_tokens": 1312280.0,
"step": 710
},
{
"entropy": 6.8826006889343265,
"epoch": 0.06007141356857803,
"grad_norm": 0.98828125,
"learning_rate": 0.000357,
"loss": 6.7922,
"mean_token_accuracy": 0.09014676585793495,
"num_tokens": 1321243.0,
"step": 715
},
{
"entropy": 6.928562927246094,
"epoch": 0.06049149338374291,
"grad_norm": 1.0625,
"learning_rate": 0.0003595,
"loss": 6.8825,
"mean_token_accuracy": 0.09469160959124565,
"num_tokens": 1330324.0,
"step": 720
},
{
"entropy": 6.990442323684692,
"epoch": 0.06091157319890779,
"grad_norm": 1.140625,
"learning_rate": 0.000362,
"loss": 6.7224,
"mean_token_accuracy": 0.09678644239902497,
"num_tokens": 1339485.0,
"step": 725
},
{
"entropy": 6.953311347961426,
"epoch": 0.06133165301407267,
"grad_norm": 1.1796875,
"learning_rate": 0.0003645,
"loss": 6.8803,
"mean_token_accuracy": 0.08837029710412025,
"num_tokens": 1348640.0,
"step": 730
},
{
"entropy": 6.882500171661377,
"epoch": 0.06175173282923756,
"grad_norm": 1.1796875,
"learning_rate": 0.000367,
"loss": 6.7691,
"mean_token_accuracy": 0.09767747819423675,
"num_tokens": 1357581.0,
"step": 735
},
{
"entropy": 6.97215313911438,
"epoch": 0.062171812644402434,
"grad_norm": 1.1171875,
"learning_rate": 0.0003695,
"loss": 6.8411,
"mean_token_accuracy": 0.0938787505030632,
"num_tokens": 1367883.0,
"step": 740
},
{
"entropy": 6.919119882583618,
"epoch": 0.06259189245956731,
"grad_norm": 1.0546875,
"learning_rate": 0.000372,
"loss": 6.7914,
"mean_token_accuracy": 0.09219447746872902,
"num_tokens": 1376936.0,
"step": 745
},
{
"entropy": 6.825827884674072,
"epoch": 0.0630119722747322,
"grad_norm": 1.0859375,
"learning_rate": 0.0003745,
"loss": 6.7125,
"mean_token_accuracy": 0.09528392925858498,
"num_tokens": 1386359.0,
"step": 750
},
{
"entropy": 6.892624235153198,
"epoch": 0.06343205208989708,
"grad_norm": 1.0078125,
"learning_rate": 0.000377,
"loss": 6.7627,
"mean_token_accuracy": 0.09940937235951423,
"num_tokens": 1395223.0,
"step": 755
},
{
"entropy": 7.047525787353516,
"epoch": 0.06385213190506196,
"grad_norm": 1.09375,
"learning_rate": 0.0003795,
"loss": 6.9106,
"mean_token_accuracy": 0.09024005718529224,
"num_tokens": 1404917.0,
"step": 760
},
{
"entropy": 6.961672592163086,
"epoch": 0.06427221172022685,
"grad_norm": 1.1328125,
"learning_rate": 0.000382,
"loss": 6.8159,
"mean_token_accuracy": 0.10144984871149063,
"num_tokens": 1413348.0,
"step": 765
},
{
"entropy": 6.793653059005737,
"epoch": 0.06469229153539173,
"grad_norm": 1.1484375,
"learning_rate": 0.0003845,
"loss": 6.7916,
"mean_token_accuracy": 0.09195128381252289,
"num_tokens": 1421726.0,
"step": 770
},
{
"entropy": 6.895196437835693,
"epoch": 0.0651123713505566,
"grad_norm": 1.0546875,
"learning_rate": 0.00038700000000000003,
"loss": 6.7955,
"mean_token_accuracy": 0.09626475274562836,
"num_tokens": 1430686.0,
"step": 775
},
{
"entropy": 6.93384485244751,
"epoch": 0.06553245116572148,
"grad_norm": 1.0703125,
"learning_rate": 0.00038950000000000003,
"loss": 6.7897,
"mean_token_accuracy": 0.09465737789869308,
"num_tokens": 1439499.0,
"step": 780
},
{
"entropy": 6.955707168579101,
"epoch": 0.06595253098088637,
"grad_norm": 1.3203125,
"learning_rate": 0.00039200000000000004,
"loss": 6.7769,
"mean_token_accuracy": 0.09800057783722878,
"num_tokens": 1448220.0,
"step": 785
},
{
"entropy": 6.76906795501709,
"epoch": 0.06637261079605125,
"grad_norm": 1.046875,
"learning_rate": 0.00039450000000000005,
"loss": 6.7919,
"mean_token_accuracy": 0.08977739810943604,
"num_tokens": 1458217.0,
"step": 790
},
{
"entropy": 6.814671993255615,
"epoch": 0.06679269061121614,
"grad_norm": 1.03125,
"learning_rate": 0.00039700000000000005,
"loss": 6.7075,
"mean_token_accuracy": 0.09342081621289253,
"num_tokens": 1467422.0,
"step": 795
},
{
"entropy": 6.887504005432129,
"epoch": 0.06721277042638102,
"grad_norm": 1.125,
"learning_rate": 0.0003995,
"loss": 6.6819,
"mean_token_accuracy": 0.10001382231712341,
"num_tokens": 1476152.0,
"step": 800
},
{
"entropy": 6.807573080062866,
"epoch": 0.06763285024154589,
"grad_norm": 1.1171875,
"learning_rate": 0.000402,
"loss": 6.7751,
"mean_token_accuracy": 0.09214248061180115,
"num_tokens": 1485248.0,
"step": 805
},
{
"entropy": 6.854774427413941,
"epoch": 0.06805293005671077,
"grad_norm": 1.1484375,
"learning_rate": 0.0004045,
"loss": 6.7307,
"mean_token_accuracy": 0.09543775320053101,
"num_tokens": 1494248.0,
"step": 810
},
{
"entropy": 6.848575687408447,
"epoch": 0.06847300987187566,
"grad_norm": 1.234375,
"learning_rate": 0.00040699999999999997,
"loss": 6.8448,
"mean_token_accuracy": 0.0940382607281208,
"num_tokens": 1503565.0,
"step": 815
},
{
"entropy": 6.988439130783081,
"epoch": 0.06889308968704054,
"grad_norm": 1.0546875,
"learning_rate": 0.0004095,
"loss": 6.9384,
"mean_token_accuracy": 0.08889181464910507,
"num_tokens": 1513227.0,
"step": 820
},
{
"entropy": 6.93678297996521,
"epoch": 0.06931316950220542,
"grad_norm": 1.1796875,
"learning_rate": 0.000412,
"loss": 6.7217,
"mean_token_accuracy": 0.10070210471749305,
"num_tokens": 1522312.0,
"step": 825
},
{
"entropy": 6.770338535308838,
"epoch": 0.0697332493173703,
"grad_norm": 1.0390625,
"learning_rate": 0.0004145,
"loss": 6.6784,
"mean_token_accuracy": 0.09791189730167389,
"num_tokens": 1531720.0,
"step": 830
},
{
"entropy": 6.800765943527222,
"epoch": 0.07015332913253518,
"grad_norm": 1.1171875,
"learning_rate": 0.000417,
"loss": 6.7521,
"mean_token_accuracy": 0.09716509580612183,
"num_tokens": 1541238.0,
"step": 835
},
{
"entropy": 6.8829351425170895,
"epoch": 0.07057340894770006,
"grad_norm": 1.1328125,
"learning_rate": 0.0004195,
"loss": 6.8628,
"mean_token_accuracy": 0.09571778625249863,
"num_tokens": 1550875.0,
"step": 840
},
{
"entropy": 6.7474853515625,
"epoch": 0.07099348876286495,
"grad_norm": 1.0390625,
"learning_rate": 0.000422,
"loss": 6.7945,
"mean_token_accuracy": 0.09439405128359794,
"num_tokens": 1560287.0,
"step": 845
},
{
"entropy": 6.8450279712677,
"epoch": 0.07141356857802983,
"grad_norm": 1.0703125,
"learning_rate": 0.0004245,
"loss": 6.6719,
"mean_token_accuracy": 0.10050797313451768,
"num_tokens": 1569043.0,
"step": 850
},
{
"entropy": 6.72012848854065,
"epoch": 0.07183364839319471,
"grad_norm": 1.0703125,
"learning_rate": 0.000427,
"loss": 6.6946,
"mean_token_accuracy": 0.10327838435769081,
"num_tokens": 1578112.0,
"step": 855
},
{
"entropy": 6.666503381729126,
"epoch": 0.07225372820835958,
"grad_norm": 1.1484375,
"learning_rate": 0.0004295,
"loss": 6.6083,
"mean_token_accuracy": 0.10177602767944335,
"num_tokens": 1586587.0,
"step": 860
},
{
"entropy": 6.876049327850342,
"epoch": 0.07267380802352447,
"grad_norm": 1.0390625,
"learning_rate": 0.000432,
"loss": 6.7715,
"mean_token_accuracy": 0.09597784802317619,
"num_tokens": 1595585.0,
"step": 865
},
{
"entropy": 6.793572664260864,
"epoch": 0.07309388783868935,
"grad_norm": 1.078125,
"learning_rate": 0.0004345,
"loss": 6.7402,
"mean_token_accuracy": 0.09475546851754188,
"num_tokens": 1605355.0,
"step": 870
},
{
"entropy": 6.829131984710694,
"epoch": 0.07351396765385423,
"grad_norm": 1.1796875,
"learning_rate": 0.000437,
"loss": 6.7645,
"mean_token_accuracy": 0.09627607688307763,
"num_tokens": 1613637.0,
"step": 875
},
{
"entropy": 6.7632164478302,
"epoch": 0.07393404746901912,
"grad_norm": 1.1171875,
"learning_rate": 0.0004395,
"loss": 6.7187,
"mean_token_accuracy": 0.09899500831961631,
"num_tokens": 1622731.0,
"step": 880
},
{
"entropy": 6.812683629989624,
"epoch": 0.074354127284184,
"grad_norm": 1.0234375,
"learning_rate": 0.000442,
"loss": 6.678,
"mean_token_accuracy": 0.09412262439727784,
"num_tokens": 1632098.0,
"step": 885
},
{
"entropy": 6.743659448623657,
"epoch": 0.07477420709934887,
"grad_norm": 1.0234375,
"learning_rate": 0.0004445,
"loss": 6.6765,
"mean_token_accuracy": 0.09482985511422157,
"num_tokens": 1641259.0,
"step": 890
},
{
"entropy": 6.833035087585449,
"epoch": 0.07519428691451376,
"grad_norm": 1.1796875,
"learning_rate": 0.000447,
"loss": 6.7498,
"mean_token_accuracy": 0.09258906096220017,
"num_tokens": 1651362.0,
"step": 895
},
{
"entropy": 6.710019874572754,
"epoch": 0.07561436672967864,
"grad_norm": 1.109375,
"learning_rate": 0.00044950000000000003,
"loss": 6.6731,
"mean_token_accuracy": 0.09449022710323333,
"num_tokens": 1660190.0,
"step": 900
},
{
"entropy": 6.716372060775757,
"epoch": 0.07603444654484352,
"grad_norm": 1.1796875,
"learning_rate": 0.00045200000000000004,
"loss": 6.6958,
"mean_token_accuracy": 0.09791603237390518,
"num_tokens": 1669020.0,
"step": 905
},
{
"entropy": 6.81228666305542,
"epoch": 0.0764545263600084,
"grad_norm": 1.09375,
"learning_rate": 0.00045450000000000004,
"loss": 6.7321,
"mean_token_accuracy": 0.09860685616731643,
"num_tokens": 1678158.0,
"step": 910
},
{
"entropy": 6.792080020904541,
"epoch": 0.07687460617517328,
"grad_norm": 1.109375,
"learning_rate": 0.00045700000000000005,
"loss": 6.7306,
"mean_token_accuracy": 0.09886500239372253,
"num_tokens": 1687481.0,
"step": 915
},
{
"entropy": 6.71827883720398,
"epoch": 0.07729468599033816,
"grad_norm": 1.0234375,
"learning_rate": 0.00045950000000000006,
"loss": 6.6994,
"mean_token_accuracy": 0.103325155377388,
"num_tokens": 1696782.0,
"step": 920
},
{
"entropy": 6.721747827529907,
"epoch": 0.07771476580550304,
"grad_norm": 1.0546875,
"learning_rate": 0.000462,
"loss": 6.7107,
"mean_token_accuracy": 0.10372448563575745,
"num_tokens": 1706153.0,
"step": 925
},
{
"entropy": 6.703522777557373,
"epoch": 0.07813484562066793,
"grad_norm": 1.1640625,
"learning_rate": 0.0004645,
"loss": 6.7323,
"mean_token_accuracy": 0.10109473243355752,
"num_tokens": 1715585.0,
"step": 930
},
{
"entropy": 6.9429340839385985,
"epoch": 0.07855492543583281,
"grad_norm": 1.4296875,
"learning_rate": 0.000467,
"loss": 6.8552,
"mean_token_accuracy": 0.09585651680827141,
"num_tokens": 1724857.0,
"step": 935
},
{
"entropy": 6.723682641983032,
"epoch": 0.0789750052509977,
"grad_norm": 1.2265625,
"learning_rate": 0.0004695,
"loss": 6.6587,
"mean_token_accuracy": 0.10578344613313675,
"num_tokens": 1733528.0,
"step": 940
},
{
"entropy": 6.796629476547241,
"epoch": 0.07939508506616257,
"grad_norm": 0.9765625,
"learning_rate": 0.000472,
"loss": 6.7839,
"mean_token_accuracy": 0.09946857616305352,
"num_tokens": 1742953.0,
"step": 945
},
{
"entropy": 6.778720664978027,
"epoch": 0.07981516488132745,
"grad_norm": 1.265625,
"learning_rate": 0.0004745,
"loss": 6.7418,
"mean_token_accuracy": 0.10183344334363938,
"num_tokens": 1752155.0,
"step": 950
},
{
"entropy": 6.6747581481933596,
"epoch": 0.08023524469649233,
"grad_norm": 1.1328125,
"learning_rate": 0.000477,
"loss": 6.6189,
"mean_token_accuracy": 0.10308177843689918,
"num_tokens": 1760562.0,
"step": 955
},
{
"entropy": 6.6881184577941895,
"epoch": 0.08065532451165722,
"grad_norm": 1.1875,
"learning_rate": 0.0004795,
"loss": 6.6407,
"mean_token_accuracy": 0.09682166650891304,
"num_tokens": 1769631.0,
"step": 960
},
{
"entropy": 6.686205625534058,
"epoch": 0.0810754043268221,
"grad_norm": 1.1875,
"learning_rate": 0.000482,
"loss": 6.6737,
"mean_token_accuracy": 0.09623132348060608,
"num_tokens": 1779080.0,
"step": 965
},
{
"entropy": 6.71329026222229,
"epoch": 0.08149548414198697,
"grad_norm": 1.3359375,
"learning_rate": 0.0004845,
"loss": 6.6501,
"mean_token_accuracy": 0.09797736331820488,
"num_tokens": 1787830.0,
"step": 970
},
{
"entropy": 6.607724714279175,
"epoch": 0.08191556395715185,
"grad_norm": 1.0390625,
"learning_rate": 0.000487,
"loss": 6.5762,
"mean_token_accuracy": 0.10056376829743385,
"num_tokens": 1796998.0,
"step": 975
},
{
"entropy": 6.796718168258667,
"epoch": 0.08233564377231674,
"grad_norm": 1.1171875,
"learning_rate": 0.0004895,
"loss": 6.6548,
"mean_token_accuracy": 0.10055585950613022,
"num_tokens": 1806194.0,
"step": 980
},
{
"entropy": 6.432325410842895,
"epoch": 0.08275572358748162,
"grad_norm": 1.0,
"learning_rate": 0.000492,
"loss": 6.5356,
"mean_token_accuracy": 0.10625480636954307,
"num_tokens": 1815751.0,
"step": 985
},
{
"entropy": 6.659997034072876,
"epoch": 0.0831758034026465,
"grad_norm": 1.046875,
"learning_rate": 0.0004945,
"loss": 6.6207,
"mean_token_accuracy": 0.10119200572371483,
"num_tokens": 1825379.0,
"step": 990
},
{
"entropy": 6.685537910461425,
"epoch": 0.08359588321781139,
"grad_norm": 1.0859375,
"learning_rate": 0.000497,
"loss": 6.5776,
"mean_token_accuracy": 0.10274154916405678,
"num_tokens": 1834158.0,
"step": 995
},
{
"entropy": 6.586271667480469,
"epoch": 0.08401596303297626,
"grad_norm": 1.1796875,
"learning_rate": 0.0004995,
"loss": 6.5331,
"mean_token_accuracy": 0.1030009813606739,
"num_tokens": 1842724.0,
"step": 1000
},
{
"entropy": 6.606362009048462,
"epoch": 0.08443604284814114,
"grad_norm": 0.98828125,
"learning_rate": 0.000499999998724557,
"loss": 6.5511,
"mean_token_accuracy": 0.10277646854519844,
"num_tokens": 1852485.0,
"step": 1005
},
{
"entropy": 6.6398862361907955,
"epoch": 0.08485612266330603,
"grad_norm": 1.109375,
"learning_rate": 0.0004999999935430703,
"loss": 6.5824,
"mean_token_accuracy": 0.10646345019340515,
"num_tokens": 1861303.0,
"step": 1010
},
{
"entropy": 6.527065420150757,
"epoch": 0.08527620247847091,
"grad_norm": 1.078125,
"learning_rate": 0.0004999999843758243,
"loss": 6.5766,
"mean_token_accuracy": 0.1160741962492466,
"num_tokens": 1870859.0,
"step": 1015
},
{
"entropy": 6.720239210128784,
"epoch": 0.0856962822936358,
"grad_norm": 1.03125,
"learning_rate": 0.0004999999712228196,
"loss": 6.7375,
"mean_token_accuracy": 0.09559379816055298,
"num_tokens": 1880295.0,
"step": 1020
},
{
"entropy": 6.755932474136353,
"epoch": 0.08611636210880068,
"grad_norm": 1.03125,
"learning_rate": 0.0004999999540840562,
"loss": 6.6322,
"mean_token_accuracy": 0.10711020082235337,
"num_tokens": 1889193.0,
"step": 1025
},
{
"entropy": 6.635529565811157,
"epoch": 0.08653644192396555,
"grad_norm": 0.98046875,
"learning_rate": 0.0004999999329595345,
"loss": 6.7369,
"mean_token_accuracy": 0.09584898650646209,
"num_tokens": 1899437.0,
"step": 1030
},
{
"entropy": 6.71239709854126,
"epoch": 0.08695652173913043,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999999078492548,
"loss": 6.6284,
"mean_token_accuracy": 0.10203150510787964,
"num_tokens": 1907882.0,
"step": 1035
},
{
"entropy": 6.588162136077881,
"epoch": 0.08737660155429532,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999998787532176,
"loss": 6.5411,
"mean_token_accuracy": 0.10083841383457184,
"num_tokens": 1916872.0,
"step": 1040
},
{
"entropy": 6.676869010925293,
"epoch": 0.0877966813694602,
"grad_norm": 1.0859375,
"learning_rate": 0.0004999998456714234,
"loss": 6.7265,
"mean_token_accuracy": 0.09887873977422715,
"num_tokens": 1926636.0,
"step": 1045
},
{
"entropy": 6.626446390151978,
"epoch": 0.08821676118462508,
"grad_norm": 1.125,
"learning_rate": 0.0004999998086038729,
"loss": 6.6125,
"mean_token_accuracy": 0.10665144100785255,
"num_tokens": 1935962.0,
"step": 1050
},
{
"entropy": 6.63335337638855,
"epoch": 0.08863684099978995,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999997675505665,
"loss": 6.5714,
"mean_token_accuracy": 0.10421753227710724,
"num_tokens": 1944600.0,
"step": 1055
},
{
"entropy": 6.66912956237793,
"epoch": 0.08905692081495484,
"grad_norm": 1.1328125,
"learning_rate": 0.0004999997225115052,
"loss": 6.7618,
"mean_token_accuracy": 0.10299883931875228,
"num_tokens": 1954234.0,
"step": 1060
},
{
"entropy": 6.80169267654419,
"epoch": 0.08947700063011972,
"grad_norm": 1.109375,
"learning_rate": 0.0004999996734866896,
"loss": 6.7125,
"mean_token_accuracy": 0.10105927959084511,
"num_tokens": 1964499.0,
"step": 1065
},
{
"entropy": 6.4505407333374025,
"epoch": 0.0898970804452846,
"grad_norm": 1.0859375,
"learning_rate": 0.0004999996204761206,
"loss": 6.4236,
"mean_token_accuracy": 0.1128264844417572,
"num_tokens": 1973635.0,
"step": 1070
},
{
"entropy": 6.56596827507019,
"epoch": 0.09031716026044949,
"grad_norm": 0.96484375,
"learning_rate": 0.0004999995634797993,
"loss": 6.5739,
"mean_token_accuracy": 0.10490957424044609,
"num_tokens": 1983509.0,
"step": 1075
},
{
"entropy": 6.62415657043457,
"epoch": 0.09073724007561437,
"grad_norm": 1.09375,
"learning_rate": 0.0004999995024977265,
"loss": 6.5551,
"mean_token_accuracy": 0.11068339720368385,
"num_tokens": 1992336.0,
"step": 1080
},
{
"entropy": 6.581994724273682,
"epoch": 0.09115731989077924,
"grad_norm": 0.98828125,
"learning_rate": 0.0004999994375299034,
"loss": 6.5937,
"mean_token_accuracy": 0.10452346429228783,
"num_tokens": 2001931.0,
"step": 1085
},
{
"entropy": 6.580089092254639,
"epoch": 0.09157739970594413,
"grad_norm": 0.96484375,
"learning_rate": 0.000499999368576331,
"loss": 6.4447,
"mean_token_accuracy": 0.11201497912406921,
"num_tokens": 2010935.0,
"step": 1090
},
{
"entropy": 6.511315250396729,
"epoch": 0.09199747952110901,
"grad_norm": 1.03125,
"learning_rate": 0.0004999992956370109,
"loss": 6.4995,
"mean_token_accuracy": 0.10933665409684182,
"num_tokens": 2020587.0,
"step": 1095
},
{
"entropy": 6.465148067474365,
"epoch": 0.0924175593362739,
"grad_norm": 1.0234375,
"learning_rate": 0.000499999218711944,
"loss": 6.5391,
"mean_token_accuracy": 0.10621756613254547,
"num_tokens": 2029743.0,
"step": 1100
},
{
"entropy": 6.62024712562561,
"epoch": 0.09283763915143878,
"grad_norm": 1.09375,
"learning_rate": 0.0004999991378011317,
"loss": 6.5513,
"mean_token_accuracy": 0.11122238337993622,
"num_tokens": 2038468.0,
"step": 1105
},
{
"entropy": 6.526382637023926,
"epoch": 0.09325771896660366,
"grad_norm": 1.0078125,
"learning_rate": 0.0004999990529045757,
"loss": 6.4773,
"mean_token_accuracy": 0.10901794061064721,
"num_tokens": 2047456.0,
"step": 1110
},
{
"entropy": 6.6758506298065186,
"epoch": 0.09367779878176853,
"grad_norm": 1.0078125,
"learning_rate": 0.0004999989640222771,
"loss": 6.7907,
"mean_token_accuracy": 0.09581167250871658,
"num_tokens": 2056691.0,
"step": 1115
},
{
"entropy": 6.724249839782715,
"epoch": 0.09409787859693342,
"grad_norm": 1.0,
"learning_rate": 0.000499998871154238,
"loss": 6.5726,
"mean_token_accuracy": 0.1056052066385746,
"num_tokens": 2066068.0,
"step": 1120
},
{
"entropy": 6.619223117828369,
"epoch": 0.0945179584120983,
"grad_norm": 0.98828125,
"learning_rate": 0.0004999987743004597,
"loss": 6.5105,
"mean_token_accuracy": 0.10904356241226196,
"num_tokens": 2075113.0,
"step": 1125
},
{
"entropy": 6.579265213012695,
"epoch": 0.09493803822726318,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999986734609438,
"loss": 6.648,
"mean_token_accuracy": 0.10513966977596283,
"num_tokens": 2084557.0,
"step": 1130
},
{
"entropy": 6.630017042160034,
"epoch": 0.09535811804242807,
"grad_norm": 1.09375,
"learning_rate": 0.0004999985686356923,
"loss": 6.5466,
"mean_token_accuracy": 0.10622756630182266,
"num_tokens": 2093424.0,
"step": 1135
},
{
"entropy": 6.602306842803955,
"epoch": 0.09577819785759294,
"grad_norm": 1.0234375,
"learning_rate": 0.000499998459824707,
"loss": 6.6635,
"mean_token_accuracy": 0.10230447798967361,
"num_tokens": 2103066.0,
"step": 1140
},
{
"entropy": 6.612010765075683,
"epoch": 0.09619827767275782,
"grad_norm": 1.078125,
"learning_rate": 0.00049999834702799,
"loss": 6.5355,
"mean_token_accuracy": 0.1076541669666767,
"num_tokens": 2112447.0,
"step": 1145
},
{
"entropy": 6.522880172729492,
"epoch": 0.0966183574879227,
"grad_norm": 1.0234375,
"learning_rate": 0.0004999982302455431,
"loss": 6.5497,
"mean_token_accuracy": 0.10876928493380547,
"num_tokens": 2121949.0,
"step": 1150
},
{
"entropy": 6.574669218063354,
"epoch": 0.09703843730308759,
"grad_norm": 1.078125,
"learning_rate": 0.0004999981094773683,
"loss": 6.4538,
"mean_token_accuracy": 0.10955686494708061,
"num_tokens": 2130464.0,
"step": 1155
},
{
"entropy": 6.626054668426514,
"epoch": 0.09745851711825247,
"grad_norm": 1.109375,
"learning_rate": 0.000499997984723468,
"loss": 6.6151,
"mean_token_accuracy": 0.10182780474424362,
"num_tokens": 2139577.0,
"step": 1160
},
{
"entropy": 6.2696503639221195,
"epoch": 0.09787859693341736,
"grad_norm": 0.9375,
"learning_rate": 0.0004999978559838441,
"loss": 6.3583,
"mean_token_accuracy": 0.10666822865605355,
"num_tokens": 2147919.0,
"step": 1165
},
{
"entropy": 6.4677763938903805,
"epoch": 0.09829867674858223,
"grad_norm": 1.0078125,
"learning_rate": 0.0004999977232584991,
"loss": 6.5126,
"mean_token_accuracy": 0.10790005698800087,
"num_tokens": 2156936.0,
"step": 1170
},
{
"entropy": 6.619596195220947,
"epoch": 0.09871875656374711,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999975865474354,
"loss": 6.565,
"mean_token_accuracy": 0.1067568302154541,
"num_tokens": 2165362.0,
"step": 1175
},
{
"entropy": 6.488823509216308,
"epoch": 0.099138836378912,
"grad_norm": 1.1328125,
"learning_rate": 0.0004999974458506551,
"loss": 6.4994,
"mean_token_accuracy": 0.105003522336483,
"num_tokens": 2173665.0,
"step": 1180
},
{
"entropy": 6.633204603195191,
"epoch": 0.09955891619407688,
"grad_norm": 1.140625,
"learning_rate": 0.000499997301168161,
"loss": 6.4861,
"mean_token_accuracy": 0.10850023925304413,
"num_tokens": 2182222.0,
"step": 1185
},
{
"entropy": 6.550716543197632,
"epoch": 0.09997899600924176,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999971524999556,
"loss": 6.5724,
"mean_token_accuracy": 0.11284129321575165,
"num_tokens": 2192358.0,
"step": 1190
},
{
"entropy": 6.5804180145263675,
"epoch": 0.10039907582440663,
"grad_norm": 1.0234375,
"learning_rate": 0.0004999969998460414,
"loss": 6.5471,
"mean_token_accuracy": 0.10514037609100342,
"num_tokens": 2201889.0,
"step": 1195
},
{
"entropy": 6.548522186279297,
"epoch": 0.10081915563957151,
"grad_norm": 1.25,
"learning_rate": 0.0004999968432064213,
"loss": 6.5442,
"mean_token_accuracy": 0.11475524455308914,
"num_tokens": 2211810.0,
"step": 1200
},
{
"entropy": 6.474522876739502,
"epoch": 0.1012392354547364,
"grad_norm": 0.94140625,
"learning_rate": 0.0004999966825810979,
"loss": 6.483,
"mean_token_accuracy": 0.10969577804207802,
"num_tokens": 2221123.0,
"step": 1205
},
{
"entropy": 6.482648086547852,
"epoch": 0.10165931526990128,
"grad_norm": 1.0625,
"learning_rate": 0.0004999965179700742,
"loss": 6.4233,
"mean_token_accuracy": 0.11192466542124749,
"num_tokens": 2230129.0,
"step": 1210
},
{
"entropy": 6.427746820449829,
"epoch": 0.10207939508506617,
"grad_norm": 0.98046875,
"learning_rate": 0.000499996349373353,
"loss": 6.4731,
"mean_token_accuracy": 0.11126175448298455,
"num_tokens": 2239929.0,
"step": 1215
},
{
"entropy": 6.540882635116577,
"epoch": 0.10249947490023105,
"grad_norm": 1.0625,
"learning_rate": 0.0004999961767909374,
"loss": 6.4503,
"mean_token_accuracy": 0.11326849237084388,
"num_tokens": 2248078.0,
"step": 1220
},
{
"entropy": 6.496834564208984,
"epoch": 0.10291955471539592,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999960002228303,
"loss": 6.5467,
"mean_token_accuracy": 0.11118405237793923,
"num_tokens": 2256975.0,
"step": 1225
},
{
"entropy": 6.528544092178345,
"epoch": 0.1033396345305608,
"grad_norm": 1.109375,
"learning_rate": 0.0004999958196690349,
"loss": 6.4084,
"mean_token_accuracy": 0.11252974197268487,
"num_tokens": 2265797.0,
"step": 1230
},
{
"entropy": 6.500071668624878,
"epoch": 0.10375971434572569,
"grad_norm": 1.0078125,
"learning_rate": 0.0004999956351295545,
"loss": 6.4961,
"mean_token_accuracy": 0.11529959514737129,
"num_tokens": 2274099.0,
"step": 1235
},
{
"entropy": 6.427072525024414,
"epoch": 0.10417979416089057,
"grad_norm": 1.03125,
"learning_rate": 0.0004999954466043922,
"loss": 6.4331,
"mean_token_accuracy": 0.11734503880143166,
"num_tokens": 2282360.0,
"step": 1240
},
{
"entropy": 6.490129566192627,
"epoch": 0.10459987397605545,
"grad_norm": 0.9375,
"learning_rate": 0.0004999952540935514,
"loss": 6.5292,
"mean_token_accuracy": 0.10359383374452591,
"num_tokens": 2292714.0,
"step": 1245
},
{
"entropy": 6.519558954238891,
"epoch": 0.10501995379122034,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999950575970356,
"loss": 6.4583,
"mean_token_accuracy": 0.11484305784106255,
"num_tokens": 2301633.0,
"step": 1250
},
{
"entropy": 6.521380376815796,
"epoch": 0.10544003360638521,
"grad_norm": 1.03125,
"learning_rate": 0.0004999948571148482,
"loss": 6.4373,
"mean_token_accuracy": 0.1137208767235279,
"num_tokens": 2310067.0,
"step": 1255
},
{
"entropy": 6.447480583190918,
"epoch": 0.10586011342155009,
"grad_norm": 1.046875,
"learning_rate": 0.0004999946526469927,
"loss": 6.5213,
"mean_token_accuracy": 0.11123185902833939,
"num_tokens": 2320090.0,
"step": 1260
},
{
"entropy": 6.488873481750488,
"epoch": 0.10628019323671498,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999944441934728,
"loss": 6.474,
"mean_token_accuracy": 0.11672058925032616,
"num_tokens": 2329255.0,
"step": 1265
},
{
"entropy": 6.575503969192505,
"epoch": 0.10670027305187986,
"grad_norm": 1.109375,
"learning_rate": 0.0004999942317542922,
"loss": 6.5597,
"mean_token_accuracy": 0.11168134436011315,
"num_tokens": 2339535.0,
"step": 1270
},
{
"entropy": 6.4351553440094,
"epoch": 0.10712035286704474,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999940153294546,
"loss": 6.4631,
"mean_token_accuracy": 0.11417224109172822,
"num_tokens": 2348948.0,
"step": 1275
},
{
"entropy": 6.516087198257447,
"epoch": 0.10754043268220961,
"grad_norm": 1.0078125,
"learning_rate": 0.000499993794918964,
"loss": 6.4852,
"mean_token_accuracy": 0.10800226852297783,
"num_tokens": 2359141.0,
"step": 1280
},
{
"entropy": 6.426724910736084,
"epoch": 0.1079605124973745,
"grad_norm": 1.15625,
"learning_rate": 0.0004999935705228241,
"loss": 6.5269,
"mean_token_accuracy": 0.10617210119962692,
"num_tokens": 2368906.0,
"step": 1285
},
{
"entropy": 6.60525369644165,
"epoch": 0.10838059231253938,
"grad_norm": 1.125,
"learning_rate": 0.0004999933421410389,
"loss": 6.5118,
"mean_token_accuracy": 0.11492194160819054,
"num_tokens": 2377029.0,
"step": 1290
},
{
"entropy": 6.507741403579712,
"epoch": 0.10880067212770426,
"grad_norm": 0.91015625,
"learning_rate": 0.0004999931097736125,
"loss": 6.5731,
"mean_token_accuracy": 0.10368336364626884,
"num_tokens": 2387088.0,
"step": 1295
},
{
"entropy": 6.560735607147217,
"epoch": 0.10922075194286915,
"grad_norm": 1.09375,
"learning_rate": 0.0004999928734205492,
"loss": 6.4678,
"mean_token_accuracy": 0.11101854220032692,
"num_tokens": 2395596.0,
"step": 1300
},
{
"entropy": 6.4469006061553955,
"epoch": 0.10964083175803403,
"grad_norm": 1.0625,
"learning_rate": 0.0004999926330818528,
"loss": 6.4508,
"mean_token_accuracy": 0.11560385897755623,
"num_tokens": 2404506.0,
"step": 1305
},
{
"entropy": 6.50532808303833,
"epoch": 0.1100609115731989,
"grad_norm": 1.125,
"learning_rate": 0.0004999923887575278,
"loss": 6.4871,
"mean_token_accuracy": 0.1127387061715126,
"num_tokens": 2414342.0,
"step": 1310
},
{
"entropy": 6.511183404922486,
"epoch": 0.11048099138836379,
"grad_norm": 1.0625,
"learning_rate": 0.0004999921404475785,
"loss": 6.464,
"mean_token_accuracy": 0.11368927583098412,
"num_tokens": 2423076.0,
"step": 1315
},
{
"entropy": 6.4253387451171875,
"epoch": 0.11090107120352867,
"grad_norm": 0.8984375,
"learning_rate": 0.0004999918881520093,
"loss": 6.415,
"mean_token_accuracy": 0.11362927556037902,
"num_tokens": 2432492.0,
"step": 1320
},
{
"entropy": 6.421670770645141,
"epoch": 0.11132115101869355,
"grad_norm": 1.015625,
"learning_rate": 0.0004999916318708246,
"loss": 6.3657,
"mean_token_accuracy": 0.11932958588004113,
"num_tokens": 2441916.0,
"step": 1325
},
{
"entropy": 6.4051666259765625,
"epoch": 0.11174123083385844,
"grad_norm": 1.1328125,
"learning_rate": 0.0004999913716040291,
"loss": 6.4305,
"mean_token_accuracy": 0.11500561088323594,
"num_tokens": 2450932.0,
"step": 1330
},
{
"entropy": 6.410413789749145,
"epoch": 0.11216131064902331,
"grad_norm": 1.1015625,
"learning_rate": 0.0004999911073516272,
"loss": 6.4353,
"mean_token_accuracy": 0.11533465534448624,
"num_tokens": 2460058.0,
"step": 1335
},
{
"entropy": 6.411908531188965,
"epoch": 0.11258139046418819,
"grad_norm": 1.015625,
"learning_rate": 0.0004999908391136237,
"loss": 6.3795,
"mean_token_accuracy": 0.11413594856858253,
"num_tokens": 2469607.0,
"step": 1340
},
{
"entropy": 6.480467748641968,
"epoch": 0.11300147027935308,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999905668900234,
"loss": 6.4206,
"mean_token_accuracy": 0.10967940241098403,
"num_tokens": 2478345.0,
"step": 1345
},
{
"entropy": 6.437506008148193,
"epoch": 0.11342155009451796,
"grad_norm": 1.1484375,
"learning_rate": 0.000499990290680831,
"loss": 6.3551,
"mean_token_accuracy": 0.11509306952357293,
"num_tokens": 2486662.0,
"step": 1350
},
{
"entropy": 6.440225267410279,
"epoch": 0.11384162990968284,
"grad_norm": 1.0625,
"learning_rate": 0.0004999900104860516,
"loss": 6.493,
"mean_token_accuracy": 0.10815305337309837,
"num_tokens": 2495392.0,
"step": 1355
},
{
"entropy": 6.476166391372681,
"epoch": 0.11426170972484773,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999897263056898,
"loss": 6.5134,
"mean_token_accuracy": 0.1084674522280693,
"num_tokens": 2505254.0,
"step": 1360
},
{
"entropy": 6.572962856292724,
"epoch": 0.1146817895400126,
"grad_norm": 1.0390625,
"learning_rate": 0.000499989438139751,
"loss": 6.3391,
"mean_token_accuracy": 0.11722229272127152,
"num_tokens": 2514096.0,
"step": 1365
},
{
"entropy": 6.320563554763794,
"epoch": 0.11510186935517748,
"grad_norm": 0.90625,
"learning_rate": 0.0004999891459882401,
"loss": 6.3415,
"mean_token_accuracy": 0.11798084080219269,
"num_tokens": 2523635.0,
"step": 1370
},
{
"entropy": 6.370605707168579,
"epoch": 0.11552194917034236,
"grad_norm": 1.0234375,
"learning_rate": 0.0004999888498511624,
"loss": 6.4231,
"mean_token_accuracy": 0.11351286545395851,
"num_tokens": 2532528.0,
"step": 1375
},
{
"entropy": 6.42153754234314,
"epoch": 0.11594202898550725,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999885497285229,
"loss": 6.3223,
"mean_token_accuracy": 0.11377703249454499,
"num_tokens": 2541893.0,
"step": 1380
},
{
"entropy": 6.374641036987304,
"epoch": 0.11636210880067213,
"grad_norm": 1.0,
"learning_rate": 0.0004999882456203273,
"loss": 6.3828,
"mean_token_accuracy": 0.11570416316390038,
"num_tokens": 2551551.0,
"step": 1385
},
{
"entropy": 6.409754896163941,
"epoch": 0.11678218861583702,
"grad_norm": 1.0859375,
"learning_rate": 0.0004999879375265806,
"loss": 6.3397,
"mean_token_accuracy": 0.11345641836524009,
"num_tokens": 2560183.0,
"step": 1390
},
{
"entropy": 6.346025371551514,
"epoch": 0.11720226843100189,
"grad_norm": 1.1171875,
"learning_rate": 0.0004999876254472886,
"loss": 6.244,
"mean_token_accuracy": 0.1259176701307297,
"num_tokens": 2568697.0,
"step": 1395
},
{
"entropy": 6.418486166000366,
"epoch": 0.11762234824616677,
"grad_norm": 0.91796875,
"learning_rate": 0.0004999873093824565,
"loss": 6.4413,
"mean_token_accuracy": 0.11301257386803627,
"num_tokens": 2578151.0,
"step": 1400
},
{
"entropy": 6.546730661392212,
"epoch": 0.11804242806133165,
"grad_norm": 1.0859375,
"learning_rate": 0.0004999869893320902,
"loss": 6.5711,
"mean_token_accuracy": 0.11398048102855682,
"num_tokens": 2585901.0,
"step": 1405
},
{
"entropy": 6.384084796905517,
"epoch": 0.11846250787649654,
"grad_norm": 1.046875,
"learning_rate": 0.0004999866652961952,
"loss": 6.3911,
"mean_token_accuracy": 0.1123290129005909,
"num_tokens": 2595655.0,
"step": 1410
},
{
"entropy": 6.452324104309082,
"epoch": 0.11888258769166142,
"grad_norm": 0.9453125,
"learning_rate": 0.0004999863372747773,
"loss": 6.3493,
"mean_token_accuracy": 0.10948696061968803,
"num_tokens": 2604949.0,
"step": 1415
},
{
"entropy": 6.454392957687378,
"epoch": 0.11930266750682629,
"grad_norm": 1.1796875,
"learning_rate": 0.0004999860052678423,
"loss": 6.4265,
"mean_token_accuracy": 0.11580813452601432,
"num_tokens": 2614260.0,
"step": 1420
},
{
"entropy": 6.358513355255127,
"epoch": 0.11972274732199117,
"grad_norm": 1.1796875,
"learning_rate": 0.0004999856692753959,
"loss": 6.4088,
"mean_token_accuracy": 0.11651854142546654,
"num_tokens": 2623740.0,
"step": 1425
},
{
"entropy": 6.431614780426026,
"epoch": 0.12014282713715606,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999853292974444,
"loss": 6.3193,
"mean_token_accuracy": 0.119064449518919,
"num_tokens": 2631998.0,
"step": 1430
},
{
"entropy": 6.387428283691406,
"epoch": 0.12056290695232094,
"grad_norm": 0.9375,
"learning_rate": 0.0004999849853339936,
"loss": 6.4515,
"mean_token_accuracy": 0.11693638861179352,
"num_tokens": 2641169.0,
"step": 1435
},
{
"entropy": 6.461726379394531,
"epoch": 0.12098298676748583,
"grad_norm": 0.94140625,
"learning_rate": 0.0004999846373850497,
"loss": 6.3006,
"mean_token_accuracy": 0.11889183148741722,
"num_tokens": 2650576.0,
"step": 1440
},
{
"entropy": 6.285704851150513,
"epoch": 0.12140306658265071,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999842854506186,
"loss": 6.3909,
"mean_token_accuracy": 0.11337714865803719,
"num_tokens": 2660817.0,
"step": 1445
},
{
"entropy": 6.454374599456787,
"epoch": 0.12182314639781558,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999839295307069,
"loss": 6.3413,
"mean_token_accuracy": 0.11637749969959259,
"num_tokens": 2669338.0,
"step": 1450
},
{
"entropy": 6.4314216613769535,
"epoch": 0.12224322621298046,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999835696253206,
"loss": 6.3931,
"mean_token_accuracy": 0.1177656464278698,
"num_tokens": 2679108.0,
"step": 1455
},
{
"entropy": 6.414116144180298,
"epoch": 0.12266330602814535,
"grad_norm": 0.95703125,
"learning_rate": 0.0004999832057344664,
"loss": 6.3513,
"mean_token_accuracy": 0.11523670107126235,
"num_tokens": 2688126.0,
"step": 1460
},
{
"entropy": 6.248635339736938,
"epoch": 0.12308338584331023,
"grad_norm": 1.09375,
"learning_rate": 0.0004999828378581504,
"loss": 6.3207,
"mean_token_accuracy": 0.1255062073469162,
"num_tokens": 2697245.0,
"step": 1465
},
{
"entropy": 6.469296169281006,
"epoch": 0.12350346565847511,
"grad_norm": 1.0078125,
"learning_rate": 0.0004999824659963793,
"loss": 6.3851,
"mean_token_accuracy": 0.12115937024354935,
"num_tokens": 2705934.0,
"step": 1470
},
{
"entropy": 6.348638343811035,
"epoch": 0.12392354547364,
"grad_norm": 1.1171875,
"learning_rate": 0.0004999820901491598,
"loss": 6.3102,
"mean_token_accuracy": 0.12247596234083176,
"num_tokens": 2714367.0,
"step": 1475
},
{
"entropy": 6.288274192810059,
"epoch": 0.12434362528880487,
"grad_norm": 1.046875,
"learning_rate": 0.0004999817103164983,
"loss": 6.347,
"mean_token_accuracy": 0.120758505910635,
"num_tokens": 2724366.0,
"step": 1480
},
{
"entropy": 6.409024095535278,
"epoch": 0.12476370510396975,
"grad_norm": 0.99609375,
"learning_rate": 0.0004999813264984017,
"loss": 6.3559,
"mean_token_accuracy": 0.11786462664604187,
"num_tokens": 2733980.0,
"step": 1485
},
{
"entropy": 6.405835437774658,
"epoch": 0.12518378491913462,
"grad_norm": 1.0,
"learning_rate": 0.0004999809386948767,
"loss": 6.3447,
"mean_token_accuracy": 0.12095973119139672,
"num_tokens": 2744013.0,
"step": 1490
},
{
"entropy": 6.306218957901001,
"epoch": 0.12560386473429952,
"grad_norm": 1.078125,
"learning_rate": 0.0004999805469059302,
"loss": 6.409,
"mean_token_accuracy": 0.11885412856936454,
"num_tokens": 2753385.0,
"step": 1495
},
{
"entropy": 6.402831554412842,
"epoch": 0.1260239445494644,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999801511315693,
"loss": 6.2797,
"mean_token_accuracy": 0.11564093008637429,
"num_tokens": 2762875.0,
"step": 1500
},
{
"entropy": 6.413339233398437,
"epoch": 0.1264440243646293,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999797513718007,
"loss": 6.3316,
"mean_token_accuracy": 0.12512060776352882,
"num_tokens": 2772182.0,
"step": 1505
},
{
"entropy": 6.232090759277344,
"epoch": 0.12686410417979416,
"grad_norm": 1.015625,
"learning_rate": 0.0004999793476266317,
"loss": 6.2777,
"mean_token_accuracy": 0.1216270886361599,
"num_tokens": 2780814.0,
"step": 1510
},
{
"entropy": 6.62731146812439,
"epoch": 0.12728418399495905,
"grad_norm": 1.0234375,
"learning_rate": 0.0004999789398960695,
"loss": 6.5737,
"mean_token_accuracy": 0.11791431680321693,
"num_tokens": 2791104.0,
"step": 1515
},
{
"entropy": 6.245316696166992,
"epoch": 0.12770426381012392,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999785281801212,
"loss": 6.2623,
"mean_token_accuracy": 0.12040979042649269,
"num_tokens": 2800081.0,
"step": 1520
},
{
"entropy": 6.359339189529419,
"epoch": 0.1281243436252888,
"grad_norm": 1.09375,
"learning_rate": 0.000499978112478794,
"loss": 6.3902,
"mean_token_accuracy": 0.11980480477213859,
"num_tokens": 2809096.0,
"step": 1525
},
{
"entropy": 6.453446865081787,
"epoch": 0.1285444234404537,
"grad_norm": 1.046875,
"learning_rate": 0.0004999776927920955,
"loss": 6.3617,
"mean_token_accuracy": 0.12184310704469681,
"num_tokens": 2818857.0,
"step": 1530
},
{
"entropy": 6.291106510162353,
"epoch": 0.12896450325561856,
"grad_norm": 1.109375,
"learning_rate": 0.000499977269120033,
"loss": 6.444,
"mean_token_accuracy": 0.11637230366468429,
"num_tokens": 2829332.0,
"step": 1535
},
{
"entropy": 6.427845621109009,
"epoch": 0.12938458307078346,
"grad_norm": 0.97265625,
"learning_rate": 0.000499976841462614,
"loss": 6.3583,
"mean_token_accuracy": 0.11303109228610993,
"num_tokens": 2839193.0,
"step": 1540
},
{
"entropy": 6.396592044830323,
"epoch": 0.12980466288594833,
"grad_norm": 0.92578125,
"learning_rate": 0.000499976409819846,
"loss": 6.34,
"mean_token_accuracy": 0.11379488185048103,
"num_tokens": 2848535.0,
"step": 1545
},
{
"entropy": 6.191351747512817,
"epoch": 0.1302247427011132,
"grad_norm": 0.98828125,
"learning_rate": 0.0004999759741917369,
"loss": 6.2442,
"mean_token_accuracy": 0.1232595019042492,
"num_tokens": 2858090.0,
"step": 1550
},
{
"entropy": 6.410407018661499,
"epoch": 0.1306448225162781,
"grad_norm": 1.1171875,
"learning_rate": 0.0004999755345782941,
"loss": 6.3942,
"mean_token_accuracy": 0.11326258555054665,
"num_tokens": 2866984.0,
"step": 1555
},
{
"entropy": 6.209921407699585,
"epoch": 0.13106490233144297,
"grad_norm": 0.90625,
"learning_rate": 0.0004999750909795256,
"loss": 6.202,
"mean_token_accuracy": 0.12322057262063027,
"num_tokens": 2876550.0,
"step": 1560
},
{
"entropy": 6.351957511901856,
"epoch": 0.13148498214660786,
"grad_norm": 0.9765625,
"learning_rate": 0.0004999746433954394,
"loss": 6.3062,
"mean_token_accuracy": 0.11787799671292305,
"num_tokens": 2885782.0,
"step": 1565
},
{
"entropy": 6.3278861999511715,
"epoch": 0.13190506196177273,
"grad_norm": 1.03125,
"learning_rate": 0.000499974191826043,
"loss": 6.2833,
"mean_token_accuracy": 0.13189474642276763,
"num_tokens": 2894807.0,
"step": 1570
},
{
"entropy": 6.376989316940308,
"epoch": 0.1323251417769376,
"grad_norm": 1.15625,
"learning_rate": 0.0004999737362713448,
"loss": 6.3235,
"mean_token_accuracy": 0.12015982195734978,
"num_tokens": 2904076.0,
"step": 1575
},
{
"entropy": 6.2569879531860355,
"epoch": 0.1327452215921025,
"grad_norm": 1.046875,
"learning_rate": 0.0004999732767313527,
"loss": 6.2239,
"mean_token_accuracy": 0.12142896950244904,
"num_tokens": 2913761.0,
"step": 1580
},
{
"entropy": 6.479147958755493,
"epoch": 0.13316530140726737,
"grad_norm": 1.0859375,
"learning_rate": 0.0004999728132060746,
"loss": 6.4597,
"mean_token_accuracy": 0.1231887847185135,
"num_tokens": 2922848.0,
"step": 1585
},
{
"entropy": 6.33915228843689,
"epoch": 0.13358538122243227,
"grad_norm": 0.91796875,
"learning_rate": 0.0004999723456955192,
"loss": 6.3453,
"mean_token_accuracy": 0.12047107368707657,
"num_tokens": 2932718.0,
"step": 1590
},
{
"entropy": 6.313900423049927,
"epoch": 0.13400546103759714,
"grad_norm": 0.96484375,
"learning_rate": 0.0004999718741996945,
"loss": 6.2846,
"mean_token_accuracy": 0.12055236473679543,
"num_tokens": 2942686.0,
"step": 1595
},
{
"entropy": 6.250068187713623,
"epoch": 0.13442554085276204,
"grad_norm": 1.0234375,
"learning_rate": 0.000499971398718609,
"loss": 6.2541,
"mean_token_accuracy": 0.12233499884605407,
"num_tokens": 2952096.0,
"step": 1600
},
{
"entropy": 6.400825262069702,
"epoch": 0.1348456206679269,
"grad_norm": 0.984375,
"learning_rate": 0.0004999709192522708,
"loss": 6.343,
"mean_token_accuracy": 0.12068295776844025,
"num_tokens": 2960660.0,
"step": 1605
},
{
"entropy": 6.373941278457641,
"epoch": 0.13526570048309178,
"grad_norm": 0.9453125,
"learning_rate": 0.0004999704358006887,
"loss": 6.338,
"mean_token_accuracy": 0.11847807541489601,
"num_tokens": 2969834.0,
"step": 1610
},
{
"entropy": 6.3392332077026365,
"epoch": 0.13568578029825668,
"grad_norm": 1.09375,
"learning_rate": 0.0004999699483638712,
"loss": 6.3224,
"mean_token_accuracy": 0.11975563690066338,
"num_tokens": 2979023.0,
"step": 1615
},
{
"entropy": 6.312718057632447,
"epoch": 0.13610586011342155,
"grad_norm": 1.0234375,
"learning_rate": 0.0004999694569418269,
"loss": 6.3217,
"mean_token_accuracy": 0.12105349376797676,
"num_tokens": 2988083.0,
"step": 1620
},
{
"entropy": 6.298077011108399,
"epoch": 0.13652593992858644,
"grad_norm": 1.015625,
"learning_rate": 0.0004999689615345645,
"loss": 6.2407,
"mean_token_accuracy": 0.12377956956624984,
"num_tokens": 2997240.0,
"step": 1625
},
{
"entropy": 6.40955810546875,
"epoch": 0.1369460197437513,
"grad_norm": 1.046875,
"learning_rate": 0.0004999684621420928,
"loss": 6.325,
"mean_token_accuracy": 0.11809631884098053,
"num_tokens": 3007077.0,
"step": 1630
},
{
"entropy": 6.277155590057373,
"epoch": 0.13736609955891618,
"grad_norm": 0.99609375,
"learning_rate": 0.0004999679587644205,
"loss": 6.3514,
"mean_token_accuracy": 0.117049939930439,
"num_tokens": 3015821.0,
"step": 1635
},
{
"entropy": 6.334466028213501,
"epoch": 0.13778617937408108,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999674514015568,
"loss": 6.27,
"mean_token_accuracy": 0.12460469976067542,
"num_tokens": 3025858.0,
"step": 1640
},
{
"entropy": 6.350087356567383,
"epoch": 0.13820625918924595,
"grad_norm": 0.9921875,
"learning_rate": 0.0004999669400535105,
"loss": 6.246,
"mean_token_accuracy": 0.11693726480007172,
"num_tokens": 3035537.0,
"step": 1645
},
{
"entropy": 6.1635466575622555,
"epoch": 0.13862633900441085,
"grad_norm": 1.140625,
"learning_rate": 0.0004999664247202907,
"loss": 6.1621,
"mean_token_accuracy": 0.12513800859451293,
"num_tokens": 3044204.0,
"step": 1650
},
{
"entropy": 6.405851364135742,
"epoch": 0.13904641881957572,
"grad_norm": 1.0859375,
"learning_rate": 0.0004999659054019066,
"loss": 6.3371,
"mean_token_accuracy": 0.12036163732409477,
"num_tokens": 3053111.0,
"step": 1655
},
{
"entropy": 6.243852519989014,
"epoch": 0.1394664986347406,
"grad_norm": 1.078125,
"learning_rate": 0.0004999653820983673,
"loss": 6.2536,
"mean_token_accuracy": 0.12018982619047165,
"num_tokens": 3062456.0,
"step": 1660
},
{
"entropy": 6.309187984466552,
"epoch": 0.13988657844990549,
"grad_norm": 1.03125,
"learning_rate": 0.000499964854809682,
"loss": 6.2742,
"mean_token_accuracy": 0.12464778944849968,
"num_tokens": 3071132.0,
"step": 1665
},
{
"entropy": 6.253125143051148,
"epoch": 0.14030665826507036,
"grad_norm": 1.0,
"learning_rate": 0.0004999643235358602,
"loss": 6.2296,
"mean_token_accuracy": 0.12636966705322267,
"num_tokens": 3080892.0,
"step": 1670
},
{
"entropy": 6.241448926925659,
"epoch": 0.14072673808023525,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999637882769112,
"loss": 6.166,
"mean_token_accuracy": 0.129159078001976,
"num_tokens": 3089874.0,
"step": 1675
},
{
"entropy": 6.321546411514282,
"epoch": 0.14114681789540012,
"grad_norm": 0.9375,
"learning_rate": 0.0004999632490328447,
"loss": 6.3087,
"mean_token_accuracy": 0.12358235269784927,
"num_tokens": 3099535.0,
"step": 1680
},
{
"entropy": 6.297367906570434,
"epoch": 0.14156689771056502,
"grad_norm": 0.9609375,
"learning_rate": 0.0004999627058036699,
"loss": 6.2544,
"mean_token_accuracy": 0.11949014514684678,
"num_tokens": 3108772.0,
"step": 1685
},
{
"entropy": 6.318113327026367,
"epoch": 0.1419869775257299,
"grad_norm": 1.03125,
"learning_rate": 0.0004999621585893966,
"loss": 6.2875,
"mean_token_accuracy": 0.11683647856116294,
"num_tokens": 3118333.0,
"step": 1690
},
{
"entropy": 6.319453144073487,
"epoch": 0.14240705734089476,
"grad_norm": 1.0625,
"learning_rate": 0.0004999616073900346,
"loss": 6.3252,
"mean_token_accuracy": 0.12005885392427444,
"num_tokens": 3127356.0,
"step": 1695
},
{
"entropy": 6.334293079376221,
"epoch": 0.14282713715605966,
"grad_norm": 1.09375,
"learning_rate": 0.0004999610522055935,
"loss": 6.2875,
"mean_token_accuracy": 0.11621066182851791,
"num_tokens": 3136859.0,
"step": 1700
},
{
"entropy": 6.296079492568969,
"epoch": 0.14324721697122453,
"grad_norm": 1.0234375,
"learning_rate": 0.0004999604930360832,
"loss": 6.3178,
"mean_token_accuracy": 0.11642780154943466,
"num_tokens": 3146607.0,
"step": 1705
},
{
"entropy": 6.24936580657959,
"epoch": 0.14366729678638943,
"grad_norm": 0.9453125,
"learning_rate": 0.0004999599298815136,
"loss": 6.2668,
"mean_token_accuracy": 0.12453223988413811,
"num_tokens": 3156327.0,
"step": 1710
},
{
"entropy": 6.249935483932495,
"epoch": 0.1440873766015543,
"grad_norm": 1.625,
"learning_rate": 0.0004999593627418947,
"loss": 6.203,
"mean_token_accuracy": 0.12521106824278833,
"num_tokens": 3165559.0,
"step": 1715
},
{
"entropy": 6.336143445968628,
"epoch": 0.14450745641671917,
"grad_norm": 1.0625,
"learning_rate": 0.0004999587916172365,
"loss": 6.3011,
"mean_token_accuracy": 0.1152021661400795,
"num_tokens": 3173850.0,
"step": 1720
},
{
"entropy": 6.285958003997803,
"epoch": 0.14492753623188406,
"grad_norm": 1.015625,
"learning_rate": 0.0004999582165075492,
"loss": 6.2465,
"mean_token_accuracy": 0.115229881554842,
"num_tokens": 3182838.0,
"step": 1725
},
{
"entropy": 6.184572982788086,
"epoch": 0.14534761604704893,
"grad_norm": 1.046875,
"learning_rate": 0.0004999576374128429,
"loss": 6.237,
"mean_token_accuracy": 0.12389757782220841,
"num_tokens": 3191692.0,
"step": 1730
},
{
"entropy": 6.397026300430298,
"epoch": 0.14576769586221383,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999570543331279,
"loss": 6.2806,
"mean_token_accuracy": 0.12041986957192422,
"num_tokens": 3200069.0,
"step": 1735
},
{
"entropy": 6.260718488693238,
"epoch": 0.1461877756773787,
"grad_norm": 1.140625,
"learning_rate": 0.0004999564672684145,
"loss": 6.3438,
"mean_token_accuracy": 0.1193387195467949,
"num_tokens": 3209653.0,
"step": 1740
},
{
"entropy": 6.393806028366089,
"epoch": 0.14660785549254357,
"grad_norm": 1.03125,
"learning_rate": 0.0004999558762187131,
"loss": 6.217,
"mean_token_accuracy": 0.12818640992045402,
"num_tokens": 3218313.0,
"step": 1745
},
{
"entropy": 6.188047790527344,
"epoch": 0.14702793530770847,
"grad_norm": 1.03125,
"learning_rate": 0.0004999552811840342,
"loss": 6.1623,
"mean_token_accuracy": 0.12572802156209945,
"num_tokens": 3227525.0,
"step": 1750
},
{
"entropy": 6.254945421218872,
"epoch": 0.14744801512287334,
"grad_norm": 0.9609375,
"learning_rate": 0.0004999546821643884,
"loss": 6.275,
"mean_token_accuracy": 0.1252661019563675,
"num_tokens": 3237022.0,
"step": 1755
},
{
"entropy": 6.239528560638428,
"epoch": 0.14786809493803824,
"grad_norm": 1.0,
"learning_rate": 0.0004999540791597861,
"loss": 6.1635,
"mean_token_accuracy": 0.1251967169344425,
"num_tokens": 3246605.0,
"step": 1760
},
{
"entropy": 6.140348815917969,
"epoch": 0.1482881747532031,
"grad_norm": 1.046875,
"learning_rate": 0.0004999534721702383,
"loss": 6.1328,
"mean_token_accuracy": 0.12913108766078948,
"num_tokens": 3255587.0,
"step": 1765
},
{
"entropy": 6.292690563201904,
"epoch": 0.148708254568368,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999528611957553,
"loss": 6.2117,
"mean_token_accuracy": 0.1271899625658989,
"num_tokens": 3265669.0,
"step": 1770
},
{
"entropy": 6.244245195388794,
"epoch": 0.14912833438353287,
"grad_norm": 1.09375,
"learning_rate": 0.0004999522462363485,
"loss": 6.2107,
"mean_token_accuracy": 0.12735746875405313,
"num_tokens": 3275013.0,
"step": 1775
},
{
"entropy": 6.274697399139404,
"epoch": 0.14954841419869774,
"grad_norm": 0.95703125,
"learning_rate": 0.0004999516272920283,
"loss": 6.3057,
"mean_token_accuracy": 0.12256114035844803,
"num_tokens": 3284723.0,
"step": 1780
},
{
"entropy": 6.149048852920532,
"epoch": 0.14996849401386264,
"grad_norm": 0.98828125,
"learning_rate": 0.000499951004362806,
"loss": 6.148,
"mean_token_accuracy": 0.12680203318595887,
"num_tokens": 3293860.0,
"step": 1785
},
{
"entropy": 6.166975450515747,
"epoch": 0.1503885738290275,
"grad_norm": 1.0078125,
"learning_rate": 0.0004999503774486924,
"loss": 6.1912,
"mean_token_accuracy": 0.12579060941934586,
"num_tokens": 3303158.0,
"step": 1790
},
{
"entropy": 6.141963243484497,
"epoch": 0.1508086536441924,
"grad_norm": 0.99609375,
"learning_rate": 0.0004999497465496987,
"loss": 6.127,
"mean_token_accuracy": 0.11945450827479362,
"num_tokens": 3313068.0,
"step": 1795
},
{
"entropy": 6.272381019592285,
"epoch": 0.15122873345935728,
"grad_norm": 1.0546875,
"learning_rate": 0.000499949111665836,
"loss": 6.2113,
"mean_token_accuracy": 0.12545057907700538,
"num_tokens": 3321885.0,
"step": 1800
},
{
"entropy": 6.260741281509399,
"epoch": 0.15164881327452215,
"grad_norm": 1.0078125,
"learning_rate": 0.0004999484727971158,
"loss": 6.2004,
"mean_token_accuracy": 0.1255272276699543,
"num_tokens": 3330924.0,
"step": 1805
},
{
"entropy": 6.217929172515869,
"epoch": 0.15206889308968705,
"grad_norm": 0.96875,
"learning_rate": 0.000499947829943549,
"loss": 6.2325,
"mean_token_accuracy": 0.12151647359132767,
"num_tokens": 3340070.0,
"step": 1810
},
{
"entropy": 6.26392617225647,
"epoch": 0.15248897290485192,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999471831051474,
"loss": 6.2282,
"mean_token_accuracy": 0.1356467515230179,
"num_tokens": 3349870.0,
"step": 1815
},
{
"entropy": 6.306232357025147,
"epoch": 0.1529090527200168,
"grad_norm": 0.99609375,
"learning_rate": 0.0004999465322819222,
"loss": 6.2642,
"mean_token_accuracy": 0.12093094363808632,
"num_tokens": 3359573.0,
"step": 1820
},
{
"entropy": 6.253879976272583,
"epoch": 0.15332913253518168,
"grad_norm": 1.03125,
"learning_rate": 0.0004999458774738851,
"loss": 6.2008,
"mean_token_accuracy": 0.1337040476500988,
"num_tokens": 3368577.0,
"step": 1825
},
{
"entropy": 6.189093828201294,
"epoch": 0.15374921235034655,
"grad_norm": 1.046875,
"learning_rate": 0.0004999452186810476,
"loss": 6.1859,
"mean_token_accuracy": 0.12909814566373826,
"num_tokens": 3377801.0,
"step": 1830
},
{
"entropy": 6.290127277374268,
"epoch": 0.15416929216551145,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999445559034214,
"loss": 6.2251,
"mean_token_accuracy": 0.13048515170812608,
"num_tokens": 3386666.0,
"step": 1835
},
{
"entropy": 6.365573501586914,
"epoch": 0.15458937198067632,
"grad_norm": 1.015625,
"learning_rate": 0.0004999438891410181,
"loss": 6.364,
"mean_token_accuracy": 0.11682043001055717,
"num_tokens": 3396086.0,
"step": 1840
},
{
"entropy": 6.219829654693603,
"epoch": 0.15500945179584122,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999432183938496,
"loss": 6.2847,
"mean_token_accuracy": 0.12174131944775582,
"num_tokens": 3404894.0,
"step": 1845
},
{
"entropy": 6.1805830001831055,
"epoch": 0.1554295316110061,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999425436619279,
"loss": 6.2621,
"mean_token_accuracy": 0.119705418497324,
"num_tokens": 3414172.0,
"step": 1850
},
{
"entropy": 6.348053646087647,
"epoch": 0.15584961142617096,
"grad_norm": 0.9453125,
"learning_rate": 0.000499941864945265,
"loss": 6.2477,
"mean_token_accuracy": 0.11934950053691865,
"num_tokens": 3423409.0,
"step": 1855
},
{
"entropy": 6.174108409881592,
"epoch": 0.15626969124133586,
"grad_norm": 0.98828125,
"learning_rate": 0.0004999411822438726,
"loss": 6.1935,
"mean_token_accuracy": 0.12505294382572174,
"num_tokens": 3433047.0,
"step": 1860
},
{
"entropy": 6.257612848281861,
"epoch": 0.15668977105650073,
"grad_norm": 1.109375,
"learning_rate": 0.000499940495557763,
"loss": 6.178,
"mean_token_accuracy": 0.12779488489031793,
"num_tokens": 3442490.0,
"step": 1865
},
{
"entropy": 6.248907375335693,
"epoch": 0.15710985087166562,
"grad_norm": 1.0234375,
"learning_rate": 0.0004999398048869485,
"loss": 6.2472,
"mean_token_accuracy": 0.12368729263544083,
"num_tokens": 3451804.0,
"step": 1870
},
{
"entropy": 6.3145753860473635,
"epoch": 0.1575299306868305,
"grad_norm": 1.015625,
"learning_rate": 0.000499939110231441,
"loss": 6.2386,
"mean_token_accuracy": 0.1283128082752228,
"num_tokens": 3461481.0,
"step": 1875
},
{
"entropy": 6.25744366645813,
"epoch": 0.1579500105019954,
"grad_norm": 1.1328125,
"learning_rate": 0.0004999384115912531,
"loss": 6.2722,
"mean_token_accuracy": 0.12746385112404823,
"num_tokens": 3471798.0,
"step": 1880
},
{
"entropy": 6.1273274421691895,
"epoch": 0.15837009031716026,
"grad_norm": 0.97265625,
"learning_rate": 0.000499937708966397,
"loss": 6.1695,
"mean_token_accuracy": 0.1235118955373764,
"num_tokens": 3481386.0,
"step": 1885
},
{
"entropy": 6.278209686279297,
"epoch": 0.15879017013232513,
"grad_norm": 1.0,
"learning_rate": 0.0004999370023568853,
"loss": 6.1682,
"mean_token_accuracy": 0.1265706330537796,
"num_tokens": 3489981.0,
"step": 1890
},
{
"entropy": 6.165022706985473,
"epoch": 0.15921024994749003,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999362917627304,
"loss": 6.1462,
"mean_token_accuracy": 0.12947175428271293,
"num_tokens": 3498551.0,
"step": 1895
},
{
"entropy": 6.203568363189698,
"epoch": 0.1596303297626549,
"grad_norm": 1.0625,
"learning_rate": 0.0004999355771839448,
"loss": 6.1261,
"mean_token_accuracy": 0.12978117987513543,
"num_tokens": 3507921.0,
"step": 1900
},
{
"entropy": 6.325990772247314,
"epoch": 0.1600504095778198,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999348586205414,
"loss": 6.3032,
"mean_token_accuracy": 0.1252683699131012,
"num_tokens": 3517570.0,
"step": 1905
},
{
"entropy": 6.316230726242066,
"epoch": 0.16047048939298467,
"grad_norm": 1.1015625,
"learning_rate": 0.0004999341360725327,
"loss": 6.2873,
"mean_token_accuracy": 0.1217451848089695,
"num_tokens": 3526774.0,
"step": 1910
},
{
"entropy": 6.25418004989624,
"epoch": 0.16089056920814954,
"grad_norm": 1.0859375,
"learning_rate": 0.0004999334095399317,
"loss": 6.2253,
"mean_token_accuracy": 0.13255979344248772,
"num_tokens": 3535319.0,
"step": 1915
},
{
"entropy": 6.0919859409332275,
"epoch": 0.16131064902331443,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999326790227512,
"loss": 6.192,
"mean_token_accuracy": 0.12934273406863211,
"num_tokens": 3544468.0,
"step": 1920
},
{
"entropy": 6.13924469947815,
"epoch": 0.1617307288384793,
"grad_norm": 0.98828125,
"learning_rate": 0.0004999319445210041,
"loss": 6.0772,
"mean_token_accuracy": 0.1331401713192463,
"num_tokens": 3553529.0,
"step": 1925
},
{
"entropy": 6.140831708908081,
"epoch": 0.1621508086536442,
"grad_norm": 0.984375,
"learning_rate": 0.0004999312060347034,
"loss": 6.1169,
"mean_token_accuracy": 0.128947152197361,
"num_tokens": 3563053.0,
"step": 1930
},
{
"entropy": 6.201217079162598,
"epoch": 0.16257088846880907,
"grad_norm": 0.97265625,
"learning_rate": 0.0004999304635638621,
"loss": 6.0724,
"mean_token_accuracy": 0.1313454084098339,
"num_tokens": 3571877.0,
"step": 1935
},
{
"entropy": 6.137188291549682,
"epoch": 0.16299096828397394,
"grad_norm": 0.9140625,
"learning_rate": 0.0004999297171084935,
"loss": 6.1144,
"mean_token_accuracy": 0.13178201764822006,
"num_tokens": 3581496.0,
"step": 1940
},
{
"entropy": 6.274984455108642,
"epoch": 0.16341104809913884,
"grad_norm": 0.984375,
"learning_rate": 0.0004999289666686109,
"loss": 6.1397,
"mean_token_accuracy": 0.12548287436366082,
"num_tokens": 3590752.0,
"step": 1945
},
{
"entropy": 6.027514791488647,
"epoch": 0.1638311279143037,
"grad_norm": 0.9765625,
"learning_rate": 0.0004999282122442274,
"loss": 6.1413,
"mean_token_accuracy": 0.12893687859177588,
"num_tokens": 3599885.0,
"step": 1950
},
{
"entropy": 6.314913415908814,
"epoch": 0.1642512077294686,
"grad_norm": 0.953125,
"learning_rate": 0.0004999274538353564,
"loss": 6.225,
"mean_token_accuracy": 0.12287019938230515,
"num_tokens": 3610039.0,
"step": 1955
},
{
"entropy": 6.137080287933349,
"epoch": 0.16467128754463348,
"grad_norm": 1.046875,
"learning_rate": 0.0004999266914420114,
"loss": 6.1398,
"mean_token_accuracy": 0.12600617855787277,
"num_tokens": 3619954.0,
"step": 1960
},
{
"entropy": 6.19411187171936,
"epoch": 0.16509136735979837,
"grad_norm": 1.0625,
"learning_rate": 0.000499925925064206,
"loss": 6.1087,
"mean_token_accuracy": 0.13167392686009408,
"num_tokens": 3628164.0,
"step": 1965
},
{
"entropy": 6.276717853546143,
"epoch": 0.16551144717496324,
"grad_norm": 1.0,
"learning_rate": 0.0004999251547019535,
"loss": 6.2662,
"mean_token_accuracy": 0.12937605381011963,
"num_tokens": 3636778.0,
"step": 1970
},
{
"entropy": 6.321251440048218,
"epoch": 0.16593152699012811,
"grad_norm": 1.0,
"learning_rate": 0.0004999243803552678,
"loss": 6.2031,
"mean_token_accuracy": 0.12865082323551177,
"num_tokens": 3647046.0,
"step": 1975
},
{
"entropy": 6.092304801940918,
"epoch": 0.166351606805293,
"grad_norm": 1.09375,
"learning_rate": 0.0004999236020241625,
"loss": 6.1208,
"mean_token_accuracy": 0.12650337740778922,
"num_tokens": 3656130.0,
"step": 1980
},
{
"entropy": 6.216774225234985,
"epoch": 0.16677168662045788,
"grad_norm": 1.046875,
"learning_rate": 0.0004999228197086514,
"loss": 6.1975,
"mean_token_accuracy": 0.11985883414745331,
"num_tokens": 3666145.0,
"step": 1985
},
{
"entropy": 6.222474908828735,
"epoch": 0.16719176643562278,
"grad_norm": 0.8984375,
"learning_rate": 0.0004999220334087484,
"loss": 6.2471,
"mean_token_accuracy": 0.1268869273364544,
"num_tokens": 3676722.0,
"step": 1990
},
{
"entropy": 6.2897861957550045,
"epoch": 0.16761184625078765,
"grad_norm": 1.0,
"learning_rate": 0.0004999212431244673,
"loss": 6.2493,
"mean_token_accuracy": 0.1202566534280777,
"num_tokens": 3685880.0,
"step": 1995
},
{
"entropy": 6.107799291610718,
"epoch": 0.16803192606595252,
"grad_norm": 0.9921875,
"learning_rate": 0.0004999204488558222,
"loss": 6.0634,
"mean_token_accuracy": 0.13050426542758942,
"num_tokens": 3695167.0,
"step": 2000
},
{
"entropy": 6.197722768783569,
"epoch": 0.16845200588111742,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999196506028273,
"loss": 6.183,
"mean_token_accuracy": 0.1260111801326275,
"num_tokens": 3703700.0,
"step": 2005
},
{
"entropy": 6.235522031784058,
"epoch": 0.1688720856962823,
"grad_norm": 1.1015625,
"learning_rate": 0.0004999188483654965,
"loss": 6.1217,
"mean_token_accuracy": 0.1260783888399601,
"num_tokens": 3712825.0,
"step": 2010
},
{
"entropy": 6.099165439605713,
"epoch": 0.16929216551144718,
"grad_norm": 0.953125,
"learning_rate": 0.0004999180421438442,
"loss": 6.089,
"mean_token_accuracy": 0.12902323082089423,
"num_tokens": 3721807.0,
"step": 2015
},
{
"entropy": 6.248017930984497,
"epoch": 0.16971224532661205,
"grad_norm": 1.09375,
"learning_rate": 0.0004999172319378846,
"loss": 6.2757,
"mean_token_accuracy": 0.12072234824299813,
"num_tokens": 3730502.0,
"step": 2020
},
{
"entropy": 6.263659858703614,
"epoch": 0.17013232514177692,
"grad_norm": 1.0078125,
"learning_rate": 0.0004999164177476319,
"loss": 6.149,
"mean_token_accuracy": 0.12994196712970735,
"num_tokens": 3739696.0,
"step": 2025
},
{
"entropy": 6.063019037246704,
"epoch": 0.17055240495694182,
"grad_norm": 1.0859375,
"learning_rate": 0.0004999155995731009,
"loss": 6.1537,
"mean_token_accuracy": 0.13063850849866868,
"num_tokens": 3748675.0,
"step": 2030
},
{
"entropy": 6.347209215164185,
"epoch": 0.1709724847721067,
"grad_norm": 1.046875,
"learning_rate": 0.0004999147774143057,
"loss": 6.233,
"mean_token_accuracy": 0.12277846410870552,
"num_tokens": 3757714.0,
"step": 2035
},
{
"entropy": 6.09723424911499,
"epoch": 0.1713925645872716,
"grad_norm": 0.984375,
"learning_rate": 0.000499913951271261,
"loss": 6.0474,
"mean_token_accuracy": 0.13144370764493943,
"num_tokens": 3767589.0,
"step": 2040
},
{
"entropy": 6.220744895935058,
"epoch": 0.17181264440243646,
"grad_norm": 1.1640625,
"learning_rate": 0.0004999131211439816,
"loss": 6.1603,
"mean_token_accuracy": 0.12925415337085724,
"num_tokens": 3777261.0,
"step": 2045
},
{
"entropy": 6.157608842849731,
"epoch": 0.17223272421760136,
"grad_norm": 1.0390625,
"learning_rate": 0.000499912287032482,
"loss": 6.1136,
"mean_token_accuracy": 0.13876328021287918,
"num_tokens": 3786658.0,
"step": 2050
},
{
"entropy": 6.097555351257324,
"epoch": 0.17265280403276623,
"grad_norm": 1.0703125,
"learning_rate": 0.000499911448936777,
"loss": 6.1039,
"mean_token_accuracy": 0.13591505512595176,
"num_tokens": 3794977.0,
"step": 2055
},
{
"entropy": 6.0962035179138185,
"epoch": 0.1730728838479311,
"grad_norm": 0.953125,
"learning_rate": 0.0004999106068568816,
"loss": 6.1794,
"mean_token_accuracy": 0.12809087112545967,
"num_tokens": 3805138.0,
"step": 2060
},
{
"entropy": 6.231352376937866,
"epoch": 0.173492963663096,
"grad_norm": 1.015625,
"learning_rate": 0.0004999097607928106,
"loss": 6.1206,
"mean_token_accuracy": 0.1363896384835243,
"num_tokens": 3814444.0,
"step": 2065
},
{
"entropy": 6.190911483764649,
"epoch": 0.17391304347826086,
"grad_norm": 1.0234375,
"learning_rate": 0.0004999089107445788,
"loss": 6.0912,
"mean_token_accuracy": 0.12825695872306825,
"num_tokens": 3822859.0,
"step": 2070
},
{
"entropy": 6.055686283111572,
"epoch": 0.17433312329342576,
"grad_norm": 0.97265625,
"learning_rate": 0.0004999080567122016,
"loss": 6.1054,
"mean_token_accuracy": 0.12722248658537866,
"num_tokens": 3833159.0,
"step": 2075
},
{
"entropy": 6.126674318313599,
"epoch": 0.17475320310859063,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999071986956941,
"loss": 6.1145,
"mean_token_accuracy": 0.13205313310027122,
"num_tokens": 3842136.0,
"step": 2080
},
{
"entropy": 6.132652282714844,
"epoch": 0.1751732829237555,
"grad_norm": 1.046875,
"learning_rate": 0.0004999063366950713,
"loss": 6.1975,
"mean_token_accuracy": 0.12595101371407508,
"num_tokens": 3851406.0,
"step": 2085
},
{
"entropy": 6.16230001449585,
"epoch": 0.1755933627389204,
"grad_norm": 1.015625,
"learning_rate": 0.0004999054707103486,
"loss": 6.1031,
"mean_token_accuracy": 0.1279636099934578,
"num_tokens": 3861061.0,
"step": 2090
},
{
"entropy": 6.178242540359497,
"epoch": 0.17601344255408527,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999046007415412,
"loss": 6.0821,
"mean_token_accuracy": 0.12796319127082825,
"num_tokens": 3870357.0,
"step": 2095
},
{
"entropy": 6.1904627799987795,
"epoch": 0.17643352236925017,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999037267886646,
"loss": 6.1006,
"mean_token_accuracy": 0.13064605742692947,
"num_tokens": 3879393.0,
"step": 2100
},
{
"entropy": 6.076858758926392,
"epoch": 0.17685360218441504,
"grad_norm": 1.078125,
"learning_rate": 0.0004999028488517343,
"loss": 6.1097,
"mean_token_accuracy": 0.1291967511177063,
"num_tokens": 3888030.0,
"step": 2105
},
{
"entropy": 6.22084493637085,
"epoch": 0.1772736819995799,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999019669307659,
"loss": 6.1189,
"mean_token_accuracy": 0.1333627261221409,
"num_tokens": 3897430.0,
"step": 2110
},
{
"entropy": 6.15262508392334,
"epoch": 0.1776937618147448,
"grad_norm": 0.96484375,
"learning_rate": 0.0004999010810257749,
"loss": 6.1461,
"mean_token_accuracy": 0.1225900873541832,
"num_tokens": 3907711.0,
"step": 2115
},
{
"entropy": 6.100615692138672,
"epoch": 0.17811384162990967,
"grad_norm": 1.03125,
"learning_rate": 0.0004999001911367771,
"loss": 6.0784,
"mean_token_accuracy": 0.13617549166083337,
"num_tokens": 3915816.0,
"step": 2120
},
{
"entropy": 6.130948638916015,
"epoch": 0.17853392144507457,
"grad_norm": 1.0078125,
"learning_rate": 0.0004998992972637883,
"loss": 6.2002,
"mean_token_accuracy": 0.12154756337404252,
"num_tokens": 3925162.0,
"step": 2125
},
{
"entropy": 6.210935020446778,
"epoch": 0.17895400126023944,
"grad_norm": 1.0078125,
"learning_rate": 0.0004998983994068242,
"loss": 6.0874,
"mean_token_accuracy": 0.1311741665005684,
"num_tokens": 3934476.0,
"step": 2130
},
{
"entropy": 6.098375844955444,
"epoch": 0.17937408107540434,
"grad_norm": 0.94921875,
"learning_rate": 0.0004998974975659006,
"loss": 6.1351,
"mean_token_accuracy": 0.12713489457964897,
"num_tokens": 3943501.0,
"step": 2135
},
{
"entropy": 6.198156356811523,
"epoch": 0.1797941608905692,
"grad_norm": 0.984375,
"learning_rate": 0.0004998965917410338,
"loss": 6.1279,
"mean_token_accuracy": 0.12831434607505798,
"num_tokens": 3953663.0,
"step": 2140
},
{
"entropy": 6.133723402023316,
"epoch": 0.18021424070573408,
"grad_norm": 1.0390625,
"learning_rate": 0.0004998956819322397,
"loss": 6.088,
"mean_token_accuracy": 0.12946128249168395,
"num_tokens": 3962634.0,
"step": 2145
},
{
"entropy": 6.1390259742736815,
"epoch": 0.18063432052089898,
"grad_norm": 1.0,
"learning_rate": 0.0004998947681395343,
"loss": 6.0855,
"mean_token_accuracy": 0.13397737592458725,
"num_tokens": 3972496.0,
"step": 2150
},
{
"entropy": 6.254479598999024,
"epoch": 0.18105440033606385,
"grad_norm": 1.0546875,
"learning_rate": 0.000499893850362934,
"loss": 6.3337,
"mean_token_accuracy": 0.12332669869065285,
"num_tokens": 3980724.0,
"step": 2155
},
{
"entropy": 6.2081263065338135,
"epoch": 0.18147448015122875,
"grad_norm": 1.0234375,
"learning_rate": 0.0004998929286024548,
"loss": 6.1709,
"mean_token_accuracy": 0.12541203945875168,
"num_tokens": 3989842.0,
"step": 2160
},
{
"entropy": 6.17150354385376,
"epoch": 0.18189455996639362,
"grad_norm": 1.109375,
"learning_rate": 0.0004998920028581133,
"loss": 6.0848,
"mean_token_accuracy": 0.1349337212741375,
"num_tokens": 3998534.0,
"step": 2165
},
{
"entropy": 6.159293746948242,
"epoch": 0.18231463978155849,
"grad_norm": 1.0,
"learning_rate": 0.0004998910731299258,
"loss": 6.0963,
"mean_token_accuracy": 0.12547213733196258,
"num_tokens": 4007677.0,
"step": 2170
},
{
"entropy": 6.151889276504517,
"epoch": 0.18273471959672338,
"grad_norm": 1.03125,
"learning_rate": 0.0004998901394179085,
"loss": 6.1632,
"mean_token_accuracy": 0.12913861274719238,
"num_tokens": 4016347.0,
"step": 2175
},
{
"entropy": 6.125647306442261,
"epoch": 0.18315479941188825,
"grad_norm": 1.109375,
"learning_rate": 0.0004998892017220784,
"loss": 6.0392,
"mean_token_accuracy": 0.13342646807432174,
"num_tokens": 4025199.0,
"step": 2180
},
{
"entropy": 6.153134059906006,
"epoch": 0.18357487922705315,
"grad_norm": 1.125,
"learning_rate": 0.0004998882600424519,
"loss": 6.0961,
"mean_token_accuracy": 0.12564898803830146,
"num_tokens": 4033933.0,
"step": 2185
},
{
"entropy": 6.154629516601562,
"epoch": 0.18399495904221802,
"grad_norm": 1.09375,
"learning_rate": 0.0004998873143790455,
"loss": 6.0291,
"mean_token_accuracy": 0.13878689035773278,
"num_tokens": 4042891.0,
"step": 2190
},
{
"entropy": 6.129179048538208,
"epoch": 0.1844150388573829,
"grad_norm": 1.03125,
"learning_rate": 0.0004998863647318763,
"loss": 6.1413,
"mean_token_accuracy": 0.1272033281624317,
"num_tokens": 4051123.0,
"step": 2195
},
{
"entropy": 6.1010294437408445,
"epoch": 0.1848351186725478,
"grad_norm": 1.1328125,
"learning_rate": 0.0004998854111009608,
"loss": 6.1152,
"mean_token_accuracy": 0.12936600148677826,
"num_tokens": 4060025.0,
"step": 2200
},
{
"entropy": 6.11760630607605,
"epoch": 0.18525519848771266,
"grad_norm": 0.94921875,
"learning_rate": 0.0004998844534863161,
"loss": 6.0205,
"mean_token_accuracy": 0.12755625769495965,
"num_tokens": 4069363.0,
"step": 2205
},
{
"entropy": 6.150998878479004,
"epoch": 0.18567527830287756,
"grad_norm": 0.99609375,
"learning_rate": 0.0004998834918879592,
"loss": 6.1697,
"mean_token_accuracy": 0.1331343524158001,
"num_tokens": 4078855.0,
"step": 2210
},
{
"entropy": 6.200693273544312,
"epoch": 0.18609535811804243,
"grad_norm": 0.94921875,
"learning_rate": 0.000499882526305907,
"loss": 6.1425,
"mean_token_accuracy": 0.12896015048027037,
"num_tokens": 4087801.0,
"step": 2215
},
{
"entropy": 6.137786483764648,
"epoch": 0.18651543793320732,
"grad_norm": 0.99609375,
"learning_rate": 0.0004998815567401765,
"loss": 6.1525,
"mean_token_accuracy": 0.12895300164818763,
"num_tokens": 4096949.0,
"step": 2220
},
{
"entropy": 6.203073024749756,
"epoch": 0.1869355177483722,
"grad_norm": 1.0546875,
"learning_rate": 0.0004998805831907851,
"loss": 6.1034,
"mean_token_accuracy": 0.1270811975002289,
"num_tokens": 4105399.0,
"step": 2225
},
{
"entropy": 6.1230597496032715,
"epoch": 0.18735559756353706,
"grad_norm": 1.0390625,
"learning_rate": 0.0004998796056577501,
"loss": 6.0488,
"mean_token_accuracy": 0.12729625552892684,
"num_tokens": 4113873.0,
"step": 2230
},
{
"entropy": 6.073714399337769,
"epoch": 0.18777567737870196,
"grad_norm": 0.9765625,
"learning_rate": 0.0004998786241410886,
"loss": 6.1026,
"mean_token_accuracy": 0.13166192471981047,
"num_tokens": 4123528.0,
"step": 2235
},
{
"entropy": 6.234827375411987,
"epoch": 0.18819575719386683,
"grad_norm": 0.96484375,
"learning_rate": 0.000499877638640818,
"loss": 6.1114,
"mean_token_accuracy": 0.12597778365015982,
"num_tokens": 4133370.0,
"step": 2240
},
{
"entropy": 6.071894741058349,
"epoch": 0.18861583700903173,
"grad_norm": 0.98828125,
"learning_rate": 0.000499876649156956,
"loss": 6.028,
"mean_token_accuracy": 0.1308871813118458,
"num_tokens": 4142370.0,
"step": 2245
},
{
"entropy": 6.07040696144104,
"epoch": 0.1890359168241966,
"grad_norm": 1.0234375,
"learning_rate": 0.0004998756556895196,
"loss": 6.1178,
"mean_token_accuracy": 0.13069864958524705,
"num_tokens": 4152367.0,
"step": 2250
},
{
"entropy": 6.167872524261474,
"epoch": 0.18945599663936147,
"grad_norm": 1.03125,
"learning_rate": 0.000499874658238527,
"loss": 6.1024,
"mean_token_accuracy": 0.13190473541617392,
"num_tokens": 4161126.0,
"step": 2255
},
{
"entropy": 6.139232063293457,
"epoch": 0.18987607645452637,
"grad_norm": 1.0546875,
"learning_rate": 0.0004998736568039957,
"loss": 5.9976,
"mean_token_accuracy": 0.13119693994522094,
"num_tokens": 4169910.0,
"step": 2260
},
{
"entropy": 6.1185729026794435,
"epoch": 0.19029615626969124,
"grad_norm": 1.0234375,
"learning_rate": 0.0004998726513859432,
"loss": 6.1461,
"mean_token_accuracy": 0.12717667669057847,
"num_tokens": 4179893.0,
"step": 2265
},
{
"entropy": 6.202062654495239,
"epoch": 0.19071623608485613,
"grad_norm": 0.94921875,
"learning_rate": 0.0004998716419843875,
"loss": 6.1633,
"mean_token_accuracy": 0.13450514376163483,
"num_tokens": 4190065.0,
"step": 2270
},
{
"entropy": 6.053268957138061,
"epoch": 0.191136315900021,
"grad_norm": 1.0859375,
"learning_rate": 0.0004998706285993465,
"loss": 6.0722,
"mean_token_accuracy": 0.12834293991327286,
"num_tokens": 4198395.0,
"step": 2275
},
{
"entropy": 6.130187177658081,
"epoch": 0.19155639571518587,
"grad_norm": 0.99609375,
"learning_rate": 0.0004998696112308381,
"loss": 6.0944,
"mean_token_accuracy": 0.12858275175094605,
"num_tokens": 4207555.0,
"step": 2280
},
{
"entropy": 6.074777889251709,
"epoch": 0.19197647553035077,
"grad_norm": 0.984375,
"learning_rate": 0.0004998685898788803,
"loss": 6.0485,
"mean_token_accuracy": 0.13106716349720954,
"num_tokens": 4216533.0,
"step": 2285
},
{
"entropy": 6.191452217102051,
"epoch": 0.19239655534551564,
"grad_norm": 1.1328125,
"learning_rate": 0.0004998675645434914,
"loss": 6.1523,
"mean_token_accuracy": 0.13225150257349014,
"num_tokens": 4225575.0,
"step": 2290
},
{
"entropy": 6.0228188037872314,
"epoch": 0.19281663516068054,
"grad_norm": 1.0546875,
"learning_rate": 0.0004998665352246891,
"loss": 5.9361,
"mean_token_accuracy": 0.13841283321380615,
"num_tokens": 4234306.0,
"step": 2295
},
{
"entropy": 6.043151473999023,
"epoch": 0.1932367149758454,
"grad_norm": 1.015625,
"learning_rate": 0.0004998655019224921,
"loss": 6.1283,
"mean_token_accuracy": 0.13190191760659217,
"num_tokens": 4243998.0,
"step": 2300
},
{
"entropy": 6.166877937316895,
"epoch": 0.19365679479101028,
"grad_norm": 1.0234375,
"learning_rate": 0.0004998644646369185,
"loss": 6.0139,
"mean_token_accuracy": 0.12847840487957002,
"num_tokens": 4253653.0,
"step": 2305
},
{
"entropy": 6.034109115600586,
"epoch": 0.19407687460617518,
"grad_norm": 1.046875,
"learning_rate": 0.0004998634233679865,
"loss": 6.0949,
"mean_token_accuracy": 0.12612878382205964,
"num_tokens": 4263305.0,
"step": 2310
},
{
"entropy": 6.1194260597229,
"epoch": 0.19449695442134005,
"grad_norm": 1.046875,
"learning_rate": 0.000499862378115715,
"loss": 5.9818,
"mean_token_accuracy": 0.13570686057209969,
"num_tokens": 4272212.0,
"step": 2315
},
{
"entropy": 6.182863759994507,
"epoch": 0.19491703423650494,
"grad_norm": 1.0859375,
"learning_rate": 0.0004998613288801221,
"loss": 6.1959,
"mean_token_accuracy": 0.1276652343571186,
"num_tokens": 4281445.0,
"step": 2320
},
{
"entropy": 6.213861799240112,
"epoch": 0.1953371140516698,
"grad_norm": 0.9609375,
"learning_rate": 0.0004998602756612267,
"loss": 6.1058,
"mean_token_accuracy": 0.12670243680477142,
"num_tokens": 4290938.0,
"step": 2325
},
{
"entropy": 6.066101360321045,
"epoch": 0.1957571938668347,
"grad_norm": 1.015625,
"learning_rate": 0.0004998592184590471,
"loss": 6.1379,
"mean_token_accuracy": 0.12725966945290565,
"num_tokens": 4300022.0,
"step": 2330
},
{
"entropy": 6.080681276321411,
"epoch": 0.19617727368199958,
"grad_norm": 1.0859375,
"learning_rate": 0.0004998581572736024,
"loss": 6.019,
"mean_token_accuracy": 0.1344592235982418,
"num_tokens": 4308910.0,
"step": 2335
},
{
"entropy": 6.050167417526245,
"epoch": 0.19659735349716445,
"grad_norm": 0.9765625,
"learning_rate": 0.0004998570921049112,
"loss": 5.9814,
"mean_token_accuracy": 0.13220275193452835,
"num_tokens": 4317136.0,
"step": 2340
},
{
"entropy": 6.09632978439331,
"epoch": 0.19701743331232935,
"grad_norm": 1.078125,
"learning_rate": 0.0004998560229529924,
"loss": 6.0501,
"mean_token_accuracy": 0.1387757182121277,
"num_tokens": 4326163.0,
"step": 2345
},
{
"entropy": 6.229255342483521,
"epoch": 0.19743751312749422,
"grad_norm": 1.0234375,
"learning_rate": 0.0004998549498178649,
"loss": 6.1921,
"mean_token_accuracy": 0.12868764251470566,
"num_tokens": 4335837.0,
"step": 2350
},
{
"entropy": 6.126859140396118,
"epoch": 0.19785759294265912,
"grad_norm": 1.1171875,
"learning_rate": 0.0004998538726995477,
"loss": 6.1084,
"mean_token_accuracy": 0.13344382494688034,
"num_tokens": 4345108.0,
"step": 2355
},
{
"entropy": 6.168588161468506,
"epoch": 0.198277672757824,
"grad_norm": 0.9765625,
"learning_rate": 0.00049985279159806,
"loss": 6.119,
"mean_token_accuracy": 0.12684730514883996,
"num_tokens": 4353761.0,
"step": 2360
},
{
"entropy": 6.090028953552246,
"epoch": 0.19869775257298886,
"grad_norm": 1.0234375,
"learning_rate": 0.0004998517065134208,
"loss": 6.0824,
"mean_token_accuracy": 0.13213628232479097,
"num_tokens": 4363244.0,
"step": 2365
},
{
"entropy": 6.138245010375977,
"epoch": 0.19911783238815375,
"grad_norm": 0.9921875,
"learning_rate": 0.0004998506174456494,
"loss": 6.0839,
"mean_token_accuracy": 0.12802947238087653,
"num_tokens": 4373034.0,
"step": 2370
},
{
"entropy": 6.12951922416687,
"epoch": 0.19953791220331862,
"grad_norm": 0.9453125,
"learning_rate": 0.0004998495243947653,
"loss": 6.0216,
"mean_token_accuracy": 0.1251508317887783,
"num_tokens": 4382554.0,
"step": 2375
},
{
"entropy": 6.1475914478302,
"epoch": 0.19995799201848352,
"grad_norm": 1.140625,
"learning_rate": 0.0004998484273607875,
"loss": 6.0463,
"mean_token_accuracy": 0.136245708912611,
"num_tokens": 4391001.0,
"step": 2380
},
{
"entropy": 5.926258325576782,
"epoch": 0.2003780718336484,
"grad_norm": 0.9765625,
"learning_rate": 0.0004998473263437356,
"loss": 5.9565,
"mean_token_accuracy": 0.13519108295440674,
"num_tokens": 4400632.0,
"step": 2385
},
{
"entropy": 6.048220825195313,
"epoch": 0.20079815164881326,
"grad_norm": 1.0234375,
"learning_rate": 0.000499846221343629,
"loss": 6.051,
"mean_token_accuracy": 0.13025175258517266,
"num_tokens": 4409565.0,
"step": 2390
},
{
"entropy": 6.0700782299041744,
"epoch": 0.20121823146397816,
"grad_norm": 1.0390625,
"learning_rate": 0.0004998451123604875,
"loss": 5.9988,
"mean_token_accuracy": 0.14039506316184996,
"num_tokens": 4418384.0,
"step": 2395
},
{
"entropy": 6.146504878997803,
"epoch": 0.20163831127914303,
"grad_norm": 1.0625,
"learning_rate": 0.0004998439993943306,
"loss": 6.1232,
"mean_token_accuracy": 0.13494747802615165,
"num_tokens": 4427581.0,
"step": 2400
},
{
"entropy": 6.175554275512695,
"epoch": 0.20205839109430793,
"grad_norm": 1.0546875,
"learning_rate": 0.0004998428824451779,
"loss": 6.1094,
"mean_token_accuracy": 0.1269066423177719,
"num_tokens": 4436572.0,
"step": 2405
},
{
"entropy": 6.086094999313355,
"epoch": 0.2024784709094728,
"grad_norm": 1.0546875,
"learning_rate": 0.0004998417615130495,
"loss": 6.1156,
"mean_token_accuracy": 0.12977832332253456,
"num_tokens": 4445230.0,
"step": 2410
},
{
"entropy": 6.189484167098999,
"epoch": 0.2028985507246377,
"grad_norm": 1.0390625,
"learning_rate": 0.0004998406365979649,
"loss": 6.1725,
"mean_token_accuracy": 0.13100939616560936,
"num_tokens": 4454251.0,
"step": 2415
},
{
"entropy": 6.080749225616455,
"epoch": 0.20331863053980256,
"grad_norm": 0.953125,
"learning_rate": 0.0004998395076999443,
"loss": 6.0178,
"mean_token_accuracy": 0.13957264572381972,
"num_tokens": 4463949.0,
"step": 2420
},
{
"entropy": 6.179844999313355,
"epoch": 0.20373871035496743,
"grad_norm": 1.03125,
"learning_rate": 0.0004998383748190076,
"loss": 6.2136,
"mean_token_accuracy": 0.1258860044181347,
"num_tokens": 4473373.0,
"step": 2425
},
{
"entropy": 6.209265089035034,
"epoch": 0.20415879017013233,
"grad_norm": 1.1328125,
"learning_rate": 0.0004998372379551748,
"loss": 6.0447,
"mean_token_accuracy": 0.13152522593736649,
"num_tokens": 4482303.0,
"step": 2430
},
{
"entropy": 6.047933959960938,
"epoch": 0.2045788699852972,
"grad_norm": 1.03125,
"learning_rate": 0.0004998360971084663,
"loss": 6.0094,
"mean_token_accuracy": 0.1300078108906746,
"num_tokens": 4491214.0,
"step": 2435
},
{
"entropy": 5.923014402389526,
"epoch": 0.2049989498004621,
"grad_norm": 1.0390625,
"learning_rate": 0.0004998349522789019,
"loss": 5.9367,
"mean_token_accuracy": 0.13840087428689002,
"num_tokens": 4500099.0,
"step": 2440
},
{
"entropy": 6.065491151809693,
"epoch": 0.20541902961562697,
"grad_norm": 1.0,
"learning_rate": 0.0004998338034665021,
"loss": 6.0088,
"mean_token_accuracy": 0.14065721929073333,
"num_tokens": 4509893.0,
"step": 2445
},
{
"entropy": 6.0568382263183596,
"epoch": 0.20583910943079184,
"grad_norm": 1.0390625,
"learning_rate": 0.0004998326506712872,
"loss": 5.9933,
"mean_token_accuracy": 0.13122306242585183,
"num_tokens": 4518606.0,
"step": 2450
},
{
"entropy": 6.102985429763794,
"epoch": 0.20625918924595674,
"grad_norm": 1.0625,
"learning_rate": 0.0004998314938932778,
"loss": 6.0731,
"mean_token_accuracy": 0.13163421601057052,
"num_tokens": 4528392.0,
"step": 2455
},
{
"entropy": 6.143069076538086,
"epoch": 0.2066792690611216,
"grad_norm": 1.03125,
"learning_rate": 0.0004998303331324943,
"loss": 6.0446,
"mean_token_accuracy": 0.13527958691120148,
"num_tokens": 4536983.0,
"step": 2460
},
{
"entropy": 5.983616924285888,
"epoch": 0.2070993488762865,
"grad_norm": 1.0234375,
"learning_rate": 0.0004998291683889571,
"loss": 5.9545,
"mean_token_accuracy": 0.13531955927610398,
"num_tokens": 4544967.0,
"step": 2465
},
{
"entropy": 6.107338380813599,
"epoch": 0.20751942869145137,
"grad_norm": 1.0859375,
"learning_rate": 0.000499827999662687,
"loss": 6.0262,
"mean_token_accuracy": 0.12663825750350952,
"num_tokens": 4554646.0,
"step": 2470
},
{
"entropy": 6.1525249004364015,
"epoch": 0.20793950850661624,
"grad_norm": 0.984375,
"learning_rate": 0.0004998268269537046,
"loss": 6.0498,
"mean_token_accuracy": 0.13295727223157883,
"num_tokens": 4564040.0,
"step": 2475
},
{
"entropy": 5.977402019500732,
"epoch": 0.20835958832178114,
"grad_norm": 1.015625,
"learning_rate": 0.0004998256502620308,
"loss": 6.0631,
"mean_token_accuracy": 0.13584776520729064,
"num_tokens": 4573758.0,
"step": 2480
},
{
"entropy": 6.187655830383301,
"epoch": 0.208779668136946,
"grad_norm": 0.9765625,
"learning_rate": 0.0004998244695876864,
"loss": 6.0894,
"mean_token_accuracy": 0.12783714309334754,
"num_tokens": 4582097.0,
"step": 2485
},
{
"entropy": 6.000187587738037,
"epoch": 0.2091997479521109,
"grad_norm": 1.0859375,
"learning_rate": 0.0004998232849306921,
"loss": 6.0587,
"mean_token_accuracy": 0.1367252618074417,
"num_tokens": 4590687.0,
"step": 2490
},
{
"entropy": 6.167983675003052,
"epoch": 0.20961982776727578,
"grad_norm": 1.0625,
"learning_rate": 0.0004998220962910693,
"loss": 6.0418,
"mean_token_accuracy": 0.1291399121284485,
"num_tokens": 4599497.0,
"step": 2495
},
{
"entropy": 6.0570958137512205,
"epoch": 0.21003990758244068,
"grad_norm": 1.109375,
"learning_rate": 0.0004998209036688386,
"loss": 6.0052,
"mean_token_accuracy": 0.134087672829628,
"num_tokens": 4607958.0,
"step": 2500
},
{
"entropy": 6.154553699493408,
"epoch": 0.21045998739760555,
"grad_norm": 1.0390625,
"learning_rate": 0.0004998197070640216,
"loss": 6.1436,
"mean_token_accuracy": 0.1265629693865776,
"num_tokens": 4617515.0,
"step": 2505
},
{
"entropy": 6.159878873825074,
"epoch": 0.21088006721277042,
"grad_norm": 0.99609375,
"learning_rate": 0.0004998185064766391,
"loss": 6.029,
"mean_token_accuracy": 0.13321105763316154,
"num_tokens": 4627037.0,
"step": 2510
},
{
"entropy": 5.991772747039795,
"epoch": 0.21130014702793531,
"grad_norm": 0.96875,
"learning_rate": 0.0004998173019067127,
"loss": 6.0263,
"mean_token_accuracy": 0.13551492914557456,
"num_tokens": 4637393.0,
"step": 2515
},
{
"entropy": 6.076245498657227,
"epoch": 0.21172022684310018,
"grad_norm": 1.0078125,
"learning_rate": 0.0004998160933542633,
"loss": 6.0656,
"mean_token_accuracy": 0.1218369334936142,
"num_tokens": 4646832.0,
"step": 2520
},
{
"entropy": 6.151754760742188,
"epoch": 0.21214030665826508,
"grad_norm": 1.109375,
"learning_rate": 0.0004998148808193128,
"loss": 6.0983,
"mean_token_accuracy": 0.13457245901226997,
"num_tokens": 4655719.0,
"step": 2525
},
{
"entropy": 6.129997682571411,
"epoch": 0.21256038647342995,
"grad_norm": 1.03125,
"learning_rate": 0.0004998136643018823,
"loss": 6.0362,
"mean_token_accuracy": 0.13282962441444396,
"num_tokens": 4665364.0,
"step": 2530
},
{
"entropy": 6.060281705856323,
"epoch": 0.21298046628859482,
"grad_norm": 1.0390625,
"learning_rate": 0.0004998124438019935,
"loss": 6.016,
"mean_token_accuracy": 0.1327340230345726,
"num_tokens": 4674760.0,
"step": 2535
},
{
"entropy": 5.969087028503418,
"epoch": 0.21340054610375972,
"grad_norm": 0.99609375,
"learning_rate": 0.0004998112193196681,
"loss": 5.9355,
"mean_token_accuracy": 0.1348019614815712,
"num_tokens": 4683900.0,
"step": 2540
},
{
"entropy": 5.984380483627319,
"epoch": 0.2138206259189246,
"grad_norm": 1.0234375,
"learning_rate": 0.0004998099908549277,
"loss": 5.9913,
"mean_token_accuracy": 0.13222553506493567,
"num_tokens": 4693915.0,
"step": 2545
},
{
"entropy": 5.969556903839111,
"epoch": 0.2142407057340895,
"grad_norm": 1.0234375,
"learning_rate": 0.000499808758407794,
"loss": 5.8426,
"mean_token_accuracy": 0.14078716412186623,
"num_tokens": 4703102.0,
"step": 2550
},
{
"entropy": 6.0391675472259525,
"epoch": 0.21466078554925436,
"grad_norm": 1.015625,
"learning_rate": 0.0004998075219782889,
"loss": 6.076,
"mean_token_accuracy": 0.13493016585707665,
"num_tokens": 4712925.0,
"step": 2555
},
{
"entropy": 6.044742774963379,
"epoch": 0.21508086536441923,
"grad_norm": 1.09375,
"learning_rate": 0.0004998062815664344,
"loss": 6.0143,
"mean_token_accuracy": 0.1313328728079796,
"num_tokens": 4722641.0,
"step": 2560
},
{
"entropy": 6.006574583053589,
"epoch": 0.21550094517958412,
"grad_norm": 1.0390625,
"learning_rate": 0.0004998050371722524,
"loss": 6.0658,
"mean_token_accuracy": 0.13319918289780616,
"num_tokens": 4732603.0,
"step": 2565
},
{
"entropy": 5.998908233642578,
"epoch": 0.215921024994749,
"grad_norm": 0.97265625,
"learning_rate": 0.0004998037887957649,
"loss": 5.91,
"mean_token_accuracy": 0.13604277446866037,
"num_tokens": 4742644.0,
"step": 2570
},
{
"entropy": 6.173721647262573,
"epoch": 0.2163411048099139,
"grad_norm": 1.046875,
"learning_rate": 0.0004998025364369939,
"loss": 6.2348,
"mean_token_accuracy": 0.1272044688463211,
"num_tokens": 4751482.0,
"step": 2575
},
{
"entropy": 6.235174703598022,
"epoch": 0.21676118462507876,
"grad_norm": 1.09375,
"learning_rate": 0.0004998012800959619,
"loss": 6.0898,
"mean_token_accuracy": 0.13052162379026414,
"num_tokens": 4760593.0,
"step": 2580
},
{
"entropy": 6.111858797073364,
"epoch": 0.21718126444024366,
"grad_norm": 1.109375,
"learning_rate": 0.0004998000197726909,
"loss": 6.0859,
"mean_token_accuracy": 0.13651981949806213,
"num_tokens": 4769294.0,
"step": 2585
},
{
"entropy": 6.069575262069702,
"epoch": 0.21760134425540853,
"grad_norm": 0.921875,
"learning_rate": 0.0004997987554672033,
"loss": 5.9867,
"mean_token_accuracy": 0.13439425751566886,
"num_tokens": 4779239.0,
"step": 2590
},
{
"entropy": 6.047330856323242,
"epoch": 0.2180214240705734,
"grad_norm": 0.98828125,
"learning_rate": 0.0004997974871795215,
"loss": 6.0684,
"mean_token_accuracy": 0.1306284710764885,
"num_tokens": 4788211.0,
"step": 2595
},
{
"entropy": 6.074938344955444,
"epoch": 0.2184415038857383,
"grad_norm": 0.9375,
"learning_rate": 0.000499796214909668,
"loss": 6.0291,
"mean_token_accuracy": 0.13745831623673438,
"num_tokens": 4797921.0,
"step": 2600
},
{
"entropy": 6.087161684036255,
"epoch": 0.21886158370090317,
"grad_norm": 1.0234375,
"learning_rate": 0.0004997949386576653,
"loss": 6.0259,
"mean_token_accuracy": 0.13380660563707353,
"num_tokens": 4807772.0,
"step": 2605
},
{
"entropy": 6.045609188079834,
"epoch": 0.21928166351606806,
"grad_norm": 0.96484375,
"learning_rate": 0.000499793658423536,
"loss": 6.053,
"mean_token_accuracy": 0.12916495203971862,
"num_tokens": 4817999.0,
"step": 2610
},
{
"entropy": 6.102342319488526,
"epoch": 0.21970174333123293,
"grad_norm": 1.1171875,
"learning_rate": 0.0004997923742073028,
"loss": 6.0064,
"mean_token_accuracy": 0.14091803804039954,
"num_tokens": 4826679.0,
"step": 2615
},
{
"entropy": 5.991182327270508,
"epoch": 0.2201218231463978,
"grad_norm": 1.0703125,
"learning_rate": 0.0004997910860089884,
"loss": 6.0052,
"mean_token_accuracy": 0.13694314211606978,
"num_tokens": 4834998.0,
"step": 2620
},
{
"entropy": 6.100556564331055,
"epoch": 0.2205419029615627,
"grad_norm": 1.0390625,
"learning_rate": 0.0004997897938286156,
"loss": 5.9615,
"mean_token_accuracy": 0.13899449855089188,
"num_tokens": 4843635.0,
"step": 2625
},
{
"entropy": 6.097274303436279,
"epoch": 0.22096198277672757,
"grad_norm": 1.140625,
"learning_rate": 0.0004997884976662075,
"loss": 6.0782,
"mean_token_accuracy": 0.13321034833788872,
"num_tokens": 4852027.0,
"step": 2630
},
{
"entropy": 6.147892570495605,
"epoch": 0.22138206259189247,
"grad_norm": 1.046875,
"learning_rate": 0.0004997871975217868,
"loss": 5.997,
"mean_token_accuracy": 0.14202353730797768,
"num_tokens": 4861244.0,
"step": 2635
},
{
"entropy": 5.8932945251464846,
"epoch": 0.22180214240705734,
"grad_norm": 0.99609375,
"learning_rate": 0.0004997858933953768,
"loss": 5.9307,
"mean_token_accuracy": 0.13638841062784196,
"num_tokens": 4869902.0,
"step": 2640
},
{
"entropy": 5.95978422164917,
"epoch": 0.2222222222222222,
"grad_norm": 1.0078125,
"learning_rate": 0.0004997845852870004,
"loss": 5.8916,
"mean_token_accuracy": 0.1398716911673546,
"num_tokens": 4878502.0,
"step": 2645
},
{
"entropy": 6.00921368598938,
"epoch": 0.2226423020373871,
"grad_norm": 1.046875,
"learning_rate": 0.0004997832731966806,
"loss": 5.9483,
"mean_token_accuracy": 0.14096327498555183,
"num_tokens": 4888348.0,
"step": 2650
},
{
"entropy": 5.991452217102051,
"epoch": 0.22306238185255198,
"grad_norm": 1.0703125,
"learning_rate": 0.0004997819571244411,
"loss": 6.0189,
"mean_token_accuracy": 0.1372461050748825,
"num_tokens": 4897302.0,
"step": 2655
},
{
"entropy": 6.012991952896118,
"epoch": 0.22348246166771688,
"grad_norm": 1.03125,
"learning_rate": 0.0004997806370703049,
"loss": 6.0444,
"mean_token_accuracy": 0.13569730073213576,
"num_tokens": 4907078.0,
"step": 2660
},
{
"entropy": 5.988911724090576,
"epoch": 0.22390254148288175,
"grad_norm": 0.921875,
"learning_rate": 0.0004997793130342954,
"loss": 5.874,
"mean_token_accuracy": 0.13960873782634736,
"num_tokens": 4917489.0,
"step": 2665
},
{
"entropy": 5.930651092529297,
"epoch": 0.22432262129804661,
"grad_norm": 1.046875,
"learning_rate": 0.0004997779850164363,
"loss": 5.9779,
"mean_token_accuracy": 0.13561228066682815,
"num_tokens": 4927073.0,
"step": 2670
},
{
"entropy": 6.158057308197021,
"epoch": 0.2247427011132115,
"grad_norm": 1.0546875,
"learning_rate": 0.0004997766530167508,
"loss": 6.0815,
"mean_token_accuracy": 0.13055123686790465,
"num_tokens": 4935464.0,
"step": 2675
},
{
"entropy": 6.137771940231323,
"epoch": 0.22516278092837638,
"grad_norm": 1.1015625,
"learning_rate": 0.0004997753170352627,
"loss": 6.1621,
"mean_token_accuracy": 0.12912468686699868,
"num_tokens": 4944718.0,
"step": 2680
},
{
"entropy": 6.082327508926392,
"epoch": 0.22558286074354128,
"grad_norm": 1.125,
"learning_rate": 0.0004997739770719955,
"loss": 6.0351,
"mean_token_accuracy": 0.13314241990447045,
"num_tokens": 4954223.0,
"step": 2685
},
{
"entropy": 6.058599901199341,
"epoch": 0.22600294055870615,
"grad_norm": 0.96875,
"learning_rate": 0.000499772633126973,
"loss": 6.0734,
"mean_token_accuracy": 0.132587993144989,
"num_tokens": 4963371.0,
"step": 2690
},
{
"entropy": 6.008123540878296,
"epoch": 0.22642302037387105,
"grad_norm": 1.0859375,
"learning_rate": 0.0004997712852002192,
"loss": 5.9177,
"mean_token_accuracy": 0.14214024171233178,
"num_tokens": 4972973.0,
"step": 2695
},
{
"entropy": 6.0829901695251465,
"epoch": 0.22684310018903592,
"grad_norm": 1.09375,
"learning_rate": 0.0004997699332917578,
"loss": 6.1705,
"mean_token_accuracy": 0.12515681982040405,
"num_tokens": 4982808.0,
"step": 2700
},
{
"entropy": 6.190164613723755,
"epoch": 0.2272631800042008,
"grad_norm": 0.984375,
"learning_rate": 0.0004997685774016127,
"loss": 6.0492,
"mean_token_accuracy": 0.13310236856341362,
"num_tokens": 4992427.0,
"step": 2705
},
{
"entropy": 6.118629407882691,
"epoch": 0.22768325981936569,
"grad_norm": 0.9296875,
"learning_rate": 0.000499767217529808,
"loss": 6.2194,
"mean_token_accuracy": 0.1233941525220871,
"num_tokens": 5003562.0,
"step": 2710
},
{
"entropy": 6.01776008605957,
"epoch": 0.22810333963453056,
"grad_norm": 0.91015625,
"learning_rate": 0.0004997658536763678,
"loss": 5.9127,
"mean_token_accuracy": 0.13683665543794632,
"num_tokens": 5013429.0,
"step": 2715
},
{
"entropy": 6.101313972473145,
"epoch": 0.22852341944969545,
"grad_norm": 0.99609375,
"learning_rate": 0.0004997644858413163,
"loss": 6.0303,
"mean_token_accuracy": 0.13920028880238533,
"num_tokens": 5022045.0,
"step": 2720
},
{
"entropy": 5.957802677154541,
"epoch": 0.22894349926486032,
"grad_norm": 0.9609375,
"learning_rate": 0.0004997631140246775,
"loss": 5.8742,
"mean_token_accuracy": 0.1393354929983616,
"num_tokens": 5032260.0,
"step": 2725
},
{
"entropy": 6.008182525634766,
"epoch": 0.2293635790800252,
"grad_norm": 1.015625,
"learning_rate": 0.000499761738226476,
"loss": 5.9304,
"mean_token_accuracy": 0.13518433645367622,
"num_tokens": 5041688.0,
"step": 2730
},
{
"entropy": 6.007689189910889,
"epoch": 0.2297836588951901,
"grad_norm": 1.0703125,
"learning_rate": 0.000499760358446736,
"loss": 6.0285,
"mean_token_accuracy": 0.1302636370062828,
"num_tokens": 5051005.0,
"step": 2735
},
{
"entropy": 6.086931037902832,
"epoch": 0.23020373871035496,
"grad_norm": 1.015625,
"learning_rate": 0.000499758974685482,
"loss": 5.9683,
"mean_token_accuracy": 0.13642121478915215,
"num_tokens": 5060084.0,
"step": 2740
},
{
"entropy": 6.020289707183838,
"epoch": 0.23062381852551986,
"grad_norm": 1.0859375,
"learning_rate": 0.0004997575869427385,
"loss": 5.9633,
"mean_token_accuracy": 0.14196947664022447,
"num_tokens": 5069081.0,
"step": 2745
},
{
"entropy": 6.004650115966797,
"epoch": 0.23104389834068473,
"grad_norm": 1.015625,
"learning_rate": 0.00049975619521853,
"loss": 5.9567,
"mean_token_accuracy": 0.12970962673425673,
"num_tokens": 5078597.0,
"step": 2750
},
{
"entropy": 5.982837867736817,
"epoch": 0.2314639781558496,
"grad_norm": 1.015625,
"learning_rate": 0.0004997547995128814,
"loss": 6.0116,
"mean_token_accuracy": 0.13743849247694015,
"num_tokens": 5087607.0,
"step": 2755
},
{
"entropy": 6.052627325057983,
"epoch": 0.2318840579710145,
"grad_norm": 1.1015625,
"learning_rate": 0.0004997533998258171,
"loss": 6.0129,
"mean_token_accuracy": 0.13540438339114189,
"num_tokens": 5097412.0,
"step": 2760
},
{
"entropy": 6.117385768890381,
"epoch": 0.23230413778617937,
"grad_norm": 1.078125,
"learning_rate": 0.0004997519961573622,
"loss": 6.0833,
"mean_token_accuracy": 0.12936894744634628,
"num_tokens": 5105817.0,
"step": 2765
},
{
"entropy": 6.181453561782837,
"epoch": 0.23272421760134426,
"grad_norm": 1.1328125,
"learning_rate": 0.0004997505885075414,
"loss": 6.1236,
"mean_token_accuracy": 0.131087327003479,
"num_tokens": 5114958.0,
"step": 2770
},
{
"entropy": 6.078393983840942,
"epoch": 0.23314429741650913,
"grad_norm": 1.03125,
"learning_rate": 0.0004997491768763795,
"loss": 6.0399,
"mean_token_accuracy": 0.13298976346850394,
"num_tokens": 5123728.0,
"step": 2775
},
{
"entropy": 6.067098140716553,
"epoch": 0.23356437723167403,
"grad_norm": 1.0859375,
"learning_rate": 0.0004997477612639018,
"loss": 6.0705,
"mean_token_accuracy": 0.13075175881385803,
"num_tokens": 5134099.0,
"step": 2780
},
{
"entropy": 6.135926675796509,
"epoch": 0.2339844570468389,
"grad_norm": 1.078125,
"learning_rate": 0.0004997463416701332,
"loss": 6.0951,
"mean_token_accuracy": 0.12707924395799636,
"num_tokens": 5142934.0,
"step": 2785
},
{
"entropy": 6.010523986816406,
"epoch": 0.23440453686200377,
"grad_norm": 1.0703125,
"learning_rate": 0.0004997449180950989,
"loss": 5.9263,
"mean_token_accuracy": 0.15269666612148286,
"num_tokens": 5151835.0,
"step": 2790
},
{
"entropy": 5.981678676605225,
"epoch": 0.23482461667716867,
"grad_norm": 0.95703125,
"learning_rate": 0.0004997434905388241,
"loss": 5.9711,
"mean_token_accuracy": 0.1440896660089493,
"num_tokens": 5161136.0,
"step": 2795
},
{
"entropy": 6.001285219192505,
"epoch": 0.23524469649233354,
"grad_norm": 0.96875,
"learning_rate": 0.000499742059001334,
"loss": 5.9034,
"mean_token_accuracy": 0.13934119418263435,
"num_tokens": 5170741.0,
"step": 2800
},
{
"entropy": 6.018690299987793,
"epoch": 0.23566477630749844,
"grad_norm": 1.046875,
"learning_rate": 0.0004997406234826541,
"loss": 5.9552,
"mean_token_accuracy": 0.14054046496748923,
"num_tokens": 5180549.0,
"step": 2805
},
{
"entropy": 5.938224172592163,
"epoch": 0.2360848561226633,
"grad_norm": 0.97265625,
"learning_rate": 0.0004997391839828098,
"loss": 5.9206,
"mean_token_accuracy": 0.14346935153007506,
"num_tokens": 5189486.0,
"step": 2810
},
{
"entropy": 5.992739534378051,
"epoch": 0.23650493593782818,
"grad_norm": 1.046875,
"learning_rate": 0.0004997377405018266,
"loss": 5.989,
"mean_token_accuracy": 0.13284722566604615,
"num_tokens": 5198525.0,
"step": 2815
},
{
"entropy": 6.092206192016602,
"epoch": 0.23692501575299307,
"grad_norm": 1.0234375,
"learning_rate": 0.00049973629303973,
"loss": 6.0314,
"mean_token_accuracy": 0.13478757068514824,
"num_tokens": 5207124.0,
"step": 2820
},
{
"entropy": 5.945079660415649,
"epoch": 0.23734509556815794,
"grad_norm": 0.96875,
"learning_rate": 0.0004997348415965457,
"loss": 5.8623,
"mean_token_accuracy": 0.13759657815098764,
"num_tokens": 5216529.0,
"step": 2825
},
{
"entropy": 6.023407697677612,
"epoch": 0.23776517538332284,
"grad_norm": 1.09375,
"learning_rate": 0.0004997333861722995,
"loss": 6.0141,
"mean_token_accuracy": 0.13514548242092134,
"num_tokens": 5225796.0,
"step": 2830
},
{
"entropy": 6.10746054649353,
"epoch": 0.2381852551984877,
"grad_norm": 1.1015625,
"learning_rate": 0.000499731926767017,
"loss": 6.038,
"mean_token_accuracy": 0.13310380131006241,
"num_tokens": 5233876.0,
"step": 2835
},
{
"entropy": 5.989476442337036,
"epoch": 0.23860533501365258,
"grad_norm": 0.984375,
"learning_rate": 0.0004997304633807242,
"loss": 6.0249,
"mean_token_accuracy": 0.13009608685970306,
"num_tokens": 5244782.0,
"step": 2840
},
{
"entropy": 6.030221557617187,
"epoch": 0.23902541482881748,
"grad_norm": 1.078125,
"learning_rate": 0.0004997289960134468,
"loss": 5.9703,
"mean_token_accuracy": 0.1368887707591057,
"num_tokens": 5253453.0,
"step": 2845
},
{
"entropy": 5.974435091018677,
"epoch": 0.23944549464398235,
"grad_norm": 1.0859375,
"learning_rate": 0.0004997275246652111,
"loss": 5.9925,
"mean_token_accuracy": 0.13992664366960525,
"num_tokens": 5262355.0,
"step": 2850
},
{
"entropy": 5.960743093490601,
"epoch": 0.23986557445914725,
"grad_norm": 1.09375,
"learning_rate": 0.000499726049336043,
"loss": 5.9169,
"mean_token_accuracy": 0.13876279294490815,
"num_tokens": 5271959.0,
"step": 2855
},
{
"entropy": 6.026579475402832,
"epoch": 0.24028565427431212,
"grad_norm": 1.078125,
"learning_rate": 0.0004997245700259686,
"loss": 5.9378,
"mean_token_accuracy": 0.14501210153102875,
"num_tokens": 5281393.0,
"step": 2860
},
{
"entropy": 6.063703155517578,
"epoch": 0.240705734089477,
"grad_norm": 0.96484375,
"learning_rate": 0.0004997230867350141,
"loss": 6.0777,
"mean_token_accuracy": 0.13284969255328177,
"num_tokens": 5290979.0,
"step": 2865
},
{
"entropy": 6.101211261749268,
"epoch": 0.24112581390464188,
"grad_norm": 1.0078125,
"learning_rate": 0.0004997215994632059,
"loss": 6.0262,
"mean_token_accuracy": 0.13758746907114983,
"num_tokens": 5300263.0,
"step": 2870
},
{
"entropy": 6.094251537322998,
"epoch": 0.24154589371980675,
"grad_norm": 1.03125,
"learning_rate": 0.0004997201082105704,
"loss": 6.0534,
"mean_token_accuracy": 0.13286127597093583,
"num_tokens": 5309522.0,
"step": 2875
},
{
"entropy": 6.022702550888061,
"epoch": 0.24196597353497165,
"grad_norm": 0.9921875,
"learning_rate": 0.0004997186129771338,
"loss": 6.0403,
"mean_token_accuracy": 0.13587599471211434,
"num_tokens": 5319770.0,
"step": 2880
},
{
"entropy": 6.158471012115479,
"epoch": 0.24238605335013652,
"grad_norm": 1.109375,
"learning_rate": 0.0004997171137629226,
"loss": 6.0553,
"mean_token_accuracy": 0.13789832219481468,
"num_tokens": 5328400.0,
"step": 2885
},
{
"entropy": 5.906137275695801,
"epoch": 0.24280613316530142,
"grad_norm": 1.1328125,
"learning_rate": 0.0004997156105679636,
"loss": 5.8523,
"mean_token_accuracy": 0.1505324125289917,
"num_tokens": 5336338.0,
"step": 2890
},
{
"entropy": 5.937476301193238,
"epoch": 0.2432262129804663,
"grad_norm": 1.0703125,
"learning_rate": 0.0004997141033922832,
"loss": 5.9567,
"mean_token_accuracy": 0.13674451038241386,
"num_tokens": 5345391.0,
"step": 2895
},
{
"entropy": 6.064019298553466,
"epoch": 0.24364629279563116,
"grad_norm": 1.0859375,
"learning_rate": 0.0004997125922359081,
"loss": 5.9792,
"mean_token_accuracy": 0.12863812744617462,
"num_tokens": 5354709.0,
"step": 2900
},
{
"entropy": 6.019229459762573,
"epoch": 0.24406637261079606,
"grad_norm": 1.046875,
"learning_rate": 0.0004997110770988652,
"loss": 5.8953,
"mean_token_accuracy": 0.1387791097164154,
"num_tokens": 5363738.0,
"step": 2905
},
{
"entropy": 6.001317119598388,
"epoch": 0.24448645242596093,
"grad_norm": 1.1796875,
"learning_rate": 0.0004997095579811813,
"loss": 6.0193,
"mean_token_accuracy": 0.13423686176538469,
"num_tokens": 5373583.0,
"step": 2910
},
{
"entropy": 6.057038736343384,
"epoch": 0.24490653224112582,
"grad_norm": 1.0,
"learning_rate": 0.0004997080348828833,
"loss": 6.0731,
"mean_token_accuracy": 0.1342361442744732,
"num_tokens": 5383486.0,
"step": 2915
},
{
"entropy": 6.072895145416259,
"epoch": 0.2453266120562907,
"grad_norm": 1.1171875,
"learning_rate": 0.0004997065078039981,
"loss": 5.9871,
"mean_token_accuracy": 0.13158463463187217,
"num_tokens": 5391974.0,
"step": 2920
},
{
"entropy": 6.060671615600586,
"epoch": 0.24574669187145556,
"grad_norm": 1.078125,
"learning_rate": 0.0004997049767445529,
"loss": 6.0206,
"mean_token_accuracy": 0.13064797297120095,
"num_tokens": 5400882.0,
"step": 2925
},
{
"entropy": 6.072234678268432,
"epoch": 0.24616677168662046,
"grad_norm": 1.0390625,
"learning_rate": 0.0004997034417045746,
"loss": 5.9808,
"mean_token_accuracy": 0.13372990265488624,
"num_tokens": 5410538.0,
"step": 2930
},
{
"entropy": 5.924702501296997,
"epoch": 0.24658685150178533,
"grad_norm": 1.078125,
"learning_rate": 0.0004997019026840907,
"loss": 5.8715,
"mean_token_accuracy": 0.13977383449673653,
"num_tokens": 5419406.0,
"step": 2935
},
{
"entropy": 5.90452995300293,
"epoch": 0.24700693131695023,
"grad_norm": 1.0703125,
"learning_rate": 0.0004997003596831282,
"loss": 5.9778,
"mean_token_accuracy": 0.1360274873673916,
"num_tokens": 5428817.0,
"step": 2940
},
{
"entropy": 6.072986078262329,
"epoch": 0.2474270111321151,
"grad_norm": 1.046875,
"learning_rate": 0.0004996988127017145,
"loss": 6.0065,
"mean_token_accuracy": 0.135275649279356,
"num_tokens": 5438277.0,
"step": 2945
},
{
"entropy": 6.042351722717285,
"epoch": 0.24784709094728,
"grad_norm": 1.0859375,
"learning_rate": 0.0004996972617398772,
"loss": 6.0257,
"mean_token_accuracy": 0.1333914116024971,
"num_tokens": 5447440.0,
"step": 2950
},
{
"entropy": 6.086295413970947,
"epoch": 0.24826717076244487,
"grad_norm": 1.046875,
"learning_rate": 0.0004996957067976435,
"loss": 5.9563,
"mean_token_accuracy": 0.1371552027761936,
"num_tokens": 5455988.0,
"step": 2955
},
{
"entropy": 5.992445421218872,
"epoch": 0.24868725057760974,
"grad_norm": 1.0859375,
"learning_rate": 0.0004996941478750411,
"loss": 5.9653,
"mean_token_accuracy": 0.13751397803425788,
"num_tokens": 5464996.0,
"step": 2960
},
{
"entropy": 6.070774841308594,
"epoch": 0.24910733039277463,
"grad_norm": 0.953125,
"learning_rate": 0.0004996925849720975,
"loss": 6.0955,
"mean_token_accuracy": 0.1302956260740757,
"num_tokens": 5474174.0,
"step": 2965
},
{
"entropy": 6.157161235809326,
"epoch": 0.2495274102079395,
"grad_norm": 1.109375,
"learning_rate": 0.0004996910180888405,
"loss": 5.9967,
"mean_token_accuracy": 0.13579627349972725,
"num_tokens": 5482838.0,
"step": 2970
},
{
"entropy": 5.956932163238525,
"epoch": 0.2499474900231044,
"grad_norm": 1.03125,
"learning_rate": 0.0004996894472252977,
"loss": 6.0037,
"mean_token_accuracy": 0.13603818491101266,
"num_tokens": 5491616.0,
"step": 2975
},
{
"entropy": 6.029225301742554,
"epoch": 0.25036756983826924,
"grad_norm": 1.015625,
"learning_rate": 0.0004996878723814973,
"loss": 5.9869,
"mean_token_accuracy": 0.13199443742632866,
"num_tokens": 5500942.0,
"step": 2980
},
{
"entropy": 6.036972188949585,
"epoch": 0.25078764965343414,
"grad_norm": 1.0234375,
"learning_rate": 0.0004996862935574667,
"loss": 5.9337,
"mean_token_accuracy": 0.12902862578630447,
"num_tokens": 5510078.0,
"step": 2985
},
{
"entropy": 5.945563459396363,
"epoch": 0.25120772946859904,
"grad_norm": 0.99609375,
"learning_rate": 0.0004996847107532342,
"loss": 5.967,
"mean_token_accuracy": 0.13469186648726464,
"num_tokens": 5518924.0,
"step": 2990
},
{
"entropy": 6.076007747650147,
"epoch": 0.25162780928376394,
"grad_norm": 0.9765625,
"learning_rate": 0.0004996831239688277,
"loss": 5.9812,
"mean_token_accuracy": 0.13338077962398528,
"num_tokens": 5527385.0,
"step": 2995
},
{
"entropy": 5.917894792556763,
"epoch": 0.2520478890989288,
"grad_norm": 1.046875,
"learning_rate": 0.0004996815332042754,
"loss": 5.8328,
"mean_token_accuracy": 0.1404413752257824,
"num_tokens": 5536781.0,
"step": 3000
},
{
"epoch": 0.2520478890989288,
"eval_entropy": 5.782812329743307,
"eval_loss": 6.003105163574219,
"eval_mean_token_accuracy": 0.1416803571949664,
"eval_num_tokens": 5536781.0,
"eval_runtime": 27.3987,
"eval_samples_per_second": 1363.789,
"eval_steps_per_second": 170.483,
"step": 3000
},
{
"entropy": 5.977773952484131,
"epoch": 0.2524679689140937,
"grad_norm": 0.984375,
"learning_rate": 0.0004996799384596054,
"loss": 6.0179,
"mean_token_accuracy": 0.13907053023576738,
"num_tokens": 5545893.0,
"step": 3005
},
{
"entropy": 6.0576560497283936,
"epoch": 0.2528880487292586,
"grad_norm": 0.9375,
"learning_rate": 0.0004996783397348461,
"loss": 5.9442,
"mean_token_accuracy": 0.13274262547492982,
"num_tokens": 5555818.0,
"step": 3010
},
{
"entropy": 6.054492235183716,
"epoch": 0.2533081285444234,
"grad_norm": 0.9453125,
"learning_rate": 0.0004996767370300256,
"loss": 5.9341,
"mean_token_accuracy": 0.14041735008358955,
"num_tokens": 5565331.0,
"step": 3015
},
{
"entropy": 6.005804014205933,
"epoch": 0.2537282083595883,
"grad_norm": 1.078125,
"learning_rate": 0.0004996751303451724,
"loss": 5.9358,
"mean_token_accuracy": 0.14184233397245408,
"num_tokens": 5574003.0,
"step": 3020
},
{
"entropy": 5.961899328231811,
"epoch": 0.2541482881747532,
"grad_norm": 1.03125,
"learning_rate": 0.0004996735196803149,
"loss": 5.8439,
"mean_token_accuracy": 0.1449878543615341,
"num_tokens": 5582517.0,
"step": 3025
},
{
"entropy": 6.0015801906585695,
"epoch": 0.2545683679899181,
"grad_norm": 0.95703125,
"learning_rate": 0.0004996719050354818,
"loss": 6.056,
"mean_token_accuracy": 0.13052123561501502,
"num_tokens": 5591952.0,
"step": 3030
},
{
"entropy": 6.00065770149231,
"epoch": 0.25498844780508295,
"grad_norm": 0.984375,
"learning_rate": 0.0004996702864107015,
"loss": 5.9486,
"mean_token_accuracy": 0.13981510698795319,
"num_tokens": 5601460.0,
"step": 3035
},
{
"entropy": 6.187022733688354,
"epoch": 0.25540852762024785,
"grad_norm": 1.046875,
"learning_rate": 0.0004996686638060028,
"loss": 6.0716,
"mean_token_accuracy": 0.13106706812977792,
"num_tokens": 5610776.0,
"step": 3040
},
{
"entropy": 5.974349451065064,
"epoch": 0.25582860743541275,
"grad_norm": 0.96875,
"learning_rate": 0.0004996670372214144,
"loss": 5.9846,
"mean_token_accuracy": 0.13845233097672463,
"num_tokens": 5619627.0,
"step": 3045
},
{
"entropy": 5.915146541595459,
"epoch": 0.2562486872505776,
"grad_norm": 0.96875,
"learning_rate": 0.0004996654066569651,
"loss": 5.8361,
"mean_token_accuracy": 0.14477361738681793,
"num_tokens": 5628969.0,
"step": 3050
},
{
"entropy": 5.960678625106811,
"epoch": 0.2566687670657425,
"grad_norm": 1.046875,
"learning_rate": 0.0004996637721126839,
"loss": 5.9181,
"mean_token_accuracy": 0.1386953629553318,
"num_tokens": 5638629.0,
"step": 3055
},
{
"entropy": 5.996072626113891,
"epoch": 0.2570888468809074,
"grad_norm": 1.1640625,
"learning_rate": 0.0004996621335885996,
"loss": 5.9879,
"mean_token_accuracy": 0.135421983897686,
"num_tokens": 5647571.0,
"step": 3060
},
{
"entropy": 6.034391689300537,
"epoch": 0.2575089266960722,
"grad_norm": 1.3515625,
"learning_rate": 0.0004996604910847413,
"loss": 5.8941,
"mean_token_accuracy": 0.14892763271927834,
"num_tokens": 5656709.0,
"step": 3065
},
{
"entropy": 6.000552320480347,
"epoch": 0.2579290065112371,
"grad_norm": 1.03125,
"learning_rate": 0.000499658844601138,
"loss": 6.0898,
"mean_token_accuracy": 0.13416215255856515,
"num_tokens": 5665714.0,
"step": 3070
},
{
"entropy": 6.078448486328125,
"epoch": 0.258349086326402,
"grad_norm": 1.0078125,
"learning_rate": 0.000499657194137819,
"loss": 6.0433,
"mean_token_accuracy": 0.13601171597838402,
"num_tokens": 5675854.0,
"step": 3075
},
{
"entropy": 6.0375328540802,
"epoch": 0.2587691661415669,
"grad_norm": 1.078125,
"learning_rate": 0.0004996555396948136,
"loss": 5.8646,
"mean_token_accuracy": 0.1391562268137932,
"num_tokens": 5685690.0,
"step": 3080
},
{
"entropy": 5.9128001689910885,
"epoch": 0.25918924595673176,
"grad_norm": 0.9765625,
"learning_rate": 0.0004996538812721509,
"loss": 5.9151,
"mean_token_accuracy": 0.14342632815241813,
"num_tokens": 5695766.0,
"step": 3085
},
{
"entropy": 5.95283260345459,
"epoch": 0.25960932577189666,
"grad_norm": 1.1953125,
"learning_rate": 0.0004996522188698603,
"loss": 5.983,
"mean_token_accuracy": 0.13854097425937653,
"num_tokens": 5704365.0,
"step": 3090
},
{
"entropy": 6.116002321243286,
"epoch": 0.26002940558706156,
"grad_norm": 1.1796875,
"learning_rate": 0.0004996505524879714,
"loss": 6.0766,
"mean_token_accuracy": 0.13496588468551635,
"num_tokens": 5713345.0,
"step": 3095
},
{
"entropy": 6.015519475936889,
"epoch": 0.2604494854022264,
"grad_norm": 0.96875,
"learning_rate": 0.0004996488821265137,
"loss": 5.8806,
"mean_token_accuracy": 0.1452178031206131,
"num_tokens": 5722907.0,
"step": 3100
},
{
"entropy": 5.933816623687744,
"epoch": 0.2608695652173913,
"grad_norm": 1.0546875,
"learning_rate": 0.0004996472077855166,
"loss": 5.9337,
"mean_token_accuracy": 0.14104732349514962,
"num_tokens": 5731589.0,
"step": 3105
},
{
"entropy": 5.977402830123902,
"epoch": 0.2612896450325562,
"grad_norm": 1.0625,
"learning_rate": 0.00049964552946501,
"loss": 5.9115,
"mean_token_accuracy": 0.13637820407748222,
"num_tokens": 5739922.0,
"step": 3110
},
{
"entropy": 5.934371995925903,
"epoch": 0.2617097248477211,
"grad_norm": 1.03125,
"learning_rate": 0.0004996438471650235,
"loss": 5.8288,
"mean_token_accuracy": 0.14435512721538543,
"num_tokens": 5749206.0,
"step": 3115
},
{
"entropy": 5.976166868209839,
"epoch": 0.26212980466288593,
"grad_norm": 0.99609375,
"learning_rate": 0.0004996421608855869,
"loss": 5.8863,
"mean_token_accuracy": 0.142840925604105,
"num_tokens": 5758803.0,
"step": 3120
},
{
"entropy": 5.948814392089844,
"epoch": 0.26254988447805083,
"grad_norm": 1.03125,
"learning_rate": 0.0004996404706267301,
"loss": 5.984,
"mean_token_accuracy": 0.13239931613206862,
"num_tokens": 5768368.0,
"step": 3125
},
{
"entropy": 5.966429710388184,
"epoch": 0.26296996429321573,
"grad_norm": 1.1328125,
"learning_rate": 0.000499638776388483,
"loss": 5.8325,
"mean_token_accuracy": 0.14922686219215392,
"num_tokens": 5776707.0,
"step": 3130
},
{
"entropy": 5.935250091552734,
"epoch": 0.26339004410838057,
"grad_norm": 1.1015625,
"learning_rate": 0.0004996370781708757,
"loss": 6.0176,
"mean_token_accuracy": 0.13138642013072968,
"num_tokens": 5787037.0,
"step": 3135
},
{
"entropy": 6.067586517333984,
"epoch": 0.26381012392354547,
"grad_norm": 1.015625,
"learning_rate": 0.0004996353759739382,
"loss": 5.9517,
"mean_token_accuracy": 0.14147150069475173,
"num_tokens": 5796630.0,
"step": 3140
},
{
"entropy": 5.96584677696228,
"epoch": 0.26423020373871037,
"grad_norm": 1.0859375,
"learning_rate": 0.0004996336697977007,
"loss": 5.9623,
"mean_token_accuracy": 0.13759462237358094,
"num_tokens": 5806402.0,
"step": 3145
},
{
"entropy": 5.93000898361206,
"epoch": 0.2646502835538752,
"grad_norm": 1.0703125,
"learning_rate": 0.0004996319596421933,
"loss": 5.9198,
"mean_token_accuracy": 0.14076491743326186,
"num_tokens": 5815742.0,
"step": 3150
},
{
"entropy": 5.965633296966553,
"epoch": 0.2650703633690401,
"grad_norm": 1.0,
"learning_rate": 0.0004996302455074466,
"loss": 5.9267,
"mean_token_accuracy": 0.13825056850910186,
"num_tokens": 5824915.0,
"step": 3155
},
{
"entropy": 6.006742715835571,
"epoch": 0.265490443184205,
"grad_norm": 0.98828125,
"learning_rate": 0.0004996285273934906,
"loss": 5.9521,
"mean_token_accuracy": 0.13590887412428856,
"num_tokens": 5834978.0,
"step": 3160
},
{
"entropy": 6.0410035133361815,
"epoch": 0.2659105229993699,
"grad_norm": 0.94921875,
"learning_rate": 0.000499626805300356,
"loss": 6.1124,
"mean_token_accuracy": 0.13741038590669633,
"num_tokens": 5845684.0,
"step": 3165
},
{
"entropy": 6.130275392532349,
"epoch": 0.26633060281453474,
"grad_norm": 1.0546875,
"learning_rate": 0.0004996250792280732,
"loss": 5.9976,
"mean_token_accuracy": 0.1319746233522892,
"num_tokens": 5854905.0,
"step": 3170
},
{
"entropy": 5.999055051803589,
"epoch": 0.26675068262969964,
"grad_norm": 1.0703125,
"learning_rate": 0.0004996233491766727,
"loss": 5.9884,
"mean_token_accuracy": 0.13657371699810028,
"num_tokens": 5863654.0,
"step": 3175
},
{
"entropy": 6.02589430809021,
"epoch": 0.26717076244486454,
"grad_norm": 1.0859375,
"learning_rate": 0.0004996216151461854,
"loss": 6.0131,
"mean_token_accuracy": 0.13810913935303687,
"num_tokens": 5872442.0,
"step": 3180
},
{
"entropy": 6.0054370880126955,
"epoch": 0.2675908422600294,
"grad_norm": 1.046875,
"learning_rate": 0.0004996198771366417,
"loss": 5.917,
"mean_token_accuracy": 0.14127852842211724,
"num_tokens": 5882372.0,
"step": 3185
},
{
"entropy": 5.776381587982177,
"epoch": 0.2680109220751943,
"grad_norm": 1.03125,
"learning_rate": 0.0004996181351480726,
"loss": 5.7292,
"mean_token_accuracy": 0.14581410139799117,
"num_tokens": 5891113.0,
"step": 3190
},
{
"entropy": 5.928227233886719,
"epoch": 0.2684310018903592,
"grad_norm": 1.03125,
"learning_rate": 0.0004996163891805089,
"loss": 5.9689,
"mean_token_accuracy": 0.14188458919525146,
"num_tokens": 5899582.0,
"step": 3195
},
{
"entropy": 6.024863529205322,
"epoch": 0.2688510817055241,
"grad_norm": 1.0234375,
"learning_rate": 0.0004996146392339815,
"loss": 5.9231,
"mean_token_accuracy": 0.13705718368291855,
"num_tokens": 5908938.0,
"step": 3200
},
{
"entropy": 5.970179176330566,
"epoch": 0.2692711615206889,
"grad_norm": 1.03125,
"learning_rate": 0.0004996128853085215,
"loss": 5.8847,
"mean_token_accuracy": 0.13865636065602302,
"num_tokens": 5918055.0,
"step": 3205
},
{
"entropy": 6.023169231414795,
"epoch": 0.2696912413358538,
"grad_norm": 1.0,
"learning_rate": 0.0004996111274041598,
"loss": 5.8801,
"mean_token_accuracy": 0.13727672547101974,
"num_tokens": 5926744.0,
"step": 3210
},
{
"entropy": 5.933250093460083,
"epoch": 0.2701113211510187,
"grad_norm": 0.9375,
"learning_rate": 0.0004996093655209277,
"loss": 5.9569,
"mean_token_accuracy": 0.13804723769426347,
"num_tokens": 5936521.0,
"step": 3215
},
{
"entropy": 6.115011072158813,
"epoch": 0.27053140096618356,
"grad_norm": 1.046875,
"learning_rate": 0.0004996075996588563,
"loss": 6.0278,
"mean_token_accuracy": 0.13099832609295844,
"num_tokens": 5945010.0,
"step": 3220
},
{
"entropy": 5.968009567260742,
"epoch": 0.27095148078134845,
"grad_norm": 1.046875,
"learning_rate": 0.000499605829817977,
"loss": 5.9283,
"mean_token_accuracy": 0.14273589625954627,
"num_tokens": 5953766.0,
"step": 3225
},
{
"entropy": 5.991914701461792,
"epoch": 0.27137156059651335,
"grad_norm": 1.0078125,
"learning_rate": 0.000499604055998321,
"loss": 5.854,
"mean_token_accuracy": 0.14187208637595178,
"num_tokens": 5962168.0,
"step": 3230
},
{
"entropy": 5.884540939331055,
"epoch": 0.2717916404116782,
"grad_norm": 0.94921875,
"learning_rate": 0.0004996022781999198,
"loss": 5.8717,
"mean_token_accuracy": 0.14200249686837196,
"num_tokens": 5971627.0,
"step": 3235
},
{
"entropy": 5.9543530464172365,
"epoch": 0.2722117202268431,
"grad_norm": 1.0390625,
"learning_rate": 0.000499600496422805,
"loss": 5.9681,
"mean_token_accuracy": 0.13829579949378967,
"num_tokens": 5981775.0,
"step": 3240
},
{
"entropy": 5.955832004547119,
"epoch": 0.272631800042008,
"grad_norm": 1.03125,
"learning_rate": 0.000499598710667008,
"loss": 5.8916,
"mean_token_accuracy": 0.13946632146835328,
"num_tokens": 5991097.0,
"step": 3245
},
{
"entropy": 5.936240339279175,
"epoch": 0.2730518798571729,
"grad_norm": 1.078125,
"learning_rate": 0.0004995969209325604,
"loss": 5.9431,
"mean_token_accuracy": 0.13132257238030434,
"num_tokens": 5999517.0,
"step": 3250
},
{
"entropy": 5.978598117828369,
"epoch": 0.2734719596723377,
"grad_norm": 1.0078125,
"learning_rate": 0.0004995951272194941,
"loss": 5.9235,
"mean_token_accuracy": 0.13358060643076897,
"num_tokens": 6008545.0,
"step": 3255
},
{
"entropy": 6.072739219665527,
"epoch": 0.2738920394875026,
"grad_norm": 1.0625,
"learning_rate": 0.0004995933295278407,
"loss": 5.9109,
"mean_token_accuracy": 0.1385839842259884,
"num_tokens": 6017366.0,
"step": 3260
},
{
"entropy": 5.855804061889648,
"epoch": 0.2743121193026675,
"grad_norm": 1.109375,
"learning_rate": 0.0004995915278576321,
"loss": 5.8665,
"mean_token_accuracy": 0.14388442635536194,
"num_tokens": 6025597.0,
"step": 3265
},
{
"entropy": 6.032962656021118,
"epoch": 0.27473219911783237,
"grad_norm": 0.99609375,
"learning_rate": 0.0004995897222089004,
"loss": 5.9639,
"mean_token_accuracy": 0.14044651240110398,
"num_tokens": 6034239.0,
"step": 3270
},
{
"entropy": 6.106841373443603,
"epoch": 0.27515227893299726,
"grad_norm": 1.03125,
"learning_rate": 0.0004995879125816772,
"loss": 6.0055,
"mean_token_accuracy": 0.13598284870386124,
"num_tokens": 6043837.0,
"step": 3275
},
{
"entropy": 5.915260076522827,
"epoch": 0.27557235874816216,
"grad_norm": 0.94921875,
"learning_rate": 0.0004995860989759949,
"loss": 5.941,
"mean_token_accuracy": 0.14384470880031586,
"num_tokens": 6053217.0,
"step": 3280
},
{
"entropy": 6.076294040679931,
"epoch": 0.27599243856332706,
"grad_norm": 1.0859375,
"learning_rate": 0.0004995842813918855,
"loss": 5.9613,
"mean_token_accuracy": 0.13819462284445763,
"num_tokens": 6061553.0,
"step": 3285
},
{
"entropy": 5.935478210449219,
"epoch": 0.2764125183784919,
"grad_norm": 1.125,
"learning_rate": 0.0004995824598293812,
"loss": 5.8427,
"mean_token_accuracy": 0.14168552458286285,
"num_tokens": 6070080.0,
"step": 3290
},
{
"entropy": 6.007778882980347,
"epoch": 0.2768325981936568,
"grad_norm": 1.0,
"learning_rate": 0.0004995806342885142,
"loss": 5.9809,
"mean_token_accuracy": 0.14499464854598046,
"num_tokens": 6078438.0,
"step": 3295
},
{
"entropy": 6.007369041442871,
"epoch": 0.2772526780088217,
"grad_norm": 1.1015625,
"learning_rate": 0.000499578804769317,
"loss": 5.9557,
"mean_token_accuracy": 0.1349240206182003,
"num_tokens": 6087794.0,
"step": 3300
},
{
"entropy": 5.998343896865845,
"epoch": 0.27767275782398654,
"grad_norm": 0.984375,
"learning_rate": 0.0004995769712718218,
"loss": 5.9682,
"mean_token_accuracy": 0.13966120928525924,
"num_tokens": 6096709.0,
"step": 3305
},
{
"entropy": 5.972318410873413,
"epoch": 0.27809283763915144,
"grad_norm": 1.0859375,
"learning_rate": 0.0004995751337960613,
"loss": 5.8993,
"mean_token_accuracy": 0.1387478418648243,
"num_tokens": 6105866.0,
"step": 3310
},
{
"entropy": 5.972552490234375,
"epoch": 0.27851291745431633,
"grad_norm": 1.0625,
"learning_rate": 0.0004995732923420679,
"loss": 5.8559,
"mean_token_accuracy": 0.14280831515789033,
"num_tokens": 6114882.0,
"step": 3315
},
{
"entropy": 5.830796527862549,
"epoch": 0.2789329972694812,
"grad_norm": 1.0390625,
"learning_rate": 0.0004995714469098743,
"loss": 5.8455,
"mean_token_accuracy": 0.13632447943091391,
"num_tokens": 6123978.0,
"step": 3320
},
{
"entropy": 5.958771753311157,
"epoch": 0.2793530770846461,
"grad_norm": 1.0078125,
"learning_rate": 0.000499569597499513,
"loss": 5.9729,
"mean_token_accuracy": 0.13917672261595726,
"num_tokens": 6133246.0,
"step": 3325
},
{
"entropy": 5.980595827102661,
"epoch": 0.27977315689981097,
"grad_norm": 0.94921875,
"learning_rate": 0.0004995677441110172,
"loss": 5.8311,
"mean_token_accuracy": 0.14128057211637496,
"num_tokens": 6142865.0,
"step": 3330
},
{
"entropy": 5.997343969345093,
"epoch": 0.28019323671497587,
"grad_norm": 1.0390625,
"learning_rate": 0.0004995658867444192,
"loss": 5.9328,
"mean_token_accuracy": 0.1337972767651081,
"num_tokens": 6152492.0,
"step": 3335
},
{
"entropy": 5.931694602966308,
"epoch": 0.2806133165301407,
"grad_norm": 1.0625,
"learning_rate": 0.0004995640253997523,
"loss": 5.9348,
"mean_token_accuracy": 0.13617469519376754,
"num_tokens": 6161953.0,
"step": 3340
},
{
"entropy": 5.863255500793457,
"epoch": 0.2810333963453056,
"grad_norm": 0.96875,
"learning_rate": 0.0004995621600770492,
"loss": 5.7754,
"mean_token_accuracy": 0.14608248248696326,
"num_tokens": 6171467.0,
"step": 3345
},
{
"entropy": 5.920885229110718,
"epoch": 0.2814534761604705,
"grad_norm": 0.9453125,
"learning_rate": 0.0004995602907763431,
"loss": 5.8645,
"mean_token_accuracy": 0.13830695673823357,
"num_tokens": 6180646.0,
"step": 3350
},
{
"entropy": 5.936646366119385,
"epoch": 0.28187355597563535,
"grad_norm": 1.1171875,
"learning_rate": 0.0004995584174976672,
"loss": 5.8904,
"mean_token_accuracy": 0.13721169158816338,
"num_tokens": 6189832.0,
"step": 3355
},
{
"entropy": 5.9191412925720215,
"epoch": 0.28229363579080025,
"grad_norm": 1.03125,
"learning_rate": 0.0004995565402410544,
"loss": 5.7818,
"mean_token_accuracy": 0.148978391289711,
"num_tokens": 6198339.0,
"step": 3360
},
{
"entropy": 5.912237596511841,
"epoch": 0.28271371560596514,
"grad_norm": 1.21875,
"learning_rate": 0.0004995546590065383,
"loss": 5.8655,
"mean_token_accuracy": 0.1437646083533764,
"num_tokens": 6207564.0,
"step": 3365
},
{
"entropy": 5.889131784439087,
"epoch": 0.28313379542113004,
"grad_norm": 1.0546875,
"learning_rate": 0.0004995527737941518,
"loss": 5.9398,
"mean_token_accuracy": 0.13947484493255616,
"num_tokens": 6216056.0,
"step": 3370
},
{
"entropy": 5.946989393234253,
"epoch": 0.2835538752362949,
"grad_norm": 1.015625,
"learning_rate": 0.0004995508846039287,
"loss": 5.8891,
"mean_token_accuracy": 0.14209089279174805,
"num_tokens": 6225573.0,
"step": 3375
},
{
"entropy": 6.02515664100647,
"epoch": 0.2839739550514598,
"grad_norm": 1.0625,
"learning_rate": 0.0004995489914359023,
"loss": 6.0174,
"mean_token_accuracy": 0.1341426320374012,
"num_tokens": 6235057.0,
"step": 3380
},
{
"entropy": 6.066995286941529,
"epoch": 0.2843940348666247,
"grad_norm": 1.078125,
"learning_rate": 0.0004995470942901061,
"loss": 5.9375,
"mean_token_accuracy": 0.13986450731754302,
"num_tokens": 6244164.0,
"step": 3385
},
{
"entropy": 5.99746265411377,
"epoch": 0.2848141146817895,
"grad_norm": 1.125,
"learning_rate": 0.0004995451931665738,
"loss": 5.9431,
"mean_token_accuracy": 0.1334978774189949,
"num_tokens": 6253095.0,
"step": 3390
},
{
"entropy": 5.890120506286621,
"epoch": 0.2852341944969544,
"grad_norm": 1.0546875,
"learning_rate": 0.000499543288065339,
"loss": 5.8901,
"mean_token_accuracy": 0.1397278554737568,
"num_tokens": 6261134.0,
"step": 3395
},
{
"entropy": 5.932734107971191,
"epoch": 0.2856542743121193,
"grad_norm": 1.1015625,
"learning_rate": 0.0004995413789864354,
"loss": 5.8767,
"mean_token_accuracy": 0.14287681505084038,
"num_tokens": 6270384.0,
"step": 3400
},
{
"entropy": 5.902149295806884,
"epoch": 0.28607435412728416,
"grad_norm": 0.98046875,
"learning_rate": 0.0004995394659298971,
"loss": 5.8266,
"mean_token_accuracy": 0.14894422441720961,
"num_tokens": 6279702.0,
"step": 3405
},
{
"entropy": 5.952029037475586,
"epoch": 0.28649443394244906,
"grad_norm": 1.0078125,
"learning_rate": 0.0004995375488957576,
"loss": 5.8642,
"mean_token_accuracy": 0.14061814397573472,
"num_tokens": 6288297.0,
"step": 3410
},
{
"entropy": 5.962431287765503,
"epoch": 0.28691451375761395,
"grad_norm": 1.0234375,
"learning_rate": 0.000499535627884051,
"loss": 5.9584,
"mean_token_accuracy": 0.13575505912303926,
"num_tokens": 6297288.0,
"step": 3415
},
{
"entropy": 6.059740591049194,
"epoch": 0.28733459357277885,
"grad_norm": 1.0078125,
"learning_rate": 0.0004995337028948115,
"loss": 5.9824,
"mean_token_accuracy": 0.1375635452568531,
"num_tokens": 6306719.0,
"step": 3420
},
{
"entropy": 5.898637819290161,
"epoch": 0.2877546733879437,
"grad_norm": 1.0703125,
"learning_rate": 0.0004995317739280731,
"loss": 5.8103,
"mean_token_accuracy": 0.14930489808320999,
"num_tokens": 6316639.0,
"step": 3425
},
{
"entropy": 5.959413146972656,
"epoch": 0.2881747532031086,
"grad_norm": 1.03125,
"learning_rate": 0.0004995298409838699,
"loss": 5.9312,
"mean_token_accuracy": 0.14022086784243584,
"num_tokens": 6326879.0,
"step": 3430
},
{
"entropy": 5.916826486587524,
"epoch": 0.2885948330182735,
"grad_norm": 0.984375,
"learning_rate": 0.000499527904062236,
"loss": 5.8308,
"mean_token_accuracy": 0.14133042842149734,
"num_tokens": 6335729.0,
"step": 3435
},
{
"entropy": 5.9654303073883055,
"epoch": 0.28901491283343833,
"grad_norm": 0.96484375,
"learning_rate": 0.0004995259631632061,
"loss": 5.9046,
"mean_token_accuracy": 0.13657819852232933,
"num_tokens": 6345154.0,
"step": 3440
},
{
"entropy": 5.953763580322265,
"epoch": 0.28943499264860323,
"grad_norm": 1.0,
"learning_rate": 0.0004995240182868143,
"loss": 5.8628,
"mean_token_accuracy": 0.1393618740141392,
"num_tokens": 6354309.0,
"step": 3445
},
{
"entropy": 5.8329935550689695,
"epoch": 0.2898550724637681,
"grad_norm": 0.953125,
"learning_rate": 0.0004995220694330951,
"loss": 5.8319,
"mean_token_accuracy": 0.1423378512263298,
"num_tokens": 6363389.0,
"step": 3450
},
{
"entropy": 5.91787896156311,
"epoch": 0.290275152278933,
"grad_norm": 0.9921875,
"learning_rate": 0.0004995201166020832,
"loss": 5.8932,
"mean_token_accuracy": 0.13977494835853577,
"num_tokens": 6372475.0,
"step": 3455
},
{
"entropy": 6.0139179706573485,
"epoch": 0.29069523209409787,
"grad_norm": 1.0546875,
"learning_rate": 0.000499518159793813,
"loss": 5.8548,
"mean_token_accuracy": 0.13958276361227034,
"num_tokens": 6380906.0,
"step": 3460
},
{
"entropy": 5.906954050064087,
"epoch": 0.29111531190926276,
"grad_norm": 1.0390625,
"learning_rate": 0.000499516199008319,
"loss": 5.8626,
"mean_token_accuracy": 0.1415554866194725,
"num_tokens": 6390085.0,
"step": 3465
},
{
"entropy": 5.950432682037354,
"epoch": 0.29153539172442766,
"grad_norm": 1.0859375,
"learning_rate": 0.0004995142342456364,
"loss": 5.9291,
"mean_token_accuracy": 0.1387101523578167,
"num_tokens": 6399441.0,
"step": 3470
},
{
"entropy": 6.014185523986816,
"epoch": 0.2919554715395925,
"grad_norm": 1.0234375,
"learning_rate": 0.0004995122655057997,
"loss": 5.9967,
"mean_token_accuracy": 0.1390436626970768,
"num_tokens": 6408995.0,
"step": 3475
},
{
"entropy": 5.861843252182007,
"epoch": 0.2923755513547574,
"grad_norm": 1.0390625,
"learning_rate": 0.0004995102927888437,
"loss": 5.7559,
"mean_token_accuracy": 0.15259871631860733,
"num_tokens": 6418080.0,
"step": 3480
},
{
"entropy": 5.965292978286743,
"epoch": 0.2927956311699223,
"grad_norm": 1.140625,
"learning_rate": 0.0004995083160948036,
"loss": 5.9191,
"mean_token_accuracy": 0.13854052796959876,
"num_tokens": 6426732.0,
"step": 3485
},
{
"entropy": 5.914570665359497,
"epoch": 0.29321571098508714,
"grad_norm": 1.046875,
"learning_rate": 0.0004995063354237141,
"loss": 5.9308,
"mean_token_accuracy": 0.14200378581881523,
"num_tokens": 6435957.0,
"step": 3490
},
{
"entropy": 5.950191402435303,
"epoch": 0.29363579080025204,
"grad_norm": 1.1875,
"learning_rate": 0.0004995043507756107,
"loss": 5.879,
"mean_token_accuracy": 0.138716172426939,
"num_tokens": 6445642.0,
"step": 3495
},
{
"entropy": 5.972989511489868,
"epoch": 0.29405587061541694,
"grad_norm": 1.109375,
"learning_rate": 0.0004995023621505282,
"loss": 5.9175,
"mean_token_accuracy": 0.13968884497880935,
"num_tokens": 6454664.0,
"step": 3500
},
{
"entropy": 5.872178792953491,
"epoch": 0.29447595043058183,
"grad_norm": 1.0546875,
"learning_rate": 0.000499500369548502,
"loss": 5.8333,
"mean_token_accuracy": 0.1440023772418499,
"num_tokens": 6463224.0,
"step": 3505
},
{
"entropy": 6.124553251266479,
"epoch": 0.2948960302457467,
"grad_norm": 0.97265625,
"learning_rate": 0.0004994983729695674,
"loss": 6.0621,
"mean_token_accuracy": 0.13571332320570945,
"num_tokens": 6473112.0,
"step": 3510
},
{
"entropy": 5.9443089962005615,
"epoch": 0.2953161100609116,
"grad_norm": 1.1796875,
"learning_rate": 0.0004994963724137595,
"loss": 5.902,
"mean_token_accuracy": 0.14207390323281288,
"num_tokens": 6482062.0,
"step": 3515
},
{
"entropy": 5.893982458114624,
"epoch": 0.29573618987607647,
"grad_norm": 1.1484375,
"learning_rate": 0.0004994943678811142,
"loss": 5.8777,
"mean_token_accuracy": 0.1399894528090954,
"num_tokens": 6490568.0,
"step": 3520
},
{
"entropy": 5.980154275894165,
"epoch": 0.2961562696912413,
"grad_norm": 0.9921875,
"learning_rate": 0.0004994923593716667,
"loss": 5.9442,
"mean_token_accuracy": 0.14206611216068268,
"num_tokens": 6500815.0,
"step": 3525
},
{
"entropy": 5.961334371566773,
"epoch": 0.2965763495064062,
"grad_norm": 1.0390625,
"learning_rate": 0.0004994903468854527,
"loss": 5.8139,
"mean_token_accuracy": 0.1528165690600872,
"num_tokens": 6509529.0,
"step": 3530
},
{
"entropy": 5.880562591552734,
"epoch": 0.2969964293215711,
"grad_norm": 1.0546875,
"learning_rate": 0.0004994883304225077,
"loss": 5.8963,
"mean_token_accuracy": 0.1385498009622097,
"num_tokens": 6517934.0,
"step": 3535
},
{
"entropy": 5.994669198989868,
"epoch": 0.297416509136736,
"grad_norm": 1.0078125,
"learning_rate": 0.0004994863099828675,
"loss": 5.8493,
"mean_token_accuracy": 0.1401224449276924,
"num_tokens": 6526098.0,
"step": 3540
},
{
"entropy": 5.8993443012237545,
"epoch": 0.29783658895190085,
"grad_norm": 1.0546875,
"learning_rate": 0.000499484285566568,
"loss": 5.8767,
"mean_token_accuracy": 0.13881718441843988,
"num_tokens": 6535831.0,
"step": 3545
},
{
"entropy": 5.915220880508423,
"epoch": 0.29825666876706575,
"grad_norm": 0.98046875,
"learning_rate": 0.0004994822571736449,
"loss": 5.797,
"mean_token_accuracy": 0.13917236104607583,
"num_tokens": 6545704.0,
"step": 3550
},
{
"entropy": 5.904978036880493,
"epoch": 0.29867674858223064,
"grad_norm": 1.109375,
"learning_rate": 0.0004994802248041342,
"loss": 5.8276,
"mean_token_accuracy": 0.1413833126425743,
"num_tokens": 6554423.0,
"step": 3555
},
{
"entropy": 5.953101110458374,
"epoch": 0.2990968283973955,
"grad_norm": 1.0703125,
"learning_rate": 0.000499478188458072,
"loss": 5.8845,
"mean_token_accuracy": 0.14304101467132568,
"num_tokens": 6563989.0,
"step": 3560
},
{
"entropy": 5.932208251953125,
"epoch": 0.2995169082125604,
"grad_norm": 1.1796875,
"learning_rate": 0.0004994761481354943,
"loss": 6.0148,
"mean_token_accuracy": 0.13859598264098166,
"num_tokens": 6572745.0,
"step": 3565
},
{
"entropy": 6.1440201759338375,
"epoch": 0.2999369880277253,
"grad_norm": 1.046875,
"learning_rate": 0.0004994741038364371,
"loss": 6.0089,
"mean_token_accuracy": 0.136870276927948,
"num_tokens": 6581723.0,
"step": 3570
},
{
"entropy": 5.866305208206176,
"epoch": 0.3003570678428901,
"grad_norm": 1.0703125,
"learning_rate": 0.0004994720555609369,
"loss": 5.7375,
"mean_token_accuracy": 0.14511004835367203,
"num_tokens": 6590342.0,
"step": 3575
},
{
"entropy": 5.784994506835938,
"epoch": 0.300777147658055,
"grad_norm": 1.1875,
"learning_rate": 0.0004994700033090297,
"loss": 5.7982,
"mean_token_accuracy": 0.1514626145362854,
"num_tokens": 6599206.0,
"step": 3580
},
{
"entropy": 6.044550800323487,
"epoch": 0.3011972274732199,
"grad_norm": 1.0859375,
"learning_rate": 0.000499467947080752,
"loss": 6.1055,
"mean_token_accuracy": 0.13087693974375725,
"num_tokens": 6608947.0,
"step": 3585
},
{
"entropy": 5.973358917236328,
"epoch": 0.3016173072883848,
"grad_norm": 1.0625,
"learning_rate": 0.0004994658868761402,
"loss": 5.8986,
"mean_token_accuracy": 0.14760937988758088,
"num_tokens": 6618378.0,
"step": 3590
},
{
"entropy": 6.014848804473877,
"epoch": 0.30203738710354966,
"grad_norm": 1.09375,
"learning_rate": 0.0004994638226952307,
"loss": 5.9391,
"mean_token_accuracy": 0.13312117457389833,
"num_tokens": 6627527.0,
"step": 3595
},
{
"entropy": 6.01325945854187,
"epoch": 0.30245746691871456,
"grad_norm": 1.0546875,
"learning_rate": 0.0004994617545380604,
"loss": 5.8769,
"mean_token_accuracy": 0.14055330455303192,
"num_tokens": 6636964.0,
"step": 3600
},
{
"entropy": 5.821473407745361,
"epoch": 0.30287754673387945,
"grad_norm": 1.1171875,
"learning_rate": 0.0004994596824046656,
"loss": 5.8354,
"mean_token_accuracy": 0.14282228872179986,
"num_tokens": 6646074.0,
"step": 3605
},
{
"entropy": 5.973945283889771,
"epoch": 0.3032976265490443,
"grad_norm": 1.03125,
"learning_rate": 0.000499457606295083,
"loss": 5.9055,
"mean_token_accuracy": 0.13781704679131507,
"num_tokens": 6655027.0,
"step": 3610
},
{
"entropy": 5.746331119537354,
"epoch": 0.3037177063642092,
"grad_norm": 1.15625,
"learning_rate": 0.0004994555262093495,
"loss": 5.699,
"mean_token_accuracy": 0.15337992906570436,
"num_tokens": 6663747.0,
"step": 3615
},
{
"entropy": 6.0847930908203125,
"epoch": 0.3041377861793741,
"grad_norm": 1.1015625,
"learning_rate": 0.000499453442147502,
"loss": 6.0169,
"mean_token_accuracy": 0.1323746047914028,
"num_tokens": 6672922.0,
"step": 3620
},
{
"entropy": 5.927127981185913,
"epoch": 0.304557865994539,
"grad_norm": 1.0546875,
"learning_rate": 0.0004994513541095773,
"loss": 5.8436,
"mean_token_accuracy": 0.1500074289739132,
"num_tokens": 6682233.0,
"step": 3625
},
{
"entropy": 5.871771383285522,
"epoch": 0.30497794580970383,
"grad_norm": 1.0625,
"learning_rate": 0.0004994492620956126,
"loss": 5.8909,
"mean_token_accuracy": 0.14380869269371033,
"num_tokens": 6691593.0,
"step": 3630
},
{
"entropy": 5.89651346206665,
"epoch": 0.30539802562486873,
"grad_norm": 0.98046875,
"learning_rate": 0.0004994471661056445,
"loss": 5.8834,
"mean_token_accuracy": 0.14558402746915816,
"num_tokens": 6701318.0,
"step": 3635
},
{
"entropy": 6.059045791625977,
"epoch": 0.3058181054400336,
"grad_norm": 0.9921875,
"learning_rate": 0.0004994450661397106,
"loss": 5.8977,
"mean_token_accuracy": 0.14569492712616922,
"num_tokens": 6710059.0,
"step": 3640
},
{
"entropy": 6.073701477050781,
"epoch": 0.30623818525519847,
"grad_norm": 1.0078125,
"learning_rate": 0.000499442962197848,
"loss": 5.9894,
"mean_token_accuracy": 0.1346615768969059,
"num_tokens": 6719811.0,
"step": 3645
},
{
"entropy": 5.866523504257202,
"epoch": 0.30665826507036337,
"grad_norm": 0.9921875,
"learning_rate": 0.0004994408542800937,
"loss": 5.8596,
"mean_token_accuracy": 0.14546327590942382,
"num_tokens": 6728789.0,
"step": 3650
},
{
"entropy": 5.874834203720093,
"epoch": 0.30707834488552826,
"grad_norm": 1.0703125,
"learning_rate": 0.0004994387423864855,
"loss": 5.8507,
"mean_token_accuracy": 0.13754900097846984,
"num_tokens": 6737706.0,
"step": 3655
},
{
"entropy": 5.860865497589112,
"epoch": 0.3074984247006931,
"grad_norm": 1.0625,
"learning_rate": 0.0004994366265170603,
"loss": 5.7969,
"mean_token_accuracy": 0.15615564733743667,
"num_tokens": 6746861.0,
"step": 3660
},
{
"entropy": 6.020911169052124,
"epoch": 0.307918504515858,
"grad_norm": 1.1484375,
"learning_rate": 0.0004994345066718558,
"loss": 6.0006,
"mean_token_accuracy": 0.1347304418683052,
"num_tokens": 6755242.0,
"step": 3665
},
{
"entropy": 6.019271802902222,
"epoch": 0.3083385843310229,
"grad_norm": 1.046875,
"learning_rate": 0.0004994323828509098,
"loss": 5.9331,
"mean_token_accuracy": 0.1355019263923168,
"num_tokens": 6764549.0,
"step": 3670
},
{
"entropy": 5.885806322097778,
"epoch": 0.3087586641461878,
"grad_norm": 1.140625,
"learning_rate": 0.0004994302550542596,
"loss": 5.917,
"mean_token_accuracy": 0.1445144943892956,
"num_tokens": 6774123.0,
"step": 3675
},
{
"entropy": 5.832986640930176,
"epoch": 0.30917874396135264,
"grad_norm": 1.140625,
"learning_rate": 0.000499428123281943,
"loss": 5.6888,
"mean_token_accuracy": 0.150942887365818,
"num_tokens": 6782922.0,
"step": 3680
},
{
"entropy": 5.9098718643188475,
"epoch": 0.30959882377651754,
"grad_norm": 1.046875,
"learning_rate": 0.0004994259875339978,
"loss": 5.9422,
"mean_token_accuracy": 0.1405124768614769,
"num_tokens": 6792042.0,
"step": 3685
},
{
"entropy": 6.074847984313965,
"epoch": 0.31001890359168244,
"grad_norm": 1.1328125,
"learning_rate": 0.0004994238478104617,
"loss": 5.9376,
"mean_token_accuracy": 0.1440422020852566,
"num_tokens": 6800994.0,
"step": 3690
},
{
"entropy": 5.9522175788879395,
"epoch": 0.3104389834068473,
"grad_norm": 1.0234375,
"learning_rate": 0.0004994217041113727,
"loss": 5.8718,
"mean_token_accuracy": 0.14600413292646408,
"num_tokens": 6809938.0,
"step": 3695
},
{
"entropy": 5.933554553985596,
"epoch": 0.3108590632220122,
"grad_norm": 0.98046875,
"learning_rate": 0.0004994195564367688,
"loss": 5.9991,
"mean_token_accuracy": 0.13622643798589706,
"num_tokens": 6820289.0,
"step": 3700
},
{
"entropy": 5.9958281993865965,
"epoch": 0.3112791430371771,
"grad_norm": 1.09375,
"learning_rate": 0.0004994174047866882,
"loss": 5.822,
"mean_token_accuracy": 0.14527520388364792,
"num_tokens": 6830068.0,
"step": 3705
},
{
"entropy": 5.786840772628784,
"epoch": 0.3116992228523419,
"grad_norm": 1.09375,
"learning_rate": 0.0004994152491611686,
"loss": 5.8625,
"mean_token_accuracy": 0.14006299674510955,
"num_tokens": 6838591.0,
"step": 3710
},
{
"entropy": 5.904484987258911,
"epoch": 0.3121193026675068,
"grad_norm": 1.0234375,
"learning_rate": 0.0004994130895602485,
"loss": 5.8153,
"mean_token_accuracy": 0.14202122390270233,
"num_tokens": 6847796.0,
"step": 3715
},
{
"entropy": 6.0328771591186525,
"epoch": 0.3125393824826717,
"grad_norm": 1.015625,
"learning_rate": 0.000499410925983966,
"loss": 5.9232,
"mean_token_accuracy": 0.1450626164674759,
"num_tokens": 6856585.0,
"step": 3720
},
{
"entropy": 5.84233341217041,
"epoch": 0.3129594622978366,
"grad_norm": 1.109375,
"learning_rate": 0.0004994087584323596,
"loss": 5.8286,
"mean_token_accuracy": 0.1493520975112915,
"num_tokens": 6865757.0,
"step": 3725
},
{
"entropy": 5.8724321842193605,
"epoch": 0.31337954211300145,
"grad_norm": 1.0390625,
"learning_rate": 0.0004994065869054676,
"loss": 5.8674,
"mean_token_accuracy": 0.13832397535443305,
"num_tokens": 6875371.0,
"step": 3730
},
{
"entropy": 5.988558006286621,
"epoch": 0.31379962192816635,
"grad_norm": 1.109375,
"learning_rate": 0.0004994044114033283,
"loss": 5.9369,
"mean_token_accuracy": 0.1350552909076214,
"num_tokens": 6884050.0,
"step": 3735
},
{
"entropy": 5.992891883850097,
"epoch": 0.31421970174333125,
"grad_norm": 1.1953125,
"learning_rate": 0.0004994022319259806,
"loss": 5.889,
"mean_token_accuracy": 0.14552320092916488,
"num_tokens": 6893079.0,
"step": 3740
},
{
"entropy": 5.952337551116943,
"epoch": 0.3146397815584961,
"grad_norm": 1.0625,
"learning_rate": 0.0004994000484734629,
"loss": 5.995,
"mean_token_accuracy": 0.14312526509165763,
"num_tokens": 6903100.0,
"step": 3745
},
{
"entropy": 5.9438494682312015,
"epoch": 0.315059861373661,
"grad_norm": 0.984375,
"learning_rate": 0.0004993978610458137,
"loss": 5.8256,
"mean_token_accuracy": 0.14954894483089448,
"num_tokens": 6912164.0,
"step": 3750
},
{
"entropy": 5.8575239181518555,
"epoch": 0.3154799411888259,
"grad_norm": 1.0546875,
"learning_rate": 0.0004993956696430721,
"loss": 5.8486,
"mean_token_accuracy": 0.14094497933983802,
"num_tokens": 6921183.0,
"step": 3755
},
{
"entropy": 6.00494794845581,
"epoch": 0.3159000210039908,
"grad_norm": 0.98046875,
"learning_rate": 0.0004993934742652768,
"loss": 5.926,
"mean_token_accuracy": 0.1435340739786625,
"num_tokens": 6931325.0,
"step": 3760
},
{
"entropy": 5.935520792007447,
"epoch": 0.3163201008191556,
"grad_norm": 1.03125,
"learning_rate": 0.0004993912749124665,
"loss": 5.8321,
"mean_token_accuracy": 0.14375170618295668,
"num_tokens": 6940234.0,
"step": 3765
},
{
"entropy": 5.882375431060791,
"epoch": 0.3167401806343205,
"grad_norm": 1.0,
"learning_rate": 0.0004993890715846804,
"loss": 5.9072,
"mean_token_accuracy": 0.14548940509557723,
"num_tokens": 6949067.0,
"step": 3770
},
{
"entropy": 5.975930404663086,
"epoch": 0.3171602604494854,
"grad_norm": 1.0625,
"learning_rate": 0.0004993868642819574,
"loss": 5.8933,
"mean_token_accuracy": 0.14246345758438111,
"num_tokens": 6959085.0,
"step": 3775
},
{
"entropy": 5.952158069610595,
"epoch": 0.31758034026465026,
"grad_norm": 1.0625,
"learning_rate": 0.0004993846530043367,
"loss": 5.9278,
"mean_token_accuracy": 0.13835494145750998,
"num_tokens": 6967392.0,
"step": 3780
},
{
"entropy": 5.894070625305176,
"epoch": 0.31800042007981516,
"grad_norm": 1.1328125,
"learning_rate": 0.0004993824377518574,
"loss": 5.8551,
"mean_token_accuracy": 0.14527128487825394,
"num_tokens": 6976369.0,
"step": 3785
},
{
"entropy": 5.92194676399231,
"epoch": 0.31842049989498006,
"grad_norm": 1.0625,
"learning_rate": 0.0004993802185245587,
"loss": 5.8617,
"mean_token_accuracy": 0.14414689913392068,
"num_tokens": 6985889.0,
"step": 3790
},
{
"entropy": 5.896907377243042,
"epoch": 0.3188405797101449,
"grad_norm": 1.0703125,
"learning_rate": 0.00049937799532248,
"loss": 5.8865,
"mean_token_accuracy": 0.1328159935772419,
"num_tokens": 6995396.0,
"step": 3795
},
{
"entropy": 6.125208377838135,
"epoch": 0.3192606595253098,
"grad_norm": 1.0078125,
"learning_rate": 0.0004993757681456607,
"loss": 5.9486,
"mean_token_accuracy": 0.1384855605661869,
"num_tokens": 7004666.0,
"step": 3800
},
{
"entropy": 5.907473802566528,
"epoch": 0.3196807393404747,
"grad_norm": 1.0,
"learning_rate": 0.0004993735369941401,
"loss": 5.9629,
"mean_token_accuracy": 0.1332864873111248,
"num_tokens": 7014608.0,
"step": 3805
},
{
"entropy": 5.944202995300293,
"epoch": 0.3201008191556396,
"grad_norm": 0.984375,
"learning_rate": 0.0004993713018679579,
"loss": 5.8458,
"mean_token_accuracy": 0.14125867262482644,
"num_tokens": 7023671.0,
"step": 3810
},
{
"entropy": 5.928729820251465,
"epoch": 0.32052089897080444,
"grad_norm": 0.99609375,
"learning_rate": 0.0004993690627671536,
"loss": 5.8973,
"mean_token_accuracy": 0.13645214289426805,
"num_tokens": 7033786.0,
"step": 3815
},
{
"entropy": 5.887518405914307,
"epoch": 0.32094097878596933,
"grad_norm": 1.0546875,
"learning_rate": 0.0004993668196917669,
"loss": 5.8101,
"mean_token_accuracy": 0.14580376744270324,
"num_tokens": 7042162.0,
"step": 3820
},
{
"entropy": 5.971746206283569,
"epoch": 0.32136105860113423,
"grad_norm": 1.046875,
"learning_rate": 0.0004993645726418375,
"loss": 5.9551,
"mean_token_accuracy": 0.14017711579799652,
"num_tokens": 7051903.0,
"step": 3825
},
{
"entropy": 5.813205862045288,
"epoch": 0.3217811384162991,
"grad_norm": 1.0546875,
"learning_rate": 0.0004993623216174053,
"loss": 5.7747,
"mean_token_accuracy": 0.15284693986177444,
"num_tokens": 7060229.0,
"step": 3830
},
{
"entropy": 5.849071598052978,
"epoch": 0.32220121823146397,
"grad_norm": 1.0859375,
"learning_rate": 0.00049936006661851,
"loss": 5.8722,
"mean_token_accuracy": 0.1437354326248169,
"num_tokens": 7069040.0,
"step": 3835
},
{
"entropy": 5.901387882232666,
"epoch": 0.32262129804662887,
"grad_norm": 1.1171875,
"learning_rate": 0.0004993578076451917,
"loss": 5.7592,
"mean_token_accuracy": 0.14521230906248092,
"num_tokens": 7078409.0,
"step": 3840
},
{
"entropy": 5.755710220336914,
"epoch": 0.32304137786179377,
"grad_norm": 1.046875,
"learning_rate": 0.0004993555446974903,
"loss": 5.8413,
"mean_token_accuracy": 0.13679178506135942,
"num_tokens": 7087983.0,
"step": 3845
},
{
"entropy": 5.907869100570679,
"epoch": 0.3234614576769586,
"grad_norm": 1.1328125,
"learning_rate": 0.000499353277775446,
"loss": 5.7939,
"mean_token_accuracy": 0.14731610864400863,
"num_tokens": 7097277.0,
"step": 3850
},
{
"entropy": 5.8764000415802,
"epoch": 0.3238815374921235,
"grad_norm": 1.1875,
"learning_rate": 0.0004993510068790989,
"loss": 5.7001,
"mean_token_accuracy": 0.1560800179839134,
"num_tokens": 7105918.0,
"step": 3855
},
{
"entropy": 5.7582238674163815,
"epoch": 0.3243016173072884,
"grad_norm": 1.0234375,
"learning_rate": 0.0004993487320084892,
"loss": 5.7782,
"mean_token_accuracy": 0.15001367256045342,
"num_tokens": 7115049.0,
"step": 3860
},
{
"entropy": 5.903066968917846,
"epoch": 0.32472169712245325,
"grad_norm": 1.015625,
"learning_rate": 0.0004993464531636573,
"loss": 5.8538,
"mean_token_accuracy": 0.1425308421254158,
"num_tokens": 7124862.0,
"step": 3865
},
{
"entropy": 5.880357646942139,
"epoch": 0.32514177693761814,
"grad_norm": 1.046875,
"learning_rate": 0.0004993441703446435,
"loss": 5.7615,
"mean_token_accuracy": 0.15210687071084977,
"num_tokens": 7133280.0,
"step": 3870
},
{
"entropy": 5.987690162658692,
"epoch": 0.32556185675278304,
"grad_norm": 1.078125,
"learning_rate": 0.0004993418835514882,
"loss": 5.9443,
"mean_token_accuracy": 0.14041659682989122,
"num_tokens": 7142446.0,
"step": 3875
},
{
"entropy": 5.899273204803467,
"epoch": 0.3259819365679479,
"grad_norm": 0.94921875,
"learning_rate": 0.0004993395927842321,
"loss": 5.8526,
"mean_token_accuracy": 0.13959464728832244,
"num_tokens": 7152143.0,
"step": 3880
},
{
"entropy": 5.969484949111939,
"epoch": 0.3264020163831128,
"grad_norm": 1.078125,
"learning_rate": 0.0004993372980429155,
"loss": 5.9318,
"mean_token_accuracy": 0.13812349438667298,
"num_tokens": 7162046.0,
"step": 3885
},
{
"entropy": 5.943978548049927,
"epoch": 0.3268220961982777,
"grad_norm": 1.03125,
"learning_rate": 0.0004993349993275792,
"loss": 5.8136,
"mean_token_accuracy": 0.14231757447123528,
"num_tokens": 7171557.0,
"step": 3890
},
{
"entropy": 5.693315172195435,
"epoch": 0.3272421760134426,
"grad_norm": 0.9765625,
"learning_rate": 0.0004993326966382639,
"loss": 5.72,
"mean_token_accuracy": 0.14861621260643004,
"num_tokens": 7180927.0,
"step": 3895
},
{
"entropy": 5.866424655914306,
"epoch": 0.3276622558286074,
"grad_norm": 1.1015625,
"learning_rate": 0.0004993303899750104,
"loss": 5.7842,
"mean_token_accuracy": 0.14696542471647261,
"num_tokens": 7189552.0,
"step": 3900
},
{
"entropy": 5.99456205368042,
"epoch": 0.3280823356437723,
"grad_norm": 1.109375,
"learning_rate": 0.0004993280793378595,
"loss": 5.8131,
"mean_token_accuracy": 0.14061524197459221,
"num_tokens": 7197857.0,
"step": 3905
},
{
"entropy": 5.852747583389283,
"epoch": 0.3285024154589372,
"grad_norm": 1.125,
"learning_rate": 0.0004993257647268522,
"loss": 5.784,
"mean_token_accuracy": 0.1499074526131153,
"num_tokens": 7206785.0,
"step": 3910
},
{
"entropy": 5.897435235977173,
"epoch": 0.32892249527410206,
"grad_norm": 1.0,
"learning_rate": 0.0004993234461420295,
"loss": 5.877,
"mean_token_accuracy": 0.14293809309601785,
"num_tokens": 7216360.0,
"step": 3915
},
{
"entropy": 5.796799039840698,
"epoch": 0.32934257508926695,
"grad_norm": 1.140625,
"learning_rate": 0.0004993211235834326,
"loss": 5.691,
"mean_token_accuracy": 0.1605226844549179,
"num_tokens": 7224890.0,
"step": 3920
},
{
"entropy": 5.739530944824219,
"epoch": 0.32976265490443185,
"grad_norm": 1.1796875,
"learning_rate": 0.0004993187970511023,
"loss": 5.7395,
"mean_token_accuracy": 0.16714823096990586,
"num_tokens": 7234442.0,
"step": 3925
},
{
"entropy": 5.890788459777832,
"epoch": 0.33018273471959675,
"grad_norm": 1.046875,
"learning_rate": 0.0004993164665450801,
"loss": 5.8905,
"mean_token_accuracy": 0.14693281054496765,
"num_tokens": 7244023.0,
"step": 3930
},
{
"entropy": 5.90093765258789,
"epoch": 0.3306028145347616,
"grad_norm": 1.0390625,
"learning_rate": 0.0004993141320654072,
"loss": 5.7506,
"mean_token_accuracy": 0.15061969310045242,
"num_tokens": 7253548.0,
"step": 3935
},
{
"entropy": 5.896792697906494,
"epoch": 0.3310228943499265,
"grad_norm": 1.046875,
"learning_rate": 0.000499311793612125,
"loss": 5.7993,
"mean_token_accuracy": 0.14582014903426171,
"num_tokens": 7262962.0,
"step": 3940
},
{
"entropy": 5.883127927780151,
"epoch": 0.3314429741650914,
"grad_norm": 1.0234375,
"learning_rate": 0.0004993094511852748,
"loss": 5.8366,
"mean_token_accuracy": 0.14408671855926514,
"num_tokens": 7272234.0,
"step": 3945
},
{
"entropy": 5.896127986907959,
"epoch": 0.33186305398025623,
"grad_norm": 1.0703125,
"learning_rate": 0.0004993071047848983,
"loss": 5.8305,
"mean_token_accuracy": 0.14289655536413193,
"num_tokens": 7281524.0,
"step": 3950
},
{
"entropy": 5.824465751647949,
"epoch": 0.3322831337954211,
"grad_norm": 1.0625,
"learning_rate": 0.0004993047544110368,
"loss": 5.7238,
"mean_token_accuracy": 0.1518530122935772,
"num_tokens": 7289601.0,
"step": 3955
},
{
"entropy": 5.694170951843262,
"epoch": 0.332703213610586,
"grad_norm": 1.125,
"learning_rate": 0.0004993024000637321,
"loss": 5.6752,
"mean_token_accuracy": 0.15183946788311004,
"num_tokens": 7298508.0,
"step": 3960
},
{
"entropy": 5.792486143112183,
"epoch": 0.33312329342575087,
"grad_norm": 1.015625,
"learning_rate": 0.0004993000417430259,
"loss": 5.8899,
"mean_token_accuracy": 0.14279956892132759,
"num_tokens": 7309065.0,
"step": 3965
},
{
"entropy": 6.024036836624146,
"epoch": 0.33354337324091576,
"grad_norm": 0.9765625,
"learning_rate": 0.00049929767944896,
"loss": 5.9246,
"mean_token_accuracy": 0.14718690514564514,
"num_tokens": 7319669.0,
"step": 3970
},
{
"entropy": 5.950971412658691,
"epoch": 0.33396345305608066,
"grad_norm": 1.0390625,
"learning_rate": 0.0004992953131815761,
"loss": 5.8623,
"mean_token_accuracy": 0.14798057675361634,
"num_tokens": 7328425.0,
"step": 3975
},
{
"entropy": 5.795519065856934,
"epoch": 0.33438353287124556,
"grad_norm": 1.1484375,
"learning_rate": 0.0004992929429409164,
"loss": 5.7591,
"mean_token_accuracy": 0.15135881006717683,
"num_tokens": 7337369.0,
"step": 3980
},
{
"entropy": 5.837197399139404,
"epoch": 0.3348036126864104,
"grad_norm": 1.0,
"learning_rate": 0.0004992905687270225,
"loss": 5.8149,
"mean_token_accuracy": 0.14630376771092415,
"num_tokens": 7346829.0,
"step": 3985
},
{
"entropy": 5.964364957809448,
"epoch": 0.3352236925015753,
"grad_norm": 1.0625,
"learning_rate": 0.0004992881905399368,
"loss": 5.8666,
"mean_token_accuracy": 0.14199195429682732,
"num_tokens": 7355976.0,
"step": 3990
},
{
"entropy": 5.900812721252441,
"epoch": 0.3356437723167402,
"grad_norm": 1.1796875,
"learning_rate": 0.0004992858083797013,
"loss": 5.8223,
"mean_token_accuracy": 0.1406247913837433,
"num_tokens": 7365210.0,
"step": 3995
},
{
"entropy": 5.871991872787476,
"epoch": 0.33606385213190504,
"grad_norm": 1.140625,
"learning_rate": 0.0004992834222463581,
"loss": 5.8811,
"mean_token_accuracy": 0.13051848039031028,
"num_tokens": 7374175.0,
"step": 4000
},
{
"entropy": 5.91477222442627,
"epoch": 0.33648393194706994,
"grad_norm": 1.046875,
"learning_rate": 0.0004992810321399496,
"loss": 5.9021,
"mean_token_accuracy": 0.13812239095568657,
"num_tokens": 7383302.0,
"step": 4005
},
{
"entropy": 5.950973415374756,
"epoch": 0.33690401176223483,
"grad_norm": 1.0703125,
"learning_rate": 0.0004992786380605182,
"loss": 5.8954,
"mean_token_accuracy": 0.13927265778183937,
"num_tokens": 7392746.0,
"step": 4010
},
{
"entropy": 5.840262222290039,
"epoch": 0.33732409157739973,
"grad_norm": 1.09375,
"learning_rate": 0.0004992762400081062,
"loss": 5.7351,
"mean_token_accuracy": 0.1521330937743187,
"num_tokens": 7401604.0,
"step": 4015
},
{
"entropy": 5.792042589187622,
"epoch": 0.3377441713925646,
"grad_norm": 1.0625,
"learning_rate": 0.0004992738379827559,
"loss": 5.8413,
"mean_token_accuracy": 0.1433362640440464,
"num_tokens": 7410594.0,
"step": 4020
},
{
"entropy": 5.919311571121216,
"epoch": 0.33816425120772947,
"grad_norm": 1.1015625,
"learning_rate": 0.0004992714319845101,
"loss": 5.7434,
"mean_token_accuracy": 0.15505423694849013,
"num_tokens": 7418831.0,
"step": 4025
},
{
"entropy": 5.792124700546265,
"epoch": 0.33858433102289437,
"grad_norm": 1.046875,
"learning_rate": 0.0004992690220134116,
"loss": 5.7874,
"mean_token_accuracy": 0.14773694202303886,
"num_tokens": 7427731.0,
"step": 4030
},
{
"entropy": 5.9903051376342775,
"epoch": 0.3390044108380592,
"grad_norm": 1.125,
"learning_rate": 0.0004992666080695027,
"loss": 5.901,
"mean_token_accuracy": 0.14006555154919625,
"num_tokens": 7436447.0,
"step": 4035
},
{
"entropy": 5.859370946884155,
"epoch": 0.3394244906532241,
"grad_norm": 1.1171875,
"learning_rate": 0.0004992641901528262,
"loss": 5.8003,
"mean_token_accuracy": 0.14703143313527106,
"num_tokens": 7445352.0,
"step": 4040
},
{
"entropy": 5.9202173233032225,
"epoch": 0.339844570468389,
"grad_norm": 1.0078125,
"learning_rate": 0.0004992617682634252,
"loss": 5.8533,
"mean_token_accuracy": 0.14578376188874245,
"num_tokens": 7454298.0,
"step": 4045
},
{
"entropy": 5.925920295715332,
"epoch": 0.34026465028355385,
"grad_norm": 0.9921875,
"learning_rate": 0.0004992593424013424,
"loss": 5.8687,
"mean_token_accuracy": 0.14278641790151597,
"num_tokens": 7463543.0,
"step": 4050
},
{
"entropy": 5.906185626983643,
"epoch": 0.34068473009871875,
"grad_norm": 1.078125,
"learning_rate": 0.0004992569125666209,
"loss": 5.8916,
"mean_token_accuracy": 0.14117056503891945,
"num_tokens": 7472701.0,
"step": 4055
},
{
"entropy": 5.985479879379272,
"epoch": 0.34110480991388364,
"grad_norm": 0.96484375,
"learning_rate": 0.0004992544787593037,
"loss": 5.877,
"mean_token_accuracy": 0.14191561043262482,
"num_tokens": 7481123.0,
"step": 4060
},
{
"entropy": 5.9610504627227785,
"epoch": 0.34152488972904854,
"grad_norm": 1.0234375,
"learning_rate": 0.0004992520409794338,
"loss": 5.9189,
"mean_token_accuracy": 0.1452765129506588,
"num_tokens": 7490439.0,
"step": 4065
},
{
"entropy": 5.836852645874023,
"epoch": 0.3419449695442134,
"grad_norm": 1.0546875,
"learning_rate": 0.0004992495992270544,
"loss": 5.7926,
"mean_token_accuracy": 0.14515199437737464,
"num_tokens": 7499326.0,
"step": 4070
},
{
"entropy": 5.891369199752807,
"epoch": 0.3423650493593783,
"grad_norm": 1.0859375,
"learning_rate": 0.0004992471535022089,
"loss": 5.8635,
"mean_token_accuracy": 0.14169666543602943,
"num_tokens": 7509407.0,
"step": 4075
},
{
"entropy": 5.9069582462310795,
"epoch": 0.3427851291745432,
"grad_norm": 1.015625,
"learning_rate": 0.0004992447038049405,
"loss": 5.8996,
"mean_token_accuracy": 0.14123916029930114,
"num_tokens": 7518443.0,
"step": 4080
},
{
"entropy": 5.812238025665283,
"epoch": 0.343205208989708,
"grad_norm": 1.1328125,
"learning_rate": 0.0004992422501352927,
"loss": 5.7668,
"mean_token_accuracy": 0.15378406941890715,
"num_tokens": 7527609.0,
"step": 4085
},
{
"entropy": 5.93126916885376,
"epoch": 0.3436252888048729,
"grad_norm": 1.1328125,
"learning_rate": 0.0004992397924933089,
"loss": 5.8537,
"mean_token_accuracy": 0.1427117370069027,
"num_tokens": 7536890.0,
"step": 4090
},
{
"entropy": 5.9002909660339355,
"epoch": 0.3440453686200378,
"grad_norm": 1.109375,
"learning_rate": 0.0004992373308790325,
"loss": 5.8216,
"mean_token_accuracy": 0.14809030741453172,
"num_tokens": 7546509.0,
"step": 4095
},
{
"entropy": 5.803368282318115,
"epoch": 0.3444654484352027,
"grad_norm": 1.0546875,
"learning_rate": 0.0004992348652925074,
"loss": 5.8462,
"mean_token_accuracy": 0.14304676800966262,
"num_tokens": 7555336.0,
"step": 4100
},
{
"entropy": 5.942346906661987,
"epoch": 0.34488552825036756,
"grad_norm": 1.1640625,
"learning_rate": 0.0004992323957337771,
"loss": 5.791,
"mean_token_accuracy": 0.14322008267045022,
"num_tokens": 7565210.0,
"step": 4105
},
{
"entropy": 5.929241180419922,
"epoch": 0.34530560806553245,
"grad_norm": 0.9609375,
"learning_rate": 0.0004992299222028855,
"loss": 5.8774,
"mean_token_accuracy": 0.15235665440559387,
"num_tokens": 7574516.0,
"step": 4110
},
{
"entropy": 5.811958742141724,
"epoch": 0.34572568788069735,
"grad_norm": 1.078125,
"learning_rate": 0.0004992274446998761,
"loss": 5.7417,
"mean_token_accuracy": 0.14640501961112024,
"num_tokens": 7583219.0,
"step": 4115
},
{
"entropy": 5.927424812316895,
"epoch": 0.3461457676958622,
"grad_norm": 1.09375,
"learning_rate": 0.0004992249632247929,
"loss": 5.9848,
"mean_token_accuracy": 0.13824489042162896,
"num_tokens": 7592050.0,
"step": 4120
},
{
"entropy": 5.978273487091064,
"epoch": 0.3465658475110271,
"grad_norm": 1.046875,
"learning_rate": 0.0004992224777776802,
"loss": 5.804,
"mean_token_accuracy": 0.14370897114276887,
"num_tokens": 7600718.0,
"step": 4125
},
{
"entropy": 5.832902526855468,
"epoch": 0.346985927326192,
"grad_norm": 1.0859375,
"learning_rate": 0.0004992199883585816,
"loss": 5.8337,
"mean_token_accuracy": 0.14620699509978294,
"num_tokens": 7609191.0,
"step": 4130
},
{
"entropy": 5.898161125183106,
"epoch": 0.34740600714135683,
"grad_norm": 1.0703125,
"learning_rate": 0.0004992174949675413,
"loss": 5.8663,
"mean_token_accuracy": 0.14224207326769828,
"num_tokens": 7618509.0,
"step": 4135
},
{
"entropy": 5.8946198463439945,
"epoch": 0.34782608695652173,
"grad_norm": 1.1796875,
"learning_rate": 0.0004992149976046037,
"loss": 5.7879,
"mean_token_accuracy": 0.144022449105978,
"num_tokens": 7627851.0,
"step": 4140
},
{
"entropy": 5.815617799758911,
"epoch": 0.3482461667716866,
"grad_norm": 1.0234375,
"learning_rate": 0.0004992124962698128,
"loss": 5.8457,
"mean_token_accuracy": 0.14384627863764762,
"num_tokens": 7636748.0,
"step": 4145
},
{
"entropy": 5.8924195766448975,
"epoch": 0.3486662465868515,
"grad_norm": 1.109375,
"learning_rate": 0.000499209990963213,
"loss": 5.7665,
"mean_token_accuracy": 0.14772162735462188,
"num_tokens": 7645436.0,
"step": 4150
},
{
"entropy": 5.927510690689087,
"epoch": 0.34908632640201637,
"grad_norm": 1.109375,
"learning_rate": 0.0004992074816848487,
"loss": 5.906,
"mean_token_accuracy": 0.14214019924402238,
"num_tokens": 7655414.0,
"step": 4155
},
{
"entropy": 5.804220294952392,
"epoch": 0.34950640621718126,
"grad_norm": 1.125,
"learning_rate": 0.0004992049684347642,
"loss": 5.6855,
"mean_token_accuracy": 0.1492043748497963,
"num_tokens": 7664295.0,
"step": 4160
},
{
"entropy": 5.893027591705322,
"epoch": 0.34992648603234616,
"grad_norm": 1.125,
"learning_rate": 0.0004992024512130042,
"loss": 5.8168,
"mean_token_accuracy": 0.14553705751895904,
"num_tokens": 7673295.0,
"step": 4165
},
{
"entropy": 5.80792088508606,
"epoch": 0.350346565847511,
"grad_norm": 0.98046875,
"learning_rate": 0.0004991999300196132,
"loss": 5.8163,
"mean_token_accuracy": 0.14397938475012778,
"num_tokens": 7682932.0,
"step": 4170
},
{
"entropy": 5.955667972564697,
"epoch": 0.3507666456626759,
"grad_norm": 1.1328125,
"learning_rate": 0.0004991974048546359,
"loss": 5.8403,
"mean_token_accuracy": 0.14294423833489417,
"num_tokens": 7692105.0,
"step": 4175
},
{
"entropy": 5.830784654617309,
"epoch": 0.3511867254778408,
"grad_norm": 1.09375,
"learning_rate": 0.000499194875718117,
"loss": 5.8283,
"mean_token_accuracy": 0.1480561077594757,
"num_tokens": 7701294.0,
"step": 4180
},
{
"entropy": 5.911309385299683,
"epoch": 0.3516068052930057,
"grad_norm": 1.0390625,
"learning_rate": 0.0004991923426101013,
"loss": 5.8251,
"mean_token_accuracy": 0.14229920953512193,
"num_tokens": 7710964.0,
"step": 4185
},
{
"entropy": 5.952438926696777,
"epoch": 0.35202688510817054,
"grad_norm": 1.0703125,
"learning_rate": 0.0004991898055306337,
"loss": 5.958,
"mean_token_accuracy": 0.13736522495746611,
"num_tokens": 7719938.0,
"step": 4190
},
{
"entropy": 5.8736350536346436,
"epoch": 0.35244696492333544,
"grad_norm": 0.984375,
"learning_rate": 0.0004991872644797591,
"loss": 5.8462,
"mean_token_accuracy": 0.14402214288711548,
"num_tokens": 7729129.0,
"step": 4195
},
{
"entropy": 5.931869459152222,
"epoch": 0.35286704473850034,
"grad_norm": 1.15625,
"learning_rate": 0.0004991847194575226,
"loss": 5.869,
"mean_token_accuracy": 0.1402227133512497,
"num_tokens": 7738506.0,
"step": 4200
},
{
"entropy": 5.974721908569336,
"epoch": 0.3532871245536652,
"grad_norm": 1.0390625,
"learning_rate": 0.0004991821704639693,
"loss": 5.9687,
"mean_token_accuracy": 0.14027536809444427,
"num_tokens": 7749320.0,
"step": 4205
},
{
"entropy": 6.00723237991333,
"epoch": 0.3537072043688301,
"grad_norm": 1.1015625,
"learning_rate": 0.0004991796174991443,
"loss": 5.8353,
"mean_token_accuracy": 0.14073035642504692,
"num_tokens": 7758735.0,
"step": 4210
},
{
"entropy": 5.794616985321045,
"epoch": 0.354127284183995,
"grad_norm": 1.140625,
"learning_rate": 0.0004991770605630927,
"loss": 5.7909,
"mean_token_accuracy": 0.14388300105929375,
"num_tokens": 7767556.0,
"step": 4215
},
{
"entropy": 5.851093196868897,
"epoch": 0.3545473639991598,
"grad_norm": 1.1015625,
"learning_rate": 0.0004991744996558599,
"loss": 5.8025,
"mean_token_accuracy": 0.14704317823052407,
"num_tokens": 7776615.0,
"step": 4220
},
{
"entropy": 5.890424919128418,
"epoch": 0.3549674438143247,
"grad_norm": 1.0546875,
"learning_rate": 0.0004991719347774913,
"loss": 5.8605,
"mean_token_accuracy": 0.14900021702051164,
"num_tokens": 7785288.0,
"step": 4225
},
{
"entropy": 5.859505701065063,
"epoch": 0.3553875236294896,
"grad_norm": 1.0546875,
"learning_rate": 0.0004991693659280324,
"loss": 5.7568,
"mean_token_accuracy": 0.14839265495538712,
"num_tokens": 7794381.0,
"step": 4230
},
{
"entropy": 5.84209361076355,
"epoch": 0.3558076034446545,
"grad_norm": 1.0546875,
"learning_rate": 0.0004991667931075284,
"loss": 5.7275,
"mean_token_accuracy": 0.14752706736326218,
"num_tokens": 7803265.0,
"step": 4235
},
{
"entropy": 5.866020679473877,
"epoch": 0.35622768325981935,
"grad_norm": 1.046875,
"learning_rate": 0.0004991642163160252,
"loss": 5.8504,
"mean_token_accuracy": 0.14277284890413283,
"num_tokens": 7812445.0,
"step": 4240
},
{
"entropy": 5.960299301147461,
"epoch": 0.35664776307498425,
"grad_norm": 0.95703125,
"learning_rate": 0.0004991616355535684,
"loss": 5.8377,
"mean_token_accuracy": 0.14702583178877832,
"num_tokens": 7822073.0,
"step": 4245
},
{
"entropy": 5.9013489246368405,
"epoch": 0.35706784289014915,
"grad_norm": 1.09375,
"learning_rate": 0.0004991590508202036,
"loss": 5.8132,
"mean_token_accuracy": 0.1407523714005947,
"num_tokens": 7831193.0,
"step": 4250
},
{
"entropy": 5.895477247238159,
"epoch": 0.357487922705314,
"grad_norm": 1.1171875,
"learning_rate": 0.0004991564621159766,
"loss": 5.8444,
"mean_token_accuracy": 0.14087399318814278,
"num_tokens": 7840311.0,
"step": 4255
},
{
"entropy": 5.826603794097901,
"epoch": 0.3579080025204789,
"grad_norm": 1.1015625,
"learning_rate": 0.0004991538694409334,
"loss": 5.8748,
"mean_token_accuracy": 0.13955658972263335,
"num_tokens": 7849622.0,
"step": 4260
},
{
"entropy": 5.89751443862915,
"epoch": 0.3583280823356438,
"grad_norm": 1.125,
"learning_rate": 0.0004991512727951198,
"loss": 5.8243,
"mean_token_accuracy": 0.14570727497339248,
"num_tokens": 7859494.0,
"step": 4265
},
{
"entropy": 6.055752372741699,
"epoch": 0.3587481621508087,
"grad_norm": 1.0546875,
"learning_rate": 0.0004991486721785818,
"loss": 5.9341,
"mean_token_accuracy": 0.1410772293806076,
"num_tokens": 7868526.0,
"step": 4270
},
{
"entropy": 5.842230558395386,
"epoch": 0.3591682419659735,
"grad_norm": 1.1015625,
"learning_rate": 0.0004991460675913655,
"loss": 5.7697,
"mean_token_accuracy": 0.14887222051620483,
"num_tokens": 7877631.0,
"step": 4275
},
{
"entropy": 5.889631223678589,
"epoch": 0.3595883217811384,
"grad_norm": 1.0703125,
"learning_rate": 0.000499143459033517,
"loss": 5.8122,
"mean_token_accuracy": 0.15351531505584717,
"num_tokens": 7886814.0,
"step": 4280
},
{
"entropy": 5.774262809753418,
"epoch": 0.3600084015963033,
"grad_norm": 1.078125,
"learning_rate": 0.0004991408465050825,
"loss": 5.6496,
"mean_token_accuracy": 0.15586639940738678,
"num_tokens": 7896337.0,
"step": 4285
},
{
"entropy": 5.794591951370239,
"epoch": 0.36042848141146816,
"grad_norm": 0.9765625,
"learning_rate": 0.0004991382300061084,
"loss": 5.9096,
"mean_token_accuracy": 0.13906644135713578,
"num_tokens": 7906071.0,
"step": 4290
},
{
"entropy": 5.954898834228516,
"epoch": 0.36084856122663306,
"grad_norm": 0.9765625,
"learning_rate": 0.0004991356095366409,
"loss": 5.8891,
"mean_token_accuracy": 0.14360055327415466,
"num_tokens": 7915003.0,
"step": 4295
},
{
"entropy": 5.958932161331177,
"epoch": 0.36126864104179796,
"grad_norm": 1.0625,
"learning_rate": 0.0004991329850967266,
"loss": 5.7565,
"mean_token_accuracy": 0.14655721336603164,
"num_tokens": 7924408.0,
"step": 4300
},
{
"entropy": 5.7926774501800535,
"epoch": 0.3616887208569628,
"grad_norm": 1.0078125,
"learning_rate": 0.0004991303566864118,
"loss": 5.7071,
"mean_token_accuracy": 0.15100994557142258,
"num_tokens": 7934717.0,
"step": 4305
},
{
"entropy": 5.794317770004272,
"epoch": 0.3621088006721277,
"grad_norm": 1.0078125,
"learning_rate": 0.0004991277243057431,
"loss": 5.7798,
"mean_token_accuracy": 0.14485913664102554,
"num_tokens": 7944278.0,
"step": 4310
},
{
"entropy": 5.806375885009766,
"epoch": 0.3625288804872926,
"grad_norm": 1.078125,
"learning_rate": 0.0004991250879547673,
"loss": 5.789,
"mean_token_accuracy": 0.14822428524494172,
"num_tokens": 7953344.0,
"step": 4315
},
{
"entropy": 5.846715068817138,
"epoch": 0.3629489603024575,
"grad_norm": 1.0078125,
"learning_rate": 0.0004991224476335309,
"loss": 5.8149,
"mean_token_accuracy": 0.14335916191339493,
"num_tokens": 7962869.0,
"step": 4320
},
{
"entropy": 5.923276138305664,
"epoch": 0.36336904011762233,
"grad_norm": 1.0546875,
"learning_rate": 0.0004991198033420807,
"loss": 5.8196,
"mean_token_accuracy": 0.14247879534959793,
"num_tokens": 7971981.0,
"step": 4325
},
{
"entropy": 5.807793760299683,
"epoch": 0.36378911993278723,
"grad_norm": 1.0,
"learning_rate": 0.0004991171550804636,
"loss": 5.7905,
"mean_token_accuracy": 0.1423468828201294,
"num_tokens": 7980979.0,
"step": 4330
},
{
"entropy": 5.894701766967773,
"epoch": 0.36420919974795213,
"grad_norm": 1.0546875,
"learning_rate": 0.0004991145028487266,
"loss": 5.8441,
"mean_token_accuracy": 0.14491954892873765,
"num_tokens": 7989607.0,
"step": 4335
},
{
"entropy": 5.807421636581421,
"epoch": 0.36462927956311697,
"grad_norm": 1.078125,
"learning_rate": 0.0004991118466469165,
"loss": 5.6743,
"mean_token_accuracy": 0.14987035244703292,
"num_tokens": 7998356.0,
"step": 4340
},
{
"entropy": 5.801383352279663,
"epoch": 0.36504935937828187,
"grad_norm": 1.1171875,
"learning_rate": 0.0004991091864750805,
"loss": 5.7912,
"mean_token_accuracy": 0.14811926037073136,
"num_tokens": 8007596.0,
"step": 4345
},
{
"entropy": 5.899381494522094,
"epoch": 0.36546943919344677,
"grad_norm": 1.078125,
"learning_rate": 0.0004991065223332655,
"loss": 5.8413,
"mean_token_accuracy": 0.13843609243631363,
"num_tokens": 8016493.0,
"step": 4350
},
{
"entropy": 5.873028373718261,
"epoch": 0.36588951900861166,
"grad_norm": 1.0703125,
"learning_rate": 0.0004991038542215191,
"loss": 5.8016,
"mean_token_accuracy": 0.1431220918893814,
"num_tokens": 8025867.0,
"step": 4355
},
{
"entropy": 5.8522546768188475,
"epoch": 0.3663095988237765,
"grad_norm": 1.0703125,
"learning_rate": 0.0004991011821398882,
"loss": 5.8429,
"mean_token_accuracy": 0.1501036748290062,
"num_tokens": 8036251.0,
"step": 4360
},
{
"entropy": 5.921151638031006,
"epoch": 0.3667296786389414,
"grad_norm": 1.1171875,
"learning_rate": 0.0004990985060884202,
"loss": 5.7996,
"mean_token_accuracy": 0.14999555051326752,
"num_tokens": 8045647.0,
"step": 4365
},
{
"entropy": 5.919665145874023,
"epoch": 0.3671497584541063,
"grad_norm": 1.046875,
"learning_rate": 0.0004990958260671627,
"loss": 5.8552,
"mean_token_accuracy": 0.13897678777575492,
"num_tokens": 8056025.0,
"step": 4370
},
{
"entropy": 5.852178335189819,
"epoch": 0.36756983826927114,
"grad_norm": 1.28125,
"learning_rate": 0.0004990931420761629,
"loss": 5.7784,
"mean_token_accuracy": 0.15142176076769828,
"num_tokens": 8065029.0,
"step": 4375
},
{
"entropy": 5.9026415824890135,
"epoch": 0.36798991808443604,
"grad_norm": 1.1171875,
"learning_rate": 0.0004990904541154685,
"loss": 5.7475,
"mean_token_accuracy": 0.1552293211221695,
"num_tokens": 8073249.0,
"step": 4380
},
{
"entropy": 5.88547682762146,
"epoch": 0.36840999789960094,
"grad_norm": 1.09375,
"learning_rate": 0.0004990877621851271,
"loss": 5.881,
"mean_token_accuracy": 0.14117127507925034,
"num_tokens": 8082039.0,
"step": 4385
},
{
"entropy": 5.770593690872192,
"epoch": 0.3688300777147658,
"grad_norm": 1.1640625,
"learning_rate": 0.0004990850662851863,
"loss": 5.7234,
"mean_token_accuracy": 0.15070880651474,
"num_tokens": 8090011.0,
"step": 4390
},
{
"entropy": 5.923220920562744,
"epoch": 0.3692501575299307,
"grad_norm": 1.1484375,
"learning_rate": 0.0004990823664156941,
"loss": 5.8305,
"mean_token_accuracy": 0.1554453194141388,
"num_tokens": 8099934.0,
"step": 4395
},
{
"entropy": 5.9510838985443115,
"epoch": 0.3696702373450956,
"grad_norm": 1.1484375,
"learning_rate": 0.0004990796625766981,
"loss": 5.8418,
"mean_token_accuracy": 0.14417348951101303,
"num_tokens": 8108969.0,
"step": 4400
},
{
"entropy": 5.8232039451599125,
"epoch": 0.3700903171602605,
"grad_norm": 1.1015625,
"learning_rate": 0.0004990769547682462,
"loss": 5.7724,
"mean_token_accuracy": 0.1469231814146042,
"num_tokens": 8117372.0,
"step": 4405
},
{
"entropy": 5.944585466384888,
"epoch": 0.3705103969754253,
"grad_norm": 1.0546875,
"learning_rate": 0.0004990742429903866,
"loss": 5.96,
"mean_token_accuracy": 0.1372426740825176,
"num_tokens": 8127108.0,
"step": 4410
},
{
"entropy": 6.011254787445068,
"epoch": 0.3709304767905902,
"grad_norm": 1.0234375,
"learning_rate": 0.000499071527243167,
"loss": 5.9317,
"mean_token_accuracy": 0.1404549315571785,
"num_tokens": 8137392.0,
"step": 4415
},
{
"entropy": 5.8784748077392575,
"epoch": 0.3713505566057551,
"grad_norm": 1.0625,
"learning_rate": 0.0004990688075266357,
"loss": 5.7757,
"mean_token_accuracy": 0.1502571687102318,
"num_tokens": 8146257.0,
"step": 4420
},
{
"entropy": 5.852525138854981,
"epoch": 0.37177063642091995,
"grad_norm": 1.078125,
"learning_rate": 0.0004990660838408409,
"loss": 5.7482,
"mean_token_accuracy": 0.1445622481405735,
"num_tokens": 8154952.0,
"step": 4425
},
{
"entropy": 5.867915201187134,
"epoch": 0.37219071623608485,
"grad_norm": 1.0703125,
"learning_rate": 0.0004990633561858308,
"loss": 5.7956,
"mean_token_accuracy": 0.14470055550336838,
"num_tokens": 8164365.0,
"step": 4430
},
{
"entropy": 5.882321166992187,
"epoch": 0.37261079605124975,
"grad_norm": 1.125,
"learning_rate": 0.0004990606245616537,
"loss": 5.8081,
"mean_token_accuracy": 0.14612025395035744,
"num_tokens": 8172614.0,
"step": 4435
},
{
"entropy": 5.951104545593262,
"epoch": 0.37303087586641465,
"grad_norm": 1.1171875,
"learning_rate": 0.0004990578889683579,
"loss": 5.8748,
"mean_token_accuracy": 0.14046685621142388,
"num_tokens": 8182445.0,
"step": 4440
},
{
"entropy": 5.873566579818726,
"epoch": 0.3734509556815795,
"grad_norm": 1.0703125,
"learning_rate": 0.0004990551494059921,
"loss": 5.7487,
"mean_token_accuracy": 0.14927588850259782,
"num_tokens": 8191871.0,
"step": 4445
},
{
"entropy": 5.879621505737305,
"epoch": 0.3738710354967444,
"grad_norm": 1.046875,
"learning_rate": 0.0004990524058746047,
"loss": 5.918,
"mean_token_accuracy": 0.1478033483028412,
"num_tokens": 8200658.0,
"step": 4450
},
{
"entropy": 5.87360520362854,
"epoch": 0.3742911153119093,
"grad_norm": 1.1328125,
"learning_rate": 0.0004990496583742443,
"loss": 5.8291,
"mean_token_accuracy": 0.14281335473060608,
"num_tokens": 8209776.0,
"step": 4455
},
{
"entropy": 5.873605442047119,
"epoch": 0.3747111951270741,
"grad_norm": 1.15625,
"learning_rate": 0.0004990469069049596,
"loss": 5.7478,
"mean_token_accuracy": 0.15005824714899063,
"num_tokens": 8219401.0,
"step": 4460
},
{
"entropy": 5.832020092010498,
"epoch": 0.375131274942239,
"grad_norm": 1.0859375,
"learning_rate": 0.0004990441514667993,
"loss": 5.799,
"mean_token_accuracy": 0.15009642094373704,
"num_tokens": 8228762.0,
"step": 4465
},
{
"entropy": 5.921987819671631,
"epoch": 0.3755513547574039,
"grad_norm": 1.09375,
"learning_rate": 0.0004990413920598121,
"loss": 5.816,
"mean_token_accuracy": 0.14943203181028367,
"num_tokens": 8236612.0,
"step": 4470
},
{
"entropy": 5.8898248195648195,
"epoch": 0.37597143457256876,
"grad_norm": 1.1328125,
"learning_rate": 0.0004990386286840471,
"loss": 5.8002,
"mean_token_accuracy": 0.14471676647663118,
"num_tokens": 8245043.0,
"step": 4475
},
{
"entropy": 5.907721614837646,
"epoch": 0.37639151438773366,
"grad_norm": 1.1015625,
"learning_rate": 0.0004990358613395532,
"loss": 5.9009,
"mean_token_accuracy": 0.14435933604836465,
"num_tokens": 8255270.0,
"step": 4480
},
{
"entropy": 5.959932088851929,
"epoch": 0.37681159420289856,
"grad_norm": 0.98828125,
"learning_rate": 0.0004990330900263792,
"loss": 5.8703,
"mean_token_accuracy": 0.13668696507811545,
"num_tokens": 8264761.0,
"step": 4485
},
{
"entropy": 5.932935667037964,
"epoch": 0.37723167401806346,
"grad_norm": 1.0234375,
"learning_rate": 0.0004990303147445745,
"loss": 5.8345,
"mean_token_accuracy": 0.14731877371668817,
"num_tokens": 8274308.0,
"step": 4490
},
{
"entropy": 5.771492147445679,
"epoch": 0.3776517538332283,
"grad_norm": 1.09375,
"learning_rate": 0.0004990275354941881,
"loss": 5.7201,
"mean_token_accuracy": 0.1533718004822731,
"num_tokens": 8283323.0,
"step": 4495
},
{
"entropy": 5.900288772583008,
"epoch": 0.3780718336483932,
"grad_norm": 1.0,
"learning_rate": 0.0004990247522752694,
"loss": 6.0389,
"mean_token_accuracy": 0.1318746455013752,
"num_tokens": 8293452.0,
"step": 4500
},
{
"entropy": 5.938249158859253,
"epoch": 0.3784919134635581,
"grad_norm": 1.0390625,
"learning_rate": 0.0004990219650878674,
"loss": 5.7189,
"mean_token_accuracy": 0.15227051973342895,
"num_tokens": 8302941.0,
"step": 4505
},
{
"entropy": 5.7787196159362795,
"epoch": 0.37891199327872294,
"grad_norm": 1.609375,
"learning_rate": 0.0004990191739320318,
"loss": 5.7343,
"mean_token_accuracy": 0.14977421835064889,
"num_tokens": 8311811.0,
"step": 4510
},
{
"entropy": 5.729694271087647,
"epoch": 0.37933207309388783,
"grad_norm": 1.03125,
"learning_rate": 0.0004990163788078117,
"loss": 5.6485,
"mean_token_accuracy": 0.15700877755880355,
"num_tokens": 8321130.0,
"step": 4515
},
{
"entropy": 5.794627618789673,
"epoch": 0.37975215290905273,
"grad_norm": 1.046875,
"learning_rate": 0.0004990135797152569,
"loss": 5.7616,
"mean_token_accuracy": 0.14539173543453215,
"num_tokens": 8330233.0,
"step": 4520
},
{
"entropy": 5.780507516860962,
"epoch": 0.3801722327242176,
"grad_norm": 1.125,
"learning_rate": 0.0004990107766544169,
"loss": 5.74,
"mean_token_accuracy": 0.1528162345290184,
"num_tokens": 8338585.0,
"step": 4525
},
{
"entropy": 5.822282552719116,
"epoch": 0.38059231253938247,
"grad_norm": 1.1171875,
"learning_rate": 0.0004990079696253413,
"loss": 5.7744,
"mean_token_accuracy": 0.15210666060447692,
"num_tokens": 8346618.0,
"step": 4530
},
{
"entropy": 5.853319835662842,
"epoch": 0.38101239235454737,
"grad_norm": 1.1015625,
"learning_rate": 0.0004990051586280799,
"loss": 5.7773,
"mean_token_accuracy": 0.14609165936708451,
"num_tokens": 8356273.0,
"step": 4535
},
{
"entropy": 5.819488286972046,
"epoch": 0.38143247216971227,
"grad_norm": 0.99609375,
"learning_rate": 0.0004990023436626824,
"loss": 5.7532,
"mean_token_accuracy": 0.14789900034666062,
"num_tokens": 8366668.0,
"step": 4540
},
{
"entropy": 5.977391242980957,
"epoch": 0.3818525519848771,
"grad_norm": 1.1796875,
"learning_rate": 0.0004989995247291988,
"loss": 5.8693,
"mean_token_accuracy": 0.14475300461053847,
"num_tokens": 8375610.0,
"step": 4545
},
{
"entropy": 5.869308757781982,
"epoch": 0.382272631800042,
"grad_norm": 1.09375,
"learning_rate": 0.0004989967018276789,
"loss": 5.7532,
"mean_token_accuracy": 0.1510820835828781,
"num_tokens": 8384455.0,
"step": 4550
},
{
"entropy": 5.768916749954224,
"epoch": 0.3826927116152069,
"grad_norm": 1.0546875,
"learning_rate": 0.0004989938749581727,
"loss": 5.7794,
"mean_token_accuracy": 0.14365579262375833,
"num_tokens": 8393868.0,
"step": 4555
},
{
"entropy": 5.859430313110352,
"epoch": 0.38311279143037175,
"grad_norm": 1.03125,
"learning_rate": 0.0004989910441207305,
"loss": 5.8063,
"mean_token_accuracy": 0.14428819417953492,
"num_tokens": 8402916.0,
"step": 4560
},
{
"entropy": 5.804932451248169,
"epoch": 0.38353287124553664,
"grad_norm": 1.2109375,
"learning_rate": 0.0004989882093154023,
"loss": 5.7313,
"mean_token_accuracy": 0.1530011385679245,
"num_tokens": 8411649.0,
"step": 4565
},
{
"entropy": 5.838339900970459,
"epoch": 0.38395295106070154,
"grad_norm": 1.0390625,
"learning_rate": 0.0004989853705422381,
"loss": 5.8416,
"mean_token_accuracy": 0.1406970351934433,
"num_tokens": 8420393.0,
"step": 4570
},
{
"entropy": 5.8445580959320065,
"epoch": 0.38437303087586644,
"grad_norm": 1.109375,
"learning_rate": 0.0004989825278012886,
"loss": 5.7338,
"mean_token_accuracy": 0.14734147042036055,
"num_tokens": 8429404.0,
"step": 4575
},
{
"entropy": 5.827604675292969,
"epoch": 0.3847931106910313,
"grad_norm": 1.1875,
"learning_rate": 0.000498979681092604,
"loss": 5.7896,
"mean_token_accuracy": 0.1458117887377739,
"num_tokens": 8438299.0,
"step": 4580
},
{
"entropy": 5.774310302734375,
"epoch": 0.3852131905061962,
"grad_norm": 1.0078125,
"learning_rate": 0.0004989768304162345,
"loss": 5.7287,
"mean_token_accuracy": 0.14704838395118713,
"num_tokens": 8447392.0,
"step": 4585
},
{
"entropy": 5.8903289318084715,
"epoch": 0.3856332703213611,
"grad_norm": 1.015625,
"learning_rate": 0.0004989739757722308,
"loss": 5.8349,
"mean_token_accuracy": 0.1420938104391098,
"num_tokens": 8456361.0,
"step": 4590
},
{
"entropy": 5.8679603099822994,
"epoch": 0.3860533501365259,
"grad_norm": 1.09375,
"learning_rate": 0.0004989711171606436,
"loss": 5.7636,
"mean_token_accuracy": 0.1483553446829319,
"num_tokens": 8465548.0,
"step": 4595
},
{
"entropy": 5.890800905227661,
"epoch": 0.3864734299516908,
"grad_norm": 1.078125,
"learning_rate": 0.0004989682545815232,
"loss": 5.7864,
"mean_token_accuracy": 0.14626673310995103,
"num_tokens": 8474454.0,
"step": 4600
},
{
"entropy": 5.799082851409912,
"epoch": 0.3868935097668557,
"grad_norm": 1.203125,
"learning_rate": 0.0004989653880349207,
"loss": 5.6976,
"mean_token_accuracy": 0.14468655437231065,
"num_tokens": 8482694.0,
"step": 4605
},
{
"entropy": 5.846567821502686,
"epoch": 0.38731358958202056,
"grad_norm": 1.140625,
"learning_rate": 0.0004989625175208864,
"loss": 5.809,
"mean_token_accuracy": 0.14379870519042015,
"num_tokens": 8491162.0,
"step": 4610
},
{
"entropy": 5.696895551681519,
"epoch": 0.38773366939718545,
"grad_norm": 1.0859375,
"learning_rate": 0.0004989596430394717,
"loss": 5.663,
"mean_token_accuracy": 0.1604851856827736,
"num_tokens": 8500716.0,
"step": 4615
},
{
"entropy": 5.810876178741455,
"epoch": 0.38815374921235035,
"grad_norm": 1.1171875,
"learning_rate": 0.000498956764590727,
"loss": 5.7038,
"mean_token_accuracy": 0.14853276833891868,
"num_tokens": 8508871.0,
"step": 4620
},
{
"entropy": 5.960144567489624,
"epoch": 0.38857382902751525,
"grad_norm": 1.1484375,
"learning_rate": 0.0004989538821747037,
"loss": 5.9266,
"mean_token_accuracy": 0.1409666955471039,
"num_tokens": 8518450.0,
"step": 4625
},
{
"entropy": 5.916182136535644,
"epoch": 0.3889939088426801,
"grad_norm": 1.0,
"learning_rate": 0.0004989509957914527,
"loss": 5.8179,
"mean_token_accuracy": 0.14130397886037827,
"num_tokens": 8528238.0,
"step": 4630
},
{
"entropy": 5.778582000732422,
"epoch": 0.389413988657845,
"grad_norm": 1.046875,
"learning_rate": 0.0004989481054410251,
"loss": 5.7111,
"mean_token_accuracy": 0.14815662652254105,
"num_tokens": 8537587.0,
"step": 4635
},
{
"entropy": 5.860828018188476,
"epoch": 0.3898340684730099,
"grad_norm": 1.078125,
"learning_rate": 0.0004989452111234721,
"loss": 5.8194,
"mean_token_accuracy": 0.14564207270741464,
"num_tokens": 8547703.0,
"step": 4640
},
{
"entropy": 5.833293724060058,
"epoch": 0.39025414828817473,
"grad_norm": 1.203125,
"learning_rate": 0.000498942312838845,
"loss": 5.7527,
"mean_token_accuracy": 0.1507646232843399,
"num_tokens": 8557001.0,
"step": 4645
},
{
"entropy": 5.812011241912842,
"epoch": 0.3906742281033396,
"grad_norm": 1.1015625,
"learning_rate": 0.0004989394105871952,
"loss": 5.6446,
"mean_token_accuracy": 0.15711887776851655,
"num_tokens": 8565638.0,
"step": 4650
},
{
"entropy": 5.877040576934815,
"epoch": 0.3910943079185045,
"grad_norm": 1.2578125,
"learning_rate": 0.000498936504368574,
"loss": 5.8199,
"mean_token_accuracy": 0.14237563014030458,
"num_tokens": 8574428.0,
"step": 4655
},
{
"entropy": 5.787962818145752,
"epoch": 0.3915143877336694,
"grad_norm": 1.0859375,
"learning_rate": 0.0004989335941830329,
"loss": 5.7797,
"mean_token_accuracy": 0.14808216989040374,
"num_tokens": 8583157.0,
"step": 4660
},
{
"entropy": 5.842103910446167,
"epoch": 0.39193446754883426,
"grad_norm": 1.1640625,
"learning_rate": 0.0004989306800306236,
"loss": 5.7493,
"mean_token_accuracy": 0.1453156664967537,
"num_tokens": 8592382.0,
"step": 4665
},
{
"entropy": 5.755923700332642,
"epoch": 0.39235454736399916,
"grad_norm": 1.1484375,
"learning_rate": 0.0004989277619113975,
"loss": 5.7158,
"mean_token_accuracy": 0.15265436917543412,
"num_tokens": 8601058.0,
"step": 4670
},
{
"entropy": 5.874063444137573,
"epoch": 0.39277462717916406,
"grad_norm": 1.1484375,
"learning_rate": 0.0004989248398254065,
"loss": 5.8198,
"mean_token_accuracy": 0.14102344885468482,
"num_tokens": 8609479.0,
"step": 4675
},
{
"entropy": 5.8090002059936525,
"epoch": 0.3931947069943289,
"grad_norm": 1.1015625,
"learning_rate": 0.0004989219137727021,
"loss": 5.7837,
"mean_token_accuracy": 0.14906230419874192,
"num_tokens": 8618860.0,
"step": 4680
},
{
"entropy": 5.830205297470092,
"epoch": 0.3936147868094938,
"grad_norm": 1.078125,
"learning_rate": 0.0004989189837533365,
"loss": 5.7182,
"mean_token_accuracy": 0.14967697113752365,
"num_tokens": 8627462.0,
"step": 4685
},
{
"entropy": 5.934653282165527,
"epoch": 0.3940348666246587,
"grad_norm": 0.9921875,
"learning_rate": 0.0004989160497673613,
"loss": 5.9021,
"mean_token_accuracy": 0.14029332622885704,
"num_tokens": 8637569.0,
"step": 4690
},
{
"entropy": 5.910148859024048,
"epoch": 0.39445494643982354,
"grad_norm": 1.171875,
"learning_rate": 0.0004989131118148286,
"loss": 5.7088,
"mean_token_accuracy": 0.1493791311979294,
"num_tokens": 8645440.0,
"step": 4695
},
{
"entropy": 5.807815074920654,
"epoch": 0.39487502625498844,
"grad_norm": 1.09375,
"learning_rate": 0.0004989101698957904,
"loss": 5.8673,
"mean_token_accuracy": 0.14714024513959884,
"num_tokens": 8655077.0,
"step": 4700
},
{
"entropy": 5.876171731948853,
"epoch": 0.39529510607015333,
"grad_norm": 1.1875,
"learning_rate": 0.0004989072240102988,
"loss": 5.783,
"mean_token_accuracy": 0.15092868655920028,
"num_tokens": 8663126.0,
"step": 4705
},
{
"entropy": 5.894665288925171,
"epoch": 0.39571518588531823,
"grad_norm": 1.109375,
"learning_rate": 0.0004989042741584061,
"loss": 5.7571,
"mean_token_accuracy": 0.14441050142049788,
"num_tokens": 8672386.0,
"step": 4710
},
{
"entropy": 5.667036533355713,
"epoch": 0.3961352657004831,
"grad_norm": 1.0703125,
"learning_rate": 0.0004989013203401645,
"loss": 5.7166,
"mean_token_accuracy": 0.15284878835082055,
"num_tokens": 8681930.0,
"step": 4715
},
{
"entropy": 5.86320013999939,
"epoch": 0.396555345515648,
"grad_norm": 1.078125,
"learning_rate": 0.0004988983625556264,
"loss": 5.7663,
"mean_token_accuracy": 0.14834155216813089,
"num_tokens": 8690993.0,
"step": 4720
},
{
"entropy": 5.822726678848267,
"epoch": 0.39697542533081287,
"grad_norm": 1.1328125,
"learning_rate": 0.0004988954008048438,
"loss": 5.7459,
"mean_token_accuracy": 0.15125422477722167,
"num_tokens": 8699497.0,
"step": 4725
},
{
"entropy": 5.943749618530274,
"epoch": 0.3973955051459777,
"grad_norm": 1.1015625,
"learning_rate": 0.0004988924350878697,
"loss": 5.9393,
"mean_token_accuracy": 0.1371678613126278,
"num_tokens": 8709274.0,
"step": 4730
},
{
"entropy": 5.918771076202392,
"epoch": 0.3978155849611426,
"grad_norm": 1.0625,
"learning_rate": 0.0004988894654047563,
"loss": 5.7983,
"mean_token_accuracy": 0.14176035523414612,
"num_tokens": 8718158.0,
"step": 4735
},
{
"entropy": 5.741904830932617,
"epoch": 0.3982356647763075,
"grad_norm": 1.046875,
"learning_rate": 0.0004988864917555562,
"loss": 5.6785,
"mean_token_accuracy": 0.14662874788045882,
"num_tokens": 8727459.0,
"step": 4740
},
{
"entropy": 5.864354753494263,
"epoch": 0.3986557445914724,
"grad_norm": 1.34375,
"learning_rate": 0.0004988835141403224,
"loss": 5.8222,
"mean_token_accuracy": 0.15272913128137589,
"num_tokens": 8737614.0,
"step": 4745
},
{
"entropy": 5.787281656265259,
"epoch": 0.39907582440663725,
"grad_norm": 1.1953125,
"learning_rate": 0.0004988805325591073,
"loss": 5.6447,
"mean_token_accuracy": 0.15006959736347197,
"num_tokens": 8746799.0,
"step": 4750
},
{
"entropy": 5.795963478088379,
"epoch": 0.39949590422180214,
"grad_norm": 1.140625,
"learning_rate": 0.0004988775470119639,
"loss": 5.8349,
"mean_token_accuracy": 0.14210814163088797,
"num_tokens": 8756555.0,
"step": 4755
},
{
"entropy": 5.811279678344727,
"epoch": 0.39991598403696704,
"grad_norm": 1.0859375,
"learning_rate": 0.0004988745574989451,
"loss": 5.8472,
"mean_token_accuracy": 0.14978650212287903,
"num_tokens": 8765849.0,
"step": 4760
},
{
"entropy": 6.0015599727630615,
"epoch": 0.4003360638521319,
"grad_norm": 1.015625,
"learning_rate": 0.0004988715640201036,
"loss": 5.904,
"mean_token_accuracy": 0.14214877486228944,
"num_tokens": 8775713.0,
"step": 4765
},
{
"entropy": 5.84947829246521,
"epoch": 0.4007561436672968,
"grad_norm": 1.0625,
"learning_rate": 0.0004988685665754928,
"loss": 5.7345,
"mean_token_accuracy": 0.1462959110736847,
"num_tokens": 8784717.0,
"step": 4770
},
{
"entropy": 5.768613386154175,
"epoch": 0.4011762234824617,
"grad_norm": 1.1171875,
"learning_rate": 0.0004988655651651656,
"loss": 5.7506,
"mean_token_accuracy": 0.1455540031194687,
"num_tokens": 8794388.0,
"step": 4775
},
{
"entropy": 5.7637909889221195,
"epoch": 0.4015963032976265,
"grad_norm": 1.125,
"learning_rate": 0.0004988625597891751,
"loss": 5.7701,
"mean_token_accuracy": 0.14478957056999206,
"num_tokens": 8802436.0,
"step": 4780
},
{
"entropy": 5.9429168701171875,
"epoch": 0.4020163831127914,
"grad_norm": 1.0703125,
"learning_rate": 0.0004988595504475746,
"loss": 5.7261,
"mean_token_accuracy": 0.14802230298519134,
"num_tokens": 8811184.0,
"step": 4785
},
{
"entropy": 5.862749767303467,
"epoch": 0.4024364629279563,
"grad_norm": 1.078125,
"learning_rate": 0.0004988565371404175,
"loss": 5.8073,
"mean_token_accuracy": 0.1479243002831936,
"num_tokens": 8820525.0,
"step": 4790
},
{
"entropy": 5.766486692428589,
"epoch": 0.4028565427431212,
"grad_norm": 1.109375,
"learning_rate": 0.0004988535198677571,
"loss": 5.6627,
"mean_token_accuracy": 0.15454638600349427,
"num_tokens": 8828928.0,
"step": 4795
},
{
"entropy": 5.870850229263306,
"epoch": 0.40327662255828606,
"grad_norm": 1.125,
"learning_rate": 0.0004988504986296469,
"loss": 5.8718,
"mean_token_accuracy": 0.139602579921484,
"num_tokens": 8838615.0,
"step": 4800
},
{
"entropy": 5.950463771820068,
"epoch": 0.40369670237345096,
"grad_norm": 1.046875,
"learning_rate": 0.0004988474734261404,
"loss": 5.856,
"mean_token_accuracy": 0.13832745403051377,
"num_tokens": 8848709.0,
"step": 4805
},
{
"entropy": 5.906767892837524,
"epoch": 0.40411678218861585,
"grad_norm": 1.078125,
"learning_rate": 0.0004988444442572911,
"loss": 5.7919,
"mean_token_accuracy": 0.1422821246087551,
"num_tokens": 8858277.0,
"step": 4810
},
{
"entropy": 5.770445251464844,
"epoch": 0.4045368620037807,
"grad_norm": 1.078125,
"learning_rate": 0.0004988414111231528,
"loss": 5.7716,
"mean_token_accuracy": 0.1510346472263336,
"num_tokens": 8868436.0,
"step": 4815
},
{
"entropy": 5.844740867614746,
"epoch": 0.4049569418189456,
"grad_norm": 1.0078125,
"learning_rate": 0.000498838374023779,
"loss": 5.7534,
"mean_token_accuracy": 0.142206509411335,
"num_tokens": 8877740.0,
"step": 4820
},
{
"entropy": 5.885952091217041,
"epoch": 0.4053770216341105,
"grad_norm": 1.0078125,
"learning_rate": 0.0004988353329592239,
"loss": 5.7489,
"mean_token_accuracy": 0.14901506081223487,
"num_tokens": 8887408.0,
"step": 4825
},
{
"entropy": 5.8851417064666744,
"epoch": 0.4057971014492754,
"grad_norm": 1.15625,
"learning_rate": 0.0004988322879295409,
"loss": 5.9079,
"mean_token_accuracy": 0.1435435011982918,
"num_tokens": 8897141.0,
"step": 4830
},
{
"entropy": 5.849951601028442,
"epoch": 0.40621718126444023,
"grad_norm": 1.078125,
"learning_rate": 0.0004988292389347844,
"loss": 5.6885,
"mean_token_accuracy": 0.15934282243251802,
"num_tokens": 8905747.0,
"step": 4835
},
{
"entropy": 5.889691162109375,
"epoch": 0.40663726107960513,
"grad_norm": 1.125,
"learning_rate": 0.000498826185975008,
"loss": 5.8197,
"mean_token_accuracy": 0.14562582597136497,
"num_tokens": 8914926.0,
"step": 4840
},
{
"entropy": 5.844784164428711,
"epoch": 0.40705734089477,
"grad_norm": 1.109375,
"learning_rate": 0.0004988231290502662,
"loss": 5.8302,
"mean_token_accuracy": 0.1439170941710472,
"num_tokens": 8923956.0,
"step": 4845
},
{
"entropy": 5.8658223152160645,
"epoch": 0.40747742070993487,
"grad_norm": 1.09375,
"learning_rate": 0.0004988200681606127,
"loss": 5.7062,
"mean_token_accuracy": 0.14471790790557862,
"num_tokens": 8932654.0,
"step": 4850
},
{
"entropy": 5.820353651046753,
"epoch": 0.40789750052509977,
"grad_norm": 1.1015625,
"learning_rate": 0.000498817003306102,
"loss": 5.7099,
"mean_token_accuracy": 0.153290569037199,
"num_tokens": 8941716.0,
"step": 4855
},
{
"entropy": 5.757675075531006,
"epoch": 0.40831758034026466,
"grad_norm": 1.125,
"learning_rate": 0.0004988139344867884,
"loss": 5.7752,
"mean_token_accuracy": 0.14610066637396812,
"num_tokens": 8950377.0,
"step": 4860
},
{
"entropy": 5.777942085266114,
"epoch": 0.4087376601554295,
"grad_norm": 1.109375,
"learning_rate": 0.0004988108617027261,
"loss": 5.7374,
"mean_token_accuracy": 0.14812174588441848,
"num_tokens": 8959857.0,
"step": 4865
},
{
"entropy": 5.746048545837402,
"epoch": 0.4091577399705944,
"grad_norm": 1.0625,
"learning_rate": 0.0004988077849539698,
"loss": 5.6789,
"mean_token_accuracy": 0.15261283069849013,
"num_tokens": 8968272.0,
"step": 4870
},
{
"entropy": 5.8564534187316895,
"epoch": 0.4095778197857593,
"grad_norm": 1.15625,
"learning_rate": 0.0004988047042405736,
"loss": 5.7535,
"mean_token_accuracy": 0.1546842411160469,
"num_tokens": 8977445.0,
"step": 4875
},
{
"entropy": 5.9273241519927975,
"epoch": 0.4099978996009242,
"grad_norm": 1.015625,
"learning_rate": 0.0004988016195625924,
"loss": 5.8238,
"mean_token_accuracy": 0.1429767683148384,
"num_tokens": 8987315.0,
"step": 4880
},
{
"entropy": 5.859557819366455,
"epoch": 0.41041797941608904,
"grad_norm": 1.0546875,
"learning_rate": 0.0004987985309200807,
"loss": 5.8188,
"mean_token_accuracy": 0.14291241616010666,
"num_tokens": 8998119.0,
"step": 4885
},
{
"entropy": 5.789018440246582,
"epoch": 0.41083805923125394,
"grad_norm": 1.0859375,
"learning_rate": 0.0004987954383130934,
"loss": 5.6925,
"mean_token_accuracy": 0.15540220364928245,
"num_tokens": 9007167.0,
"step": 4890
},
{
"entropy": 5.792231464385987,
"epoch": 0.41125813904641884,
"grad_norm": 1.0546875,
"learning_rate": 0.000498792341741685,
"loss": 5.7706,
"mean_token_accuracy": 0.14267298579216003,
"num_tokens": 9016690.0,
"step": 4895
},
{
"entropy": 5.932197856903076,
"epoch": 0.4116782188615837,
"grad_norm": 1.046875,
"learning_rate": 0.0004987892412059106,
"loss": 5.8367,
"mean_token_accuracy": 0.1477791592478752,
"num_tokens": 9026117.0,
"step": 4900
},
{
"entropy": 5.7695259094238285,
"epoch": 0.4120982986767486,
"grad_norm": 1.125,
"learning_rate": 0.0004987861367058251,
"loss": 5.7227,
"mean_token_accuracy": 0.14521004483103753,
"num_tokens": 9035754.0,
"step": 4905
},
{
"entropy": 5.844812536239624,
"epoch": 0.4125183784919135,
"grad_norm": 1.1015625,
"learning_rate": 0.0004987830282414833,
"loss": 5.713,
"mean_token_accuracy": 0.15308188572525977,
"num_tokens": 9045453.0,
"step": 4910
},
{
"entropy": 5.901562261581421,
"epoch": 0.41293845830707837,
"grad_norm": 1.09375,
"learning_rate": 0.0004987799158129404,
"loss": 5.8435,
"mean_token_accuracy": 0.14217550978064536,
"num_tokens": 9056045.0,
"step": 4915
},
{
"entropy": 5.7665058135986325,
"epoch": 0.4133585381222432,
"grad_norm": 1.0859375,
"learning_rate": 0.0004987767994202516,
"loss": 5.7055,
"mean_token_accuracy": 0.14514194130897523,
"num_tokens": 9065728.0,
"step": 4920
},
{
"entropy": 5.8297545433044435,
"epoch": 0.4137786179374081,
"grad_norm": 1.09375,
"learning_rate": 0.0004987736790634719,
"loss": 5.7584,
"mean_token_accuracy": 0.14594158828258513,
"num_tokens": 9075522.0,
"step": 4925
},
{
"entropy": 5.812106466293335,
"epoch": 0.414198697752573,
"grad_norm": 1.21875,
"learning_rate": 0.0004987705547426568,
"loss": 5.708,
"mean_token_accuracy": 0.14480755999684333,
"num_tokens": 9084412.0,
"step": 4930
},
{
"entropy": 5.854213285446167,
"epoch": 0.41461877756773785,
"grad_norm": 1.0390625,
"learning_rate": 0.0004987674264578615,
"loss": 5.795,
"mean_token_accuracy": 0.14237734973430632,
"num_tokens": 9094289.0,
"step": 4935
},
{
"entropy": 5.794469690322876,
"epoch": 0.41503885738290275,
"grad_norm": 1.1328125,
"learning_rate": 0.0004987642942091414,
"loss": 5.715,
"mean_token_accuracy": 0.14896973520517348,
"num_tokens": 9103124.0,
"step": 4940
},
{
"entropy": 5.814328575134278,
"epoch": 0.41545893719806765,
"grad_norm": 1.078125,
"learning_rate": 0.0004987611579965523,
"loss": 5.6552,
"mean_token_accuracy": 0.14823724925518036,
"num_tokens": 9112794.0,
"step": 4945
},
{
"entropy": 5.825801277160645,
"epoch": 0.4158790170132325,
"grad_norm": 0.984375,
"learning_rate": 0.0004987580178201492,
"loss": 5.8176,
"mean_token_accuracy": 0.15566431283950805,
"num_tokens": 9122718.0,
"step": 4950
},
{
"entropy": 5.824101305007934,
"epoch": 0.4162990968283974,
"grad_norm": 1.171875,
"learning_rate": 0.0004987548736799882,
"loss": 5.8499,
"mean_token_accuracy": 0.1412522651255131,
"num_tokens": 9131855.0,
"step": 4955
},
{
"entropy": 5.840525722503662,
"epoch": 0.4167191766435623,
"grad_norm": 1.109375,
"learning_rate": 0.0004987517255761248,
"loss": 5.6908,
"mean_token_accuracy": 0.15148798674345015,
"num_tokens": 9141102.0,
"step": 4960
},
{
"entropy": 5.813421249389648,
"epoch": 0.4171392564587272,
"grad_norm": 1.140625,
"learning_rate": 0.0004987485735086148,
"loss": 5.7415,
"mean_token_accuracy": 0.14812534004449845,
"num_tokens": 9150552.0,
"step": 4965
},
{
"entropy": 5.857693719863891,
"epoch": 0.417559336273892,
"grad_norm": 1.09375,
"learning_rate": 0.000498745417477514,
"loss": 5.7528,
"mean_token_accuracy": 0.14549532830715178,
"num_tokens": 9160105.0,
"step": 4970
},
{
"entropy": 5.779812955856324,
"epoch": 0.4179794160890569,
"grad_norm": 1.078125,
"learning_rate": 0.0004987422574828784,
"loss": 5.7475,
"mean_token_accuracy": 0.14596971422433852,
"num_tokens": 9169367.0,
"step": 4975
},
{
"entropy": 5.743670463562012,
"epoch": 0.4183994959042218,
"grad_norm": 1.1640625,
"learning_rate": 0.0004987390935247639,
"loss": 5.6307,
"mean_token_accuracy": 0.15040962547063827,
"num_tokens": 9177872.0,
"step": 4980
},
{
"entropy": 5.8342043399810795,
"epoch": 0.41881957571938666,
"grad_norm": 1.2734375,
"learning_rate": 0.0004987359256032265,
"loss": 5.8324,
"mean_token_accuracy": 0.13960791900753974,
"num_tokens": 9187879.0,
"step": 4985
},
{
"entropy": 5.849059009552002,
"epoch": 0.41923965553455156,
"grad_norm": 1.0390625,
"learning_rate": 0.0004987327537183225,
"loss": 5.7467,
"mean_token_accuracy": 0.1469127669930458,
"num_tokens": 9198281.0,
"step": 4990
},
{
"entropy": 5.84087119102478,
"epoch": 0.41965973534971646,
"grad_norm": 1.0234375,
"learning_rate": 0.0004987295778701078,
"loss": 5.7409,
"mean_token_accuracy": 0.1467703640460968,
"num_tokens": 9207670.0,
"step": 4995
},
{
"entropy": 5.875102853775024,
"epoch": 0.42007981516488135,
"grad_norm": 1.140625,
"learning_rate": 0.000498726398058639,
"loss": 5.7467,
"mean_token_accuracy": 0.15389208644628524,
"num_tokens": 9216995.0,
"step": 5000
},
{
"entropy": 5.806438732147217,
"epoch": 0.4204998949800462,
"grad_norm": 1.015625,
"learning_rate": 0.0004987232142839723,
"loss": 5.8226,
"mean_token_accuracy": 0.14448345750570296,
"num_tokens": 9227330.0,
"step": 5005
},
{
"entropy": 5.8239421367645265,
"epoch": 0.4209199747952111,
"grad_norm": 1.1484375,
"learning_rate": 0.0004987200265461638,
"loss": 5.7538,
"mean_token_accuracy": 0.15376091599464417,
"num_tokens": 9236666.0,
"step": 5010
},
{
"entropy": 5.837996959686279,
"epoch": 0.421340054610376,
"grad_norm": 1.109375,
"learning_rate": 0.0004987168348452705,
"loss": 5.7491,
"mean_token_accuracy": 0.14361261576414108,
"num_tokens": 9246388.0,
"step": 5015
},
{
"entropy": 5.79644136428833,
"epoch": 0.42176013442554083,
"grad_norm": 1.0390625,
"learning_rate": 0.0004987136391813485,
"loss": 5.7039,
"mean_token_accuracy": 0.15708659887313842,
"num_tokens": 9255239.0,
"step": 5020
},
{
"entropy": 5.7528785228729244,
"epoch": 0.42218021424070573,
"grad_norm": 1.1015625,
"learning_rate": 0.0004987104395544547,
"loss": 5.6738,
"mean_token_accuracy": 0.14564956724643707,
"num_tokens": 9264468.0,
"step": 5025
},
{
"entropy": 5.801192283630371,
"epoch": 0.42260029405587063,
"grad_norm": 1.1171875,
"learning_rate": 0.0004987072359646455,
"loss": 5.7502,
"mean_token_accuracy": 0.15197303295135497,
"num_tokens": 9274140.0,
"step": 5030
},
{
"entropy": 5.8728814125061035,
"epoch": 0.42302037387103547,
"grad_norm": 1.078125,
"learning_rate": 0.0004987040284119778,
"loss": 5.7389,
"mean_token_accuracy": 0.14780100136995317,
"num_tokens": 9283539.0,
"step": 5035
},
{
"entropy": 5.707630681991577,
"epoch": 0.42344045368620037,
"grad_norm": 1.125,
"learning_rate": 0.0004987008168965087,
"loss": 5.7434,
"mean_token_accuracy": 0.14961520582437515,
"num_tokens": 9292664.0,
"step": 5040
},
{
"entropy": 5.886917591094971,
"epoch": 0.42386053350136527,
"grad_norm": 1.0859375,
"learning_rate": 0.0004986976014182946,
"loss": 5.8367,
"mean_token_accuracy": 0.1453338533639908,
"num_tokens": 9302814.0,
"step": 5045
},
{
"entropy": 5.905004692077637,
"epoch": 0.42428061331653016,
"grad_norm": 1.1484375,
"learning_rate": 0.0004986943819773927,
"loss": 5.8207,
"mean_token_accuracy": 0.15036008954048158,
"num_tokens": 9312654.0,
"step": 5050
},
{
"entropy": 5.895126533508301,
"epoch": 0.424700693131695,
"grad_norm": 1.0703125,
"learning_rate": 0.00049869115857386,
"loss": 5.8318,
"mean_token_accuracy": 0.14068375900387764,
"num_tokens": 9322271.0,
"step": 5055
},
{
"entropy": 5.920522689819336,
"epoch": 0.4251207729468599,
"grad_norm": 1.03125,
"learning_rate": 0.0004986879312077536,
"loss": 5.7907,
"mean_token_accuracy": 0.14867359772324562,
"num_tokens": 9331341.0,
"step": 5060
},
{
"entropy": 5.782687711715698,
"epoch": 0.4255408527620248,
"grad_norm": 1.1875,
"learning_rate": 0.0004986846998791308,
"loss": 5.7009,
"mean_token_accuracy": 0.1479552812874317,
"num_tokens": 9339863.0,
"step": 5065
},
{
"entropy": 5.761766624450684,
"epoch": 0.42596093257718964,
"grad_norm": 1.15625,
"learning_rate": 0.0004986814645880485,
"loss": 5.6913,
"mean_token_accuracy": 0.1528707779943943,
"num_tokens": 9349488.0,
"step": 5070
},
{
"entropy": 5.753648185729981,
"epoch": 0.42638101239235454,
"grad_norm": 1.0546875,
"learning_rate": 0.0004986782253345645,
"loss": 5.7015,
"mean_token_accuracy": 0.14424998313188553,
"num_tokens": 9357977.0,
"step": 5075
},
{
"entropy": 5.822470092773438,
"epoch": 0.42680109220751944,
"grad_norm": 1.078125,
"learning_rate": 0.0004986749821187358,
"loss": 5.8156,
"mean_token_accuracy": 0.1453898549079895,
"num_tokens": 9367449.0,
"step": 5080
},
{
"entropy": 5.918146848678589,
"epoch": 0.42722117202268434,
"grad_norm": 1.1953125,
"learning_rate": 0.00049867173494062,
"loss": 5.8163,
"mean_token_accuracy": 0.14584119468927384,
"num_tokens": 9377070.0,
"step": 5085
},
{
"entropy": 5.796223783493042,
"epoch": 0.4276412518378492,
"grad_norm": 1.140625,
"learning_rate": 0.0004986684838002744,
"loss": 5.6173,
"mean_token_accuracy": 0.14928414672613144,
"num_tokens": 9385881.0,
"step": 5090
},
{
"entropy": 5.753831768035889,
"epoch": 0.4280613316530141,
"grad_norm": 1.0859375,
"learning_rate": 0.0004986652286977569,
"loss": 5.7468,
"mean_token_accuracy": 0.14418202489614487,
"num_tokens": 9395159.0,
"step": 5095
},
{
"entropy": 5.829101705551148,
"epoch": 0.428481411468179,
"grad_norm": 1.046875,
"learning_rate": 0.0004986619696331252,
"loss": 5.6932,
"mean_token_accuracy": 0.14958384707570077,
"num_tokens": 9404590.0,
"step": 5100
},
{
"entropy": 5.858063268661499,
"epoch": 0.4289014912833438,
"grad_norm": 1.03125,
"learning_rate": 0.0004986587066064367,
"loss": 5.726,
"mean_token_accuracy": 0.15162373185157776,
"num_tokens": 9414452.0,
"step": 5105
},
{
"entropy": 5.838721084594726,
"epoch": 0.4293215710985087,
"grad_norm": 1.1484375,
"learning_rate": 0.0004986554396177494,
"loss": 5.8629,
"mean_token_accuracy": 0.14060329645872116,
"num_tokens": 9424004.0,
"step": 5110
},
{
"entropy": 5.8798810005187985,
"epoch": 0.4297416509136736,
"grad_norm": 1.0390625,
"learning_rate": 0.0004986521686671212,
"loss": 5.7375,
"mean_token_accuracy": 0.15874141305685044,
"num_tokens": 9433487.0,
"step": 5115
},
{
"entropy": 5.820468187332153,
"epoch": 0.43016173072883845,
"grad_norm": 1.203125,
"learning_rate": 0.00049864889375461,
"loss": 5.8045,
"mean_token_accuracy": 0.14226291850209236,
"num_tokens": 9442742.0,
"step": 5120
},
{
"entropy": 5.823851108551025,
"epoch": 0.43058181054400335,
"grad_norm": 1.0703125,
"learning_rate": 0.0004986456148802738,
"loss": 5.8512,
"mean_token_accuracy": 0.1478380650281906,
"num_tokens": 9452550.0,
"step": 5125
},
{
"entropy": 5.949095916748047,
"epoch": 0.43100189035916825,
"grad_norm": 1.1015625,
"learning_rate": 0.0004986423320441707,
"loss": 5.8028,
"mean_token_accuracy": 0.1422652445733547,
"num_tokens": 9461920.0,
"step": 5130
},
{
"entropy": 5.89138822555542,
"epoch": 0.43142197017433315,
"grad_norm": 1.171875,
"learning_rate": 0.0004986390452463588,
"loss": 5.7321,
"mean_token_accuracy": 0.1446138709783554,
"num_tokens": 9470817.0,
"step": 5135
},
{
"entropy": 5.692741823196411,
"epoch": 0.431842049989498,
"grad_norm": 1.1484375,
"learning_rate": 0.0004986357544868964,
"loss": 5.6781,
"mean_token_accuracy": 0.1526497036218643,
"num_tokens": 9479936.0,
"step": 5140
},
{
"entropy": 5.840050792694091,
"epoch": 0.4322621298046629,
"grad_norm": 1.15625,
"learning_rate": 0.0004986324597658418,
"loss": 5.7111,
"mean_token_accuracy": 0.15329033583402635,
"num_tokens": 9489818.0,
"step": 5145
},
{
"entropy": 5.679720258712768,
"epoch": 0.4326822096198278,
"grad_norm": 1.09375,
"learning_rate": 0.0004986291610832533,
"loss": 5.7152,
"mean_token_accuracy": 0.14663320183753967,
"num_tokens": 9499688.0,
"step": 5150
},
{
"entropy": 5.932486724853516,
"epoch": 0.4331022894349926,
"grad_norm": 1.15625,
"learning_rate": 0.0004986258584391892,
"loss": 5.7743,
"mean_token_accuracy": 0.14935381412506105,
"num_tokens": 9509581.0,
"step": 5155
},
{
"entropy": 5.936005020141602,
"epoch": 0.4335223692501575,
"grad_norm": 1.1171875,
"learning_rate": 0.0004986225518337084,
"loss": 5.8499,
"mean_token_accuracy": 0.1452954113483429,
"num_tokens": 9518556.0,
"step": 5160
},
{
"entropy": 5.727261352539062,
"epoch": 0.4339424490653224,
"grad_norm": 1.0234375,
"learning_rate": 0.0004986192412668692,
"loss": 5.7533,
"mean_token_accuracy": 0.1502923622727394,
"num_tokens": 9527612.0,
"step": 5165
},
{
"entropy": 5.777105236053467,
"epoch": 0.4343625288804873,
"grad_norm": 1.2109375,
"learning_rate": 0.0004986159267387302,
"loss": 5.6485,
"mean_token_accuracy": 0.15689545422792434,
"num_tokens": 9535882.0,
"step": 5170
},
{
"entropy": 5.798953247070313,
"epoch": 0.43478260869565216,
"grad_norm": 1.0859375,
"learning_rate": 0.0004986126082493502,
"loss": 5.7425,
"mean_token_accuracy": 0.15121322870254517,
"num_tokens": 9544799.0,
"step": 5175
},
{
"entropy": 5.757265424728393,
"epoch": 0.43520268851081706,
"grad_norm": 1.03125,
"learning_rate": 0.0004986092857987881,
"loss": 5.6541,
"mean_token_accuracy": 0.15490222424268724,
"num_tokens": 9553805.0,
"step": 5180
},
{
"entropy": 5.7829601764678955,
"epoch": 0.43562276832598196,
"grad_norm": 1.1015625,
"learning_rate": 0.0004986059593871026,
"loss": 5.6988,
"mean_token_accuracy": 0.1475805327296257,
"num_tokens": 9563493.0,
"step": 5185
},
{
"entropy": 5.794771003723144,
"epoch": 0.4360428481411468,
"grad_norm": 1.09375,
"learning_rate": 0.0004986026290143527,
"loss": 5.7871,
"mean_token_accuracy": 0.1461959660053253,
"num_tokens": 9572297.0,
"step": 5190
},
{
"entropy": 5.942814207077026,
"epoch": 0.4364629279563117,
"grad_norm": 1.2265625,
"learning_rate": 0.0004985992946805973,
"loss": 5.8966,
"mean_token_accuracy": 0.1419041596353054,
"num_tokens": 9581967.0,
"step": 5195
},
{
"entropy": 5.875198268890381,
"epoch": 0.4368830077714766,
"grad_norm": 1.0078125,
"learning_rate": 0.0004985959563858955,
"loss": 5.8092,
"mean_token_accuracy": 0.149876207113266,
"num_tokens": 9590885.0,
"step": 5200
},
{
"entropy": 5.878464889526367,
"epoch": 0.43730308758664144,
"grad_norm": 1.1640625,
"learning_rate": 0.0004985926141303066,
"loss": 5.7457,
"mean_token_accuracy": 0.14725576043128968,
"num_tokens": 9599247.0,
"step": 5205
},
{
"entropy": 5.793534898757935,
"epoch": 0.43772316740180633,
"grad_norm": 1.2421875,
"learning_rate": 0.0004985892679138896,
"loss": 5.6742,
"mean_token_accuracy": 0.15613210648298265,
"num_tokens": 9608296.0,
"step": 5210
},
{
"entropy": 5.832571029663086,
"epoch": 0.43814324721697123,
"grad_norm": 1.2421875,
"learning_rate": 0.0004985859177367038,
"loss": 5.7225,
"mean_token_accuracy": 0.1492369145154953,
"num_tokens": 9616734.0,
"step": 5215
},
{
"entropy": 5.872725296020508,
"epoch": 0.43856332703213613,
"grad_norm": 1.1171875,
"learning_rate": 0.0004985825635988087,
"loss": 5.8159,
"mean_token_accuracy": 0.14337899088859557,
"num_tokens": 9626246.0,
"step": 5220
},
{
"entropy": 5.748163080215454,
"epoch": 0.43898340684730097,
"grad_norm": 1.1875,
"learning_rate": 0.0004985792055002635,
"loss": 5.662,
"mean_token_accuracy": 0.1514718361198902,
"num_tokens": 9634963.0,
"step": 5225
},
{
"entropy": 5.872535848617554,
"epoch": 0.43940348666246587,
"grad_norm": 1.2109375,
"learning_rate": 0.0004985758434411278,
"loss": 5.7475,
"mean_token_accuracy": 0.15245552510023117,
"num_tokens": 9643615.0,
"step": 5230
},
{
"entropy": 5.807539796829223,
"epoch": 0.43982356647763077,
"grad_norm": 1.1328125,
"learning_rate": 0.0004985724774214613,
"loss": 5.7091,
"mean_token_accuracy": 0.1469217024743557,
"num_tokens": 9653306.0,
"step": 5235
},
{
"entropy": 5.8017144203186035,
"epoch": 0.4402436462927956,
"grad_norm": 1.171875,
"learning_rate": 0.0004985691074413233,
"loss": 5.7408,
"mean_token_accuracy": 0.1475864641368389,
"num_tokens": 9662389.0,
"step": 5240
},
{
"entropy": 5.795043849945069,
"epoch": 0.4406637261079605,
"grad_norm": 1.109375,
"learning_rate": 0.0004985657335007739,
"loss": 5.7409,
"mean_token_accuracy": 0.14567713364958762,
"num_tokens": 9671183.0,
"step": 5245
},
{
"entropy": 5.813223123550415,
"epoch": 0.4410838059231254,
"grad_norm": 1.0546875,
"learning_rate": 0.0004985623555998725,
"loss": 5.7139,
"mean_token_accuracy": 0.15571152418851852,
"num_tokens": 9680544.0,
"step": 5250
},
{
"entropy": 5.806698322296143,
"epoch": 0.4415038857382903,
"grad_norm": 1.1796875,
"learning_rate": 0.0004985589737386791,
"loss": 5.7474,
"mean_token_accuracy": 0.14870114251971245,
"num_tokens": 9690137.0,
"step": 5255
},
{
"entropy": 5.791662406921387,
"epoch": 0.44192396555345514,
"grad_norm": 1.125,
"learning_rate": 0.0004985555879172535,
"loss": 5.6861,
"mean_token_accuracy": 0.15161058008670808,
"num_tokens": 9699149.0,
"step": 5260
},
{
"entropy": 5.812884378433227,
"epoch": 0.44234404536862004,
"grad_norm": 1.0859375,
"learning_rate": 0.000498552198135656,
"loss": 5.763,
"mean_token_accuracy": 0.15164322704076766,
"num_tokens": 9709308.0,
"step": 5265
},
{
"entropy": 5.770615005493164,
"epoch": 0.44276412518378494,
"grad_norm": 1.375,
"learning_rate": 0.0004985488043939462,
"loss": 5.7076,
"mean_token_accuracy": 0.1522655814886093,
"num_tokens": 9718462.0,
"step": 5270
},
{
"entropy": 5.8054506301879885,
"epoch": 0.4431842049989498,
"grad_norm": 1.1015625,
"learning_rate": 0.0004985454066921846,
"loss": 5.64,
"mean_token_accuracy": 0.14959139227867127,
"num_tokens": 9727626.0,
"step": 5275
},
{
"entropy": 5.665741491317749,
"epoch": 0.4436042848141147,
"grad_norm": 1.1171875,
"learning_rate": 0.0004985420050304312,
"loss": 5.6701,
"mean_token_accuracy": 0.14974095672369003,
"num_tokens": 9737091.0,
"step": 5280
},
{
"entropy": 5.7471333026885985,
"epoch": 0.4440243646292796,
"grad_norm": 1.1875,
"learning_rate": 0.0004985385994087462,
"loss": 5.7404,
"mean_token_accuracy": 0.14873735308647157,
"num_tokens": 9746135.0,
"step": 5285
},
{
"entropy": 5.838712120056153,
"epoch": 0.4444444444444444,
"grad_norm": 1.2578125,
"learning_rate": 0.0004985351898271901,
"loss": 5.6731,
"mean_token_accuracy": 0.15403366982936859,
"num_tokens": 9754549.0,
"step": 5290
},
{
"entropy": 5.828793716430664,
"epoch": 0.4448645242596093,
"grad_norm": 1.140625,
"learning_rate": 0.0004985317762858231,
"loss": 5.8134,
"mean_token_accuracy": 0.1410953238606453,
"num_tokens": 9764219.0,
"step": 5295
},
{
"entropy": 5.79041018486023,
"epoch": 0.4452846040747742,
"grad_norm": 1.125,
"learning_rate": 0.000498528358784706,
"loss": 5.653,
"mean_token_accuracy": 0.15147143453359604,
"num_tokens": 9772234.0,
"step": 5300
},
{
"entropy": 5.761803770065308,
"epoch": 0.4457046838899391,
"grad_norm": 1.1015625,
"learning_rate": 0.000498524937323899,
"loss": 5.7143,
"mean_token_accuracy": 0.15045253187417984,
"num_tokens": 9781417.0,
"step": 5305
},
{
"entropy": 5.912419557571411,
"epoch": 0.44612476370510395,
"grad_norm": 1.1015625,
"learning_rate": 0.0004985215119034628,
"loss": 5.83,
"mean_token_accuracy": 0.13866035491228104,
"num_tokens": 9791286.0,
"step": 5310
},
{
"entropy": 5.829041481018066,
"epoch": 0.44654484352026885,
"grad_norm": 1.1953125,
"learning_rate": 0.0004985180825234582,
"loss": 5.8266,
"mean_token_accuracy": 0.14558245986700058,
"num_tokens": 9802157.0,
"step": 5315
},
{
"entropy": 5.8994176387786865,
"epoch": 0.44696492333543375,
"grad_norm": 1.09375,
"learning_rate": 0.0004985146491839459,
"loss": 5.8102,
"mean_token_accuracy": 0.13574059829115867,
"num_tokens": 9812646.0,
"step": 5320
},
{
"entropy": 5.954456853866577,
"epoch": 0.4473850031505986,
"grad_norm": 1.1953125,
"learning_rate": 0.0004985112118849865,
"loss": 5.819,
"mean_token_accuracy": 0.14093976765871047,
"num_tokens": 9822274.0,
"step": 5325
},
{
"entropy": 5.737771701812744,
"epoch": 0.4478050829657635,
"grad_norm": 1.15625,
"learning_rate": 0.0004985077706266412,
"loss": 5.6257,
"mean_token_accuracy": 0.14395610615611076,
"num_tokens": 9831337.0,
"step": 5330
},
{
"entropy": 5.7339942932128904,
"epoch": 0.4482251627809284,
"grad_norm": 1.078125,
"learning_rate": 0.0004985043254089708,
"loss": 5.7586,
"mean_token_accuracy": 0.1422972373664379,
"num_tokens": 9840798.0,
"step": 5335
},
{
"entropy": 5.747358274459839,
"epoch": 0.44864524259609323,
"grad_norm": 1.0703125,
"learning_rate": 0.0004985008762320364,
"loss": 5.7263,
"mean_token_accuracy": 0.14806412309408187,
"num_tokens": 9850117.0,
"step": 5340
},
{
"entropy": 5.791873502731323,
"epoch": 0.4490653224112581,
"grad_norm": 1.1640625,
"learning_rate": 0.000498497423095899,
"loss": 5.6724,
"mean_token_accuracy": 0.154633791744709,
"num_tokens": 9858227.0,
"step": 5345
},
{
"entropy": 5.7806925773620605,
"epoch": 0.449485402226423,
"grad_norm": 1.0703125,
"learning_rate": 0.0004984939660006199,
"loss": 5.7529,
"mean_token_accuracy": 0.14604211449623108,
"num_tokens": 9867157.0,
"step": 5350
},
{
"entropy": 5.769717884063721,
"epoch": 0.4499054820415879,
"grad_norm": 1.140625,
"learning_rate": 0.0004984905049462602,
"loss": 5.6816,
"mean_token_accuracy": 0.14660602807998657,
"num_tokens": 9877045.0,
"step": 5355
},
{
"entropy": 5.902087068557739,
"epoch": 0.45032556185675277,
"grad_norm": 1.1484375,
"learning_rate": 0.0004984870399328814,
"loss": 5.8208,
"mean_token_accuracy": 0.14698289036750795,
"num_tokens": 9886637.0,
"step": 5360
},
{
"entropy": 5.751916408538818,
"epoch": 0.45074564167191766,
"grad_norm": 1.171875,
"learning_rate": 0.0004984835709605446,
"loss": 5.691,
"mean_token_accuracy": 0.1520322397351265,
"num_tokens": 9895601.0,
"step": 5365
},
{
"entropy": 5.830451345443725,
"epoch": 0.45116572148708256,
"grad_norm": 1.1875,
"learning_rate": 0.0004984800980293116,
"loss": 5.8414,
"mean_token_accuracy": 0.14870392084121703,
"num_tokens": 9904775.0,
"step": 5370
},
{
"entropy": 5.830691766738892,
"epoch": 0.4515858013022474,
"grad_norm": 1.1875,
"learning_rate": 0.0004984766211392435,
"loss": 5.7698,
"mean_token_accuracy": 0.14393762201070787,
"num_tokens": 9913795.0,
"step": 5375
},
{
"entropy": 5.805560731887818,
"epoch": 0.4520058811174123,
"grad_norm": 1.09375,
"learning_rate": 0.0004984731402904024,
"loss": 5.6142,
"mean_token_accuracy": 0.15417981371283532,
"num_tokens": 9922576.0,
"step": 5380
},
{
"entropy": 5.739026212692261,
"epoch": 0.4524259609325772,
"grad_norm": 1.125,
"learning_rate": 0.0004984696554828496,
"loss": 5.5979,
"mean_token_accuracy": 0.1531780928373337,
"num_tokens": 9930971.0,
"step": 5385
},
{
"entropy": 5.801077175140381,
"epoch": 0.4528460407477421,
"grad_norm": 1.25,
"learning_rate": 0.0004984661667166468,
"loss": 5.716,
"mean_token_accuracy": 0.15592064559459687,
"num_tokens": 9939628.0,
"step": 5390
},
{
"entropy": 5.807399845123291,
"epoch": 0.45326612056290694,
"grad_norm": 1.078125,
"learning_rate": 0.0004984626739918561,
"loss": 5.6836,
"mean_token_accuracy": 0.15975240394473075,
"num_tokens": 9948397.0,
"step": 5395
},
{
"entropy": 5.839657735824585,
"epoch": 0.45368620037807184,
"grad_norm": 1.0234375,
"learning_rate": 0.0004984591773085391,
"loss": 5.7721,
"mean_token_accuracy": 0.15158986151218415,
"num_tokens": 9957683.0,
"step": 5400
},
{
"entropy": 5.840809059143067,
"epoch": 0.45410628019323673,
"grad_norm": 1.078125,
"learning_rate": 0.0004984556766667578,
"loss": 5.7469,
"mean_token_accuracy": 0.14915718138217926,
"num_tokens": 9966756.0,
"step": 5405
},
{
"entropy": 5.824033927917481,
"epoch": 0.4545263600084016,
"grad_norm": 1.1171875,
"learning_rate": 0.0004984521720665743,
"loss": 5.7479,
"mean_token_accuracy": 0.14859457165002823,
"num_tokens": 9976000.0,
"step": 5410
},
{
"entropy": 5.868662643432617,
"epoch": 0.4549464398235665,
"grad_norm": 1.078125,
"learning_rate": 0.0004984486635080507,
"loss": 5.7537,
"mean_token_accuracy": 0.1539094254374504,
"num_tokens": 9985509.0,
"step": 5415
},
{
"entropy": 5.7345482349395756,
"epoch": 0.45536651963873137,
"grad_norm": 1.1640625,
"learning_rate": 0.0004984451509912489,
"loss": 5.7046,
"mean_token_accuracy": 0.1475559502840042,
"num_tokens": 9994342.0,
"step": 5420
},
{
"entropy": 5.76420407295227,
"epoch": 0.4557865994538962,
"grad_norm": 1.1484375,
"learning_rate": 0.0004984416345162315,
"loss": 5.7417,
"mean_token_accuracy": 0.1464938871562481,
"num_tokens": 10004249.0,
"step": 5425
},
{
"entropy": 5.827924537658691,
"epoch": 0.4562066792690611,
"grad_norm": 1.2265625,
"learning_rate": 0.0004984381140830605,
"loss": 5.7002,
"mean_token_accuracy": 0.1514168582856655,
"num_tokens": 10012430.0,
"step": 5430
},
{
"entropy": 5.84401159286499,
"epoch": 0.456626759084226,
"grad_norm": 1.1484375,
"learning_rate": 0.0004984345896917984,
"loss": 5.7136,
"mean_token_accuracy": 0.14923455715179443,
"num_tokens": 10021434.0,
"step": 5435
},
{
"entropy": 5.7987868309021,
"epoch": 0.4570468388993909,
"grad_norm": 1.1953125,
"learning_rate": 0.0004984310613425076,
"loss": 5.7086,
"mean_token_accuracy": 0.14991628527641296,
"num_tokens": 10030473.0,
"step": 5440
},
{
"entropy": 5.8427849292755125,
"epoch": 0.45746691871455575,
"grad_norm": 1.2578125,
"learning_rate": 0.0004984275290352506,
"loss": 5.6998,
"mean_token_accuracy": 0.15363647043704987,
"num_tokens": 10039057.0,
"step": 5445
},
{
"entropy": 5.887651014328003,
"epoch": 0.45788699852972065,
"grad_norm": 1.1171875,
"learning_rate": 0.0004984239927700899,
"loss": 5.7915,
"mean_token_accuracy": 0.14603089392185212,
"num_tokens": 10047998.0,
"step": 5450
},
{
"entropy": 5.905976247787476,
"epoch": 0.45830707834488554,
"grad_norm": 1.15625,
"learning_rate": 0.0004984204525470883,
"loss": 5.7109,
"mean_token_accuracy": 0.14362944588065146,
"num_tokens": 10057479.0,
"step": 5455
},
{
"entropy": 5.703067064285278,
"epoch": 0.4587271581600504,
"grad_norm": 1.1171875,
"learning_rate": 0.0004984169083663084,
"loss": 5.6963,
"mean_token_accuracy": 0.14525847285985946,
"num_tokens": 10067754.0,
"step": 5460
},
{
"entropy": 5.800664043426513,
"epoch": 0.4591472379752153,
"grad_norm": 1.015625,
"learning_rate": 0.0004984133602278129,
"loss": 5.7683,
"mean_token_accuracy": 0.1455842971801758,
"num_tokens": 10076815.0,
"step": 5465
},
{
"entropy": 5.94423246383667,
"epoch": 0.4595673177903802,
"grad_norm": 1.1171875,
"learning_rate": 0.000498409808131665,
"loss": 5.7683,
"mean_token_accuracy": 0.15147839486598969,
"num_tokens": 10086300.0,
"step": 5470
},
{
"entropy": 5.747309446334839,
"epoch": 0.4599873976055451,
"grad_norm": 1.046875,
"learning_rate": 0.0004984062520779272,
"loss": 5.6883,
"mean_token_accuracy": 0.1561483383178711,
"num_tokens": 10095383.0,
"step": 5475
},
{
"entropy": 5.73441481590271,
"epoch": 0.4604074774207099,
"grad_norm": 1.0859375,
"learning_rate": 0.0004984026920666628,
"loss": 5.6559,
"mean_token_accuracy": 0.15058314353227614,
"num_tokens": 10103971.0,
"step": 5480
},
{
"entropy": 5.7511381149292,
"epoch": 0.4608275572358748,
"grad_norm": 1.0859375,
"learning_rate": 0.0004983991280979347,
"loss": 5.6601,
"mean_token_accuracy": 0.1518106997013092,
"num_tokens": 10113028.0,
"step": 5485
},
{
"entropy": 5.799294757843017,
"epoch": 0.4612476370510397,
"grad_norm": 1.140625,
"learning_rate": 0.0004983955601718061,
"loss": 5.6456,
"mean_token_accuracy": 0.14627908915281296,
"num_tokens": 10121890.0,
"step": 5490
},
{
"entropy": 5.870154333114624,
"epoch": 0.46166771686620456,
"grad_norm": 1.1484375,
"learning_rate": 0.0004983919882883401,
"loss": 5.761,
"mean_token_accuracy": 0.14976027309894563,
"num_tokens": 10131655.0,
"step": 5495
},
{
"entropy": 5.851486492156982,
"epoch": 0.46208779668136946,
"grad_norm": 1.15625,
"learning_rate": 0.0004983884124476,
"loss": 5.7597,
"mean_token_accuracy": 0.1464255526661873,
"num_tokens": 10140778.0,
"step": 5500
},
{
"entropy": 5.7697046279907225,
"epoch": 0.46250787649653435,
"grad_norm": 1.140625,
"learning_rate": 0.0004983848326496494,
"loss": 5.8037,
"mean_token_accuracy": 0.14742077738046647,
"num_tokens": 10150229.0,
"step": 5505
},
{
"entropy": 5.8817291259765625,
"epoch": 0.4629279563116992,
"grad_norm": 1.1484375,
"learning_rate": 0.0004983812488945513,
"loss": 5.7155,
"mean_token_accuracy": 0.1507673218846321,
"num_tokens": 10158939.0,
"step": 5510
},
{
"entropy": 5.805754947662353,
"epoch": 0.4633480361268641,
"grad_norm": 1.1875,
"learning_rate": 0.0004983776611823696,
"loss": 5.7042,
"mean_token_accuracy": 0.14701770842075348,
"num_tokens": 10168383.0,
"step": 5515
},
{
"entropy": 5.7591667652130125,
"epoch": 0.463768115942029,
"grad_norm": 1.171875,
"learning_rate": 0.0004983740695131676,
"loss": 5.7041,
"mean_token_accuracy": 0.15164369344711304,
"num_tokens": 10178678.0,
"step": 5520
},
{
"entropy": 5.810311889648437,
"epoch": 0.4641881957571939,
"grad_norm": 1.1015625,
"learning_rate": 0.000498370473887009,
"loss": 5.7131,
"mean_token_accuracy": 0.1485781654715538,
"num_tokens": 10188964.0,
"step": 5525
},
{
"entropy": 5.844456338882447,
"epoch": 0.46460827557235873,
"grad_norm": 1.2109375,
"learning_rate": 0.0004983668743039573,
"loss": 5.7064,
"mean_token_accuracy": 0.15294503271579743,
"num_tokens": 10198333.0,
"step": 5530
},
{
"entropy": 5.732112169265747,
"epoch": 0.46502835538752363,
"grad_norm": 1.203125,
"learning_rate": 0.0004983632707640766,
"loss": 5.741,
"mean_token_accuracy": 0.1478828214108944,
"num_tokens": 10207876.0,
"step": 5535
},
{
"entropy": 5.762799310684204,
"epoch": 0.4654484352026885,
"grad_norm": 1.109375,
"learning_rate": 0.0004983596632674306,
"loss": 5.6777,
"mean_token_accuracy": 0.1528876930475235,
"num_tokens": 10216822.0,
"step": 5540
},
{
"entropy": 5.89079327583313,
"epoch": 0.46586851501785337,
"grad_norm": 1.046875,
"learning_rate": 0.0004983560518140831,
"loss": 5.8051,
"mean_token_accuracy": 0.14096633195877076,
"num_tokens": 10226887.0,
"step": 5545
},
{
"entropy": 5.8007360935211185,
"epoch": 0.46628859483301827,
"grad_norm": 1.140625,
"learning_rate": 0.0004983524364040982,
"loss": 5.6429,
"mean_token_accuracy": 0.154203137755394,
"num_tokens": 10235935.0,
"step": 5550
},
{
"entropy": 5.785525369644165,
"epoch": 0.46670867464818316,
"grad_norm": 1.0703125,
"learning_rate": 0.0004983488170375399,
"loss": 5.6031,
"mean_token_accuracy": 0.15314862504601479,
"num_tokens": 10245590.0,
"step": 5555
},
{
"entropy": 5.682972955703735,
"epoch": 0.46712875446334806,
"grad_norm": 1.203125,
"learning_rate": 0.0004983451937144723,
"loss": 5.6887,
"mean_token_accuracy": 0.14980672895908356,
"num_tokens": 10255104.0,
"step": 5560
},
{
"entropy": 5.612952136993409,
"epoch": 0.4675488342785129,
"grad_norm": 1.078125,
"learning_rate": 0.0004983415664349595,
"loss": 5.5487,
"mean_token_accuracy": 0.1598246991634369,
"num_tokens": 10264236.0,
"step": 5565
},
{
"entropy": 5.829507875442505,
"epoch": 0.4679689140936778,
"grad_norm": 1.109375,
"learning_rate": 0.0004983379351990659,
"loss": 5.6626,
"mean_token_accuracy": 0.15338489711284636,
"num_tokens": 10273335.0,
"step": 5570
},
{
"entropy": 5.7265605449676515,
"epoch": 0.4683889939088427,
"grad_norm": 1.046875,
"learning_rate": 0.0004983343000068559,
"loss": 5.61,
"mean_token_accuracy": 0.15400451123714448,
"num_tokens": 10282206.0,
"step": 5575
},
{
"entropy": 5.658405828475952,
"epoch": 0.46880907372400754,
"grad_norm": 1.359375,
"learning_rate": 0.0004983306608583937,
"loss": 5.5786,
"mean_token_accuracy": 0.16081484854221345,
"num_tokens": 10290056.0,
"step": 5580
},
{
"entropy": 5.776408720016479,
"epoch": 0.46922915353917244,
"grad_norm": 1.1171875,
"learning_rate": 0.0004983270177537438,
"loss": 5.6587,
"mean_token_accuracy": 0.15047800838947295,
"num_tokens": 10299726.0,
"step": 5585
},
{
"entropy": 5.769086027145386,
"epoch": 0.46964923335433734,
"grad_norm": 1.2109375,
"learning_rate": 0.0004983233706929708,
"loss": 5.7277,
"mean_token_accuracy": 0.14831028133630753,
"num_tokens": 10308696.0,
"step": 5590
},
{
"entropy": 5.8635729312896725,
"epoch": 0.4700693131695022,
"grad_norm": 1.078125,
"learning_rate": 0.0004983197196761392,
"loss": 5.8092,
"mean_token_accuracy": 0.14269383996725082,
"num_tokens": 10317845.0,
"step": 5595
},
{
"entropy": 5.799219799041748,
"epoch": 0.4704893929846671,
"grad_norm": 1.1796875,
"learning_rate": 0.0004983160647033139,
"loss": 5.7013,
"mean_token_accuracy": 0.1549665093421936,
"num_tokens": 10326563.0,
"step": 5600
},
{
"entropy": 5.832771301269531,
"epoch": 0.470909472799832,
"grad_norm": 1.1015625,
"learning_rate": 0.0004983124057745595,
"loss": 5.6801,
"mean_token_accuracy": 0.14922733306884767,
"num_tokens": 10335931.0,
"step": 5605
},
{
"entropy": 5.690406656265258,
"epoch": 0.47132955261499687,
"grad_norm": 1.1484375,
"learning_rate": 0.0004983087428899408,
"loss": 5.6809,
"mean_token_accuracy": 0.14426365718245507,
"num_tokens": 10344984.0,
"step": 5610
},
{
"entropy": 5.79549012184143,
"epoch": 0.4717496324301617,
"grad_norm": 1.2734375,
"learning_rate": 0.0004983050760495227,
"loss": 5.709,
"mean_token_accuracy": 0.1500493675470352,
"num_tokens": 10353522.0,
"step": 5615
},
{
"entropy": 5.866241216659546,
"epoch": 0.4721697122453266,
"grad_norm": 1.4375,
"learning_rate": 0.0004983014052533702,
"loss": 5.7266,
"mean_token_accuracy": 0.15051157921552658,
"num_tokens": 10363527.0,
"step": 5620
},
{
"entropy": 5.768464851379394,
"epoch": 0.4725897920604915,
"grad_norm": 1.0390625,
"learning_rate": 0.0004982977305015481,
"loss": 5.6488,
"mean_token_accuracy": 0.1469082362949848,
"num_tokens": 10372040.0,
"step": 5625
},
{
"entropy": 5.791842842102051,
"epoch": 0.47300987187565635,
"grad_norm": 1.1640625,
"learning_rate": 0.0004982940517941219,
"loss": 5.6314,
"mean_token_accuracy": 0.14977569580078126,
"num_tokens": 10381279.0,
"step": 5630
},
{
"entropy": 5.833928823471069,
"epoch": 0.47342995169082125,
"grad_norm": 1.1484375,
"learning_rate": 0.0004982903691311564,
"loss": 5.8003,
"mean_token_accuracy": 0.14435067921876907,
"num_tokens": 10390608.0,
"step": 5635
},
{
"entropy": 5.762945175170898,
"epoch": 0.47385003150598615,
"grad_norm": 1.171875,
"learning_rate": 0.0004982866825127172,
"loss": 5.5751,
"mean_token_accuracy": 0.15733419954776764,
"num_tokens": 10399851.0,
"step": 5640
},
{
"entropy": 5.863043403625488,
"epoch": 0.47427011132115104,
"grad_norm": 1.140625,
"learning_rate": 0.0004982829919388692,
"loss": 5.876,
"mean_token_accuracy": 0.14255642220377923,
"num_tokens": 10410425.0,
"step": 5645
},
{
"entropy": 5.71831259727478,
"epoch": 0.4746901911363159,
"grad_norm": 1.25,
"learning_rate": 0.0004982792974096781,
"loss": 5.6364,
"mean_token_accuracy": 0.15793368965387344,
"num_tokens": 10418783.0,
"step": 5650
},
{
"entropy": 5.846996879577636,
"epoch": 0.4751102709514808,
"grad_norm": 1.296875,
"learning_rate": 0.000498275598925209,
"loss": 5.8087,
"mean_token_accuracy": 0.1399559460580349,
"num_tokens": 10427360.0,
"step": 5655
},
{
"entropy": 5.902381563186646,
"epoch": 0.4755303507666457,
"grad_norm": 1.15625,
"learning_rate": 0.0004982718964855277,
"loss": 5.7724,
"mean_token_accuracy": 0.1477045461535454,
"num_tokens": 10436613.0,
"step": 5660
},
{
"entropy": 5.831106662750244,
"epoch": 0.4759504305818105,
"grad_norm": 1.234375,
"learning_rate": 0.0004982681900907,
"loss": 5.8166,
"mean_token_accuracy": 0.14889360964298248,
"num_tokens": 10445055.0,
"step": 5665
},
{
"entropy": 5.788445329666137,
"epoch": 0.4763705103969754,
"grad_norm": 1.0234375,
"learning_rate": 0.000498264479740791,
"loss": 5.6235,
"mean_token_accuracy": 0.15863950699567794,
"num_tokens": 10454516.0,
"step": 5670
},
{
"entropy": 5.86628303527832,
"epoch": 0.4767905902121403,
"grad_norm": 1.0625,
"learning_rate": 0.0004982607654358668,
"loss": 5.7552,
"mean_token_accuracy": 0.1509106144309044,
"num_tokens": 10463771.0,
"step": 5675
},
{
"entropy": 5.709434938430786,
"epoch": 0.47721067002730516,
"grad_norm": 1.0546875,
"learning_rate": 0.000498257047175993,
"loss": 5.6956,
"mean_token_accuracy": 0.14843147546052932,
"num_tokens": 10473783.0,
"step": 5680
},
{
"entropy": 5.726885223388672,
"epoch": 0.47763074984247006,
"grad_norm": 1.140625,
"learning_rate": 0.0004982533249612357,
"loss": 5.6618,
"mean_token_accuracy": 0.1500977322459221,
"num_tokens": 10483424.0,
"step": 5685
},
{
"entropy": 5.7648450374603275,
"epoch": 0.47805082965763496,
"grad_norm": 1.0546875,
"learning_rate": 0.0004982495987916607,
"loss": 5.5966,
"mean_token_accuracy": 0.15635181814432145,
"num_tokens": 10492536.0,
"step": 5690
},
{
"entropy": 5.7842613697052006,
"epoch": 0.47847090947279985,
"grad_norm": 1.265625,
"learning_rate": 0.0004982458686673339,
"loss": 5.7034,
"mean_token_accuracy": 0.14793817102909088,
"num_tokens": 10501616.0,
"step": 5695
},
{
"entropy": 5.839844703674316,
"epoch": 0.4788909892879647,
"grad_norm": 1.21875,
"learning_rate": 0.0004982421345883217,
"loss": 5.7449,
"mean_token_accuracy": 0.147707599401474,
"num_tokens": 10511190.0,
"step": 5700
},
{
"entropy": 5.733095026016235,
"epoch": 0.4793110691031296,
"grad_norm": 1.203125,
"learning_rate": 0.0004982383965546898,
"loss": 5.6812,
"mean_token_accuracy": 0.14984251707792282,
"num_tokens": 10520310.0,
"step": 5705
},
{
"entropy": 5.840996932983399,
"epoch": 0.4797311489182945,
"grad_norm": 1.171875,
"learning_rate": 0.0004982346545665048,
"loss": 5.6688,
"mean_token_accuracy": 0.1559746041893959,
"num_tokens": 10528711.0,
"step": 5710
},
{
"entropy": 5.825336980819702,
"epoch": 0.48015122873345933,
"grad_norm": 1.1484375,
"learning_rate": 0.0004982309086238328,
"loss": 5.7566,
"mean_token_accuracy": 0.1434244692325592,
"num_tokens": 10538484.0,
"step": 5715
},
{
"entropy": 5.79931492805481,
"epoch": 0.48057130854862423,
"grad_norm": 1.109375,
"learning_rate": 0.0004982271587267403,
"loss": 5.7018,
"mean_token_accuracy": 0.14842950850725173,
"num_tokens": 10547623.0,
"step": 5720
},
{
"entropy": 5.766378021240234,
"epoch": 0.48099138836378913,
"grad_norm": 1.1484375,
"learning_rate": 0.0004982234048752935,
"loss": 5.6452,
"mean_token_accuracy": 0.15442409366369247,
"num_tokens": 10556234.0,
"step": 5725
},
{
"entropy": 5.838606309890747,
"epoch": 0.481411468178954,
"grad_norm": 1.1875,
"learning_rate": 0.000498219647069559,
"loss": 5.875,
"mean_token_accuracy": 0.14210818707942963,
"num_tokens": 10566308.0,
"step": 5730
},
{
"entropy": 5.902328968048096,
"epoch": 0.48183154799411887,
"grad_norm": 1.2109375,
"learning_rate": 0.0004982158853096035,
"loss": 5.8085,
"mean_token_accuracy": 0.1433735728263855,
"num_tokens": 10575212.0,
"step": 5735
},
{
"entropy": 5.795661878585816,
"epoch": 0.48225162780928377,
"grad_norm": 1.140625,
"learning_rate": 0.0004982121195954935,
"loss": 5.5791,
"mean_token_accuracy": 0.15854463130235671,
"num_tokens": 10584590.0,
"step": 5740
},
{
"entropy": 5.72941517829895,
"epoch": 0.48267170762444866,
"grad_norm": 1.1484375,
"learning_rate": 0.0004982083499272957,
"loss": 5.653,
"mean_token_accuracy": 0.15250627547502518,
"num_tokens": 10593997.0,
"step": 5745
},
{
"entropy": 5.777765226364136,
"epoch": 0.4830917874396135,
"grad_norm": 1.0625,
"learning_rate": 0.0004982045763050768,
"loss": 5.7716,
"mean_token_accuracy": 0.1511766344308853,
"num_tokens": 10603299.0,
"step": 5750
},
{
"entropy": 5.8095392227172855,
"epoch": 0.4835118672547784,
"grad_norm": 1.109375,
"learning_rate": 0.0004982007987289041,
"loss": 5.7159,
"mean_token_accuracy": 0.150531367957592,
"num_tokens": 10613546.0,
"step": 5755
},
{
"entropy": 5.776759195327759,
"epoch": 0.4839319470699433,
"grad_norm": 1.25,
"learning_rate": 0.0004981970171988439,
"loss": 5.6703,
"mean_token_accuracy": 0.1606784962117672,
"num_tokens": 10622966.0,
"step": 5760
},
{
"entropy": 5.802293014526367,
"epoch": 0.48435202688510814,
"grad_norm": 1.2421875,
"learning_rate": 0.0004981932317149636,
"loss": 5.7643,
"mean_token_accuracy": 0.14677036479115485,
"num_tokens": 10633441.0,
"step": 5765
},
{
"entropy": 5.871302127838135,
"epoch": 0.48477210670027304,
"grad_norm": 1.078125,
"learning_rate": 0.00049818944227733,
"loss": 5.7431,
"mean_token_accuracy": 0.14571086168289185,
"num_tokens": 10643124.0,
"step": 5770
},
{
"entropy": 5.788599300384521,
"epoch": 0.48519218651543794,
"grad_norm": 1.125,
"learning_rate": 0.0004981856488860105,
"loss": 5.7115,
"mean_token_accuracy": 0.14598402380943298,
"num_tokens": 10652517.0,
"step": 5775
},
{
"entropy": 5.782949590682984,
"epoch": 0.48561226633060284,
"grad_norm": 1.140625,
"learning_rate": 0.0004981818515410721,
"loss": 5.7367,
"mean_token_accuracy": 0.14545634835958482,
"num_tokens": 10663352.0,
"step": 5780
},
{
"entropy": 5.821159505844117,
"epoch": 0.4860323461457677,
"grad_norm": 1.140625,
"learning_rate": 0.0004981780502425821,
"loss": 5.7598,
"mean_token_accuracy": 0.15368367582559586,
"num_tokens": 10672430.0,
"step": 5785
},
{
"entropy": 5.822045946121216,
"epoch": 0.4864524259609326,
"grad_norm": 1.1015625,
"learning_rate": 0.0004981742449906079,
"loss": 5.6982,
"mean_token_accuracy": 0.1539619520306587,
"num_tokens": 10681908.0,
"step": 5790
},
{
"entropy": 5.846343612670898,
"epoch": 0.4868725057760975,
"grad_norm": 1.171875,
"learning_rate": 0.0004981704357852168,
"loss": 5.7077,
"mean_token_accuracy": 0.14915080666542052,
"num_tokens": 10691259.0,
"step": 5795
},
{
"entropy": 5.720254230499267,
"epoch": 0.4872925855912623,
"grad_norm": 1.25,
"learning_rate": 0.0004981666226264764,
"loss": 5.5932,
"mean_token_accuracy": 0.14788943082094191,
"num_tokens": 10699668.0,
"step": 5800
},
{
"entropy": 5.762866544723511,
"epoch": 0.4877126654064272,
"grad_norm": 1.03125,
"learning_rate": 0.0004981628055144542,
"loss": 5.657,
"mean_token_accuracy": 0.1536644384264946,
"num_tokens": 10709146.0,
"step": 5805
},
{
"entropy": 5.773292684555054,
"epoch": 0.4881327452215921,
"grad_norm": 1.1171875,
"learning_rate": 0.0004981589844492177,
"loss": 5.7398,
"mean_token_accuracy": 0.14368562251329423,
"num_tokens": 10718724.0,
"step": 5810
},
{
"entropy": 5.741464710235595,
"epoch": 0.488552825036757,
"grad_norm": 1.171875,
"learning_rate": 0.0004981551594308349,
"loss": 5.6811,
"mean_token_accuracy": 0.1533760130405426,
"num_tokens": 10728101.0,
"step": 5815
},
{
"entropy": 5.868719911575317,
"epoch": 0.48897290485192185,
"grad_norm": 1.203125,
"learning_rate": 0.0004981513304593733,
"loss": 5.7116,
"mean_token_accuracy": 0.15002420991659166,
"num_tokens": 10736750.0,
"step": 5820
},
{
"entropy": 5.850791263580322,
"epoch": 0.48939298466708675,
"grad_norm": 1.296875,
"learning_rate": 0.0004981474975349006,
"loss": 5.8999,
"mean_token_accuracy": 0.14523305892944335,
"num_tokens": 10746914.0,
"step": 5825
},
{
"entropy": 5.826296806335449,
"epoch": 0.48981306448225165,
"grad_norm": 1.2421875,
"learning_rate": 0.000498143660657485,
"loss": 5.735,
"mean_token_accuracy": 0.15287000834941863,
"num_tokens": 10755786.0,
"step": 5830
},
{
"entropy": 5.693120765686035,
"epoch": 0.4902331442974165,
"grad_norm": 1.09375,
"learning_rate": 0.0004981398198271944,
"loss": 5.5906,
"mean_token_accuracy": 0.15199866890907288,
"num_tokens": 10764821.0,
"step": 5835
},
{
"entropy": 5.775625324249267,
"epoch": 0.4906532241125814,
"grad_norm": 1.15625,
"learning_rate": 0.0004981359750440968,
"loss": 5.6958,
"mean_token_accuracy": 0.14958391785621644,
"num_tokens": 10773569.0,
"step": 5840
},
{
"entropy": 5.698379755020142,
"epoch": 0.4910733039277463,
"grad_norm": 1.1796875,
"learning_rate": 0.0004981321263082603,
"loss": 5.6703,
"mean_token_accuracy": 0.14412777721881867,
"num_tokens": 10782298.0,
"step": 5845
},
{
"entropy": 5.716889905929565,
"epoch": 0.4914933837429111,
"grad_norm": 1.109375,
"learning_rate": 0.000498128273619753,
"loss": 5.6402,
"mean_token_accuracy": 0.15635652095079422,
"num_tokens": 10792087.0,
"step": 5850
},
{
"entropy": 5.834375047683716,
"epoch": 0.491913463558076,
"grad_norm": 1.0546875,
"learning_rate": 0.0004981244169786433,
"loss": 5.738,
"mean_token_accuracy": 0.14734297171235083,
"num_tokens": 10801641.0,
"step": 5855
},
{
"entropy": 5.87250828742981,
"epoch": 0.4923335433732409,
"grad_norm": 1.09375,
"learning_rate": 0.0004981205563849994,
"loss": 5.8138,
"mean_token_accuracy": 0.1475193165242672,
"num_tokens": 10811612.0,
"step": 5860
},
{
"entropy": 5.749148321151734,
"epoch": 0.4927536231884058,
"grad_norm": 1.125,
"learning_rate": 0.0004981166918388897,
"loss": 5.6184,
"mean_token_accuracy": 0.15407940298318862,
"num_tokens": 10821608.0,
"step": 5865
},
{
"entropy": 5.674514102935791,
"epoch": 0.49317370300357066,
"grad_norm": 1.1796875,
"learning_rate": 0.0004981128233403828,
"loss": 5.5973,
"mean_token_accuracy": 0.15862181633710862,
"num_tokens": 10830679.0,
"step": 5870
},
{
"entropy": 5.774006509780884,
"epoch": 0.49359378281873556,
"grad_norm": 1.0703125,
"learning_rate": 0.000498108950889547,
"loss": 5.6646,
"mean_token_accuracy": 0.15421441048383713,
"num_tokens": 10839669.0,
"step": 5875
},
{
"entropy": 5.764991903305054,
"epoch": 0.49401386263390046,
"grad_norm": 1.1171875,
"learning_rate": 0.0004981050744864512,
"loss": 5.6346,
"mean_token_accuracy": 0.15262162685394287,
"num_tokens": 10849666.0,
"step": 5880
},
{
"entropy": 5.72649884223938,
"epoch": 0.4944339424490653,
"grad_norm": 1.140625,
"learning_rate": 0.0004981011941311638,
"loss": 5.5555,
"mean_token_accuracy": 0.16141532510519027,
"num_tokens": 10858225.0,
"step": 5885
},
{
"entropy": 5.66461033821106,
"epoch": 0.4948540222642302,
"grad_norm": 1.1328125,
"learning_rate": 0.0004980973098237535,
"loss": 5.6584,
"mean_token_accuracy": 0.14741952568292618,
"num_tokens": 10867466.0,
"step": 5890
},
{
"entropy": 5.8419431209564205,
"epoch": 0.4952741020793951,
"grad_norm": 1.1015625,
"learning_rate": 0.0004980934215642894,
"loss": 5.7012,
"mean_token_accuracy": 0.15556059777736664,
"num_tokens": 10875850.0,
"step": 5895
},
{
"entropy": 5.721167850494385,
"epoch": 0.49569418189456,
"grad_norm": 1.1328125,
"learning_rate": 0.00049808952935284,
"loss": 5.6463,
"mean_token_accuracy": 0.15190516263246537,
"num_tokens": 10885154.0,
"step": 5900
},
{
"entropy": 5.701448440551758,
"epoch": 0.49611426170972484,
"grad_norm": 1.1640625,
"learning_rate": 0.0004980856331894747,
"loss": 5.7224,
"mean_token_accuracy": 0.14825090169906616,
"num_tokens": 10894080.0,
"step": 5905
},
{
"entropy": 5.78351993560791,
"epoch": 0.49653434152488973,
"grad_norm": 1.171875,
"learning_rate": 0.0004980817330742621,
"loss": 5.7187,
"mean_token_accuracy": 0.14427882730960845,
"num_tokens": 10903248.0,
"step": 5910
},
{
"entropy": 5.805410528182984,
"epoch": 0.49695442134005463,
"grad_norm": 1.09375,
"learning_rate": 0.0004980778290072716,
"loss": 5.6918,
"mean_token_accuracy": 0.15272436887025834,
"num_tokens": 10912939.0,
"step": 5915
},
{
"entropy": 5.77779188156128,
"epoch": 0.4973745011552195,
"grad_norm": 1.3359375,
"learning_rate": 0.0004980739209885722,
"loss": 5.7101,
"mean_token_accuracy": 0.15421786904335022,
"num_tokens": 10921505.0,
"step": 5920
},
{
"entropy": 5.821563053131103,
"epoch": 0.49779458097038437,
"grad_norm": 1.109375,
"learning_rate": 0.0004980700090182331,
"loss": 5.7852,
"mean_token_accuracy": 0.15165787041187287,
"num_tokens": 10931861.0,
"step": 5925
},
{
"entropy": 5.8428229808807375,
"epoch": 0.49821466078554927,
"grad_norm": 1.1328125,
"learning_rate": 0.0004980660930963238,
"loss": 5.7011,
"mean_token_accuracy": 0.15149710923433304,
"num_tokens": 10940810.0,
"step": 5930
},
{
"entropy": 5.702348804473877,
"epoch": 0.4986347406007141,
"grad_norm": 1.140625,
"learning_rate": 0.0004980621732229133,
"loss": 5.5848,
"mean_token_accuracy": 0.15044856518507005,
"num_tokens": 10949514.0,
"step": 5935
},
{
"entropy": 5.803748416900635,
"epoch": 0.499054820415879,
"grad_norm": 1.2265625,
"learning_rate": 0.0004980582493980714,
"loss": 5.7968,
"mean_token_accuracy": 0.14371122792363167,
"num_tokens": 10959161.0,
"step": 5940
},
{
"entropy": 5.770447301864624,
"epoch": 0.4994749002310439,
"grad_norm": 1.09375,
"learning_rate": 0.0004980543216218674,
"loss": 5.6712,
"mean_token_accuracy": 0.1631261572241783,
"num_tokens": 10968983.0,
"step": 5945
},
{
"entropy": 5.808981323242188,
"epoch": 0.4998949800462088,
"grad_norm": 1.1328125,
"learning_rate": 0.0004980503898943711,
"loss": 5.8114,
"mean_token_accuracy": 0.15061289817094803,
"num_tokens": 10978044.0,
"step": 5950
},
{
"entropy": 5.788982200622558,
"epoch": 0.5003150598613737,
"grad_norm": 1.1015625,
"learning_rate": 0.0004980464542156519,
"loss": 5.6999,
"mean_token_accuracy": 0.15479907542467117,
"num_tokens": 10986980.0,
"step": 5955
},
{
"entropy": 5.79339451789856,
"epoch": 0.5007351396765385,
"grad_norm": 1.09375,
"learning_rate": 0.0004980425145857796,
"loss": 5.6394,
"mean_token_accuracy": 0.16022832095623016,
"num_tokens": 10995163.0,
"step": 5960
},
{
"entropy": 5.68665771484375,
"epoch": 0.5011552194917034,
"grad_norm": 1.1640625,
"learning_rate": 0.000498038571004824,
"loss": 5.5686,
"mean_token_accuracy": 0.1633736938238144,
"num_tokens": 11003722.0,
"step": 5965
},
{
"entropy": 5.609031295776367,
"epoch": 0.5015752993068683,
"grad_norm": 1.1171875,
"learning_rate": 0.0004980346234728549,
"loss": 5.6442,
"mean_token_accuracy": 0.1569686323404312,
"num_tokens": 11013176.0,
"step": 5970
},
{
"entropy": 5.7894915580749515,
"epoch": 0.5019953791220332,
"grad_norm": 1.109375,
"learning_rate": 0.0004980306719899424,
"loss": 5.6952,
"mean_token_accuracy": 0.151578326523304,
"num_tokens": 11022636.0,
"step": 5975
},
{
"entropy": 5.776892518997192,
"epoch": 0.5024154589371981,
"grad_norm": 1.140625,
"learning_rate": 0.0004980267165561564,
"loss": 5.6302,
"mean_token_accuracy": 0.15875780060887337,
"num_tokens": 11031896.0,
"step": 5980
},
{
"entropy": 5.7132336616516115,
"epoch": 0.502835538752363,
"grad_norm": 1.140625,
"learning_rate": 0.0004980227571715669,
"loss": 5.7054,
"mean_token_accuracy": 0.14930393695831298,
"num_tokens": 11040802.0,
"step": 5985
},
{
"entropy": 5.726763486862183,
"epoch": 0.5032556185675279,
"grad_norm": 1.125,
"learning_rate": 0.0004980187938362441,
"loss": 5.6141,
"mean_token_accuracy": 0.14617264419794082,
"num_tokens": 11049701.0,
"step": 5990
},
{
"entropy": 5.781429719924927,
"epoch": 0.5036756983826927,
"grad_norm": 1.1171875,
"learning_rate": 0.0004980148265502581,
"loss": 5.8043,
"mean_token_accuracy": 0.14090105295181274,
"num_tokens": 11059555.0,
"step": 5995
},
{
"entropy": 5.852024269104004,
"epoch": 0.5040957781978576,
"grad_norm": 1.2265625,
"learning_rate": 0.0004980108553136795,
"loss": 5.7355,
"mean_token_accuracy": 0.1523468405008316,
"num_tokens": 11068940.0,
"step": 6000
},
{
"epoch": 0.5040957781978576,
"eval_entropy": 5.700179389446457,
"eval_loss": 5.713378429412842,
"eval_mean_token_accuracy": 0.15722168653683422,
"eval_num_tokens": 11068940.0,
"eval_runtime": 27.4743,
"eval_samples_per_second": 1360.037,
"eval_steps_per_second": 170.014,
"step": 6000
},
{
"entropy": 5.850462198257446,
"epoch": 0.5045158580130225,
"grad_norm": 1.0546875,
"learning_rate": 0.0004980068801265783,
"loss": 5.6918,
"mean_token_accuracy": 0.15200575068593025,
"num_tokens": 11079014.0,
"step": 6005
},
{
"entropy": 5.7985584259033205,
"epoch": 0.5049359378281874,
"grad_norm": 1.203125,
"learning_rate": 0.0004980029009890251,
"loss": 5.7794,
"mean_token_accuracy": 0.14995476454496384,
"num_tokens": 11089526.0,
"step": 6010
},
{
"entropy": 5.745014381408692,
"epoch": 0.5053560176433523,
"grad_norm": 1.1171875,
"learning_rate": 0.0004979989179010904,
"loss": 5.6762,
"mean_token_accuracy": 0.15582752823829651,
"num_tokens": 11099156.0,
"step": 6015
},
{
"entropy": 5.753205490112305,
"epoch": 0.5057760974585171,
"grad_norm": 1.203125,
"learning_rate": 0.0004979949308628445,
"loss": 5.6531,
"mean_token_accuracy": 0.1576333686709404,
"num_tokens": 11108242.0,
"step": 6020
},
{
"entropy": 5.7076881408691404,
"epoch": 0.506196177273682,
"grad_norm": 1.1328125,
"learning_rate": 0.0004979909398743584,
"loss": 5.6553,
"mean_token_accuracy": 0.15808477848768235,
"num_tokens": 11118076.0,
"step": 6025
},
{
"entropy": 5.797974157333374,
"epoch": 0.5066162570888468,
"grad_norm": 1.0390625,
"learning_rate": 0.0004979869449357026,
"loss": 5.7275,
"mean_token_accuracy": 0.16116174012422563,
"num_tokens": 11127265.0,
"step": 6030
},
{
"entropy": 5.756021356582641,
"epoch": 0.5070363369040117,
"grad_norm": 1.0703125,
"learning_rate": 0.0004979829460469478,
"loss": 5.6513,
"mean_token_accuracy": 0.15123417377471923,
"num_tokens": 11136429.0,
"step": 6035
},
{
"entropy": 5.746025085449219,
"epoch": 0.5074564167191766,
"grad_norm": 1.1015625,
"learning_rate": 0.0004979789432081649,
"loss": 5.6536,
"mean_token_accuracy": 0.15169408172369003,
"num_tokens": 11146201.0,
"step": 6040
},
{
"entropy": 5.7764967918396,
"epoch": 0.5078764965343415,
"grad_norm": 1.1875,
"learning_rate": 0.000497974936419425,
"loss": 5.6814,
"mean_token_accuracy": 0.1552870064973831,
"num_tokens": 11154867.0,
"step": 6045
},
{
"entropy": 5.697817325592041,
"epoch": 0.5082965763495064,
"grad_norm": 1.171875,
"learning_rate": 0.0004979709256807989,
"loss": 5.7063,
"mean_token_accuracy": 0.1530693456530571,
"num_tokens": 11164092.0,
"step": 6050
},
{
"entropy": 5.767863702774048,
"epoch": 0.5087166561646713,
"grad_norm": 1.1171875,
"learning_rate": 0.0004979669109923575,
"loss": 5.7185,
"mean_token_accuracy": 0.15086525678634644,
"num_tokens": 11173176.0,
"step": 6055
},
{
"entropy": 5.823122596740722,
"epoch": 0.5091367359798362,
"grad_norm": 1.0859375,
"learning_rate": 0.0004979628923541721,
"loss": 5.7074,
"mean_token_accuracy": 0.15153964161872863,
"num_tokens": 11182397.0,
"step": 6060
},
{
"entropy": 5.847384881973267,
"epoch": 0.509556815795001,
"grad_norm": 1.1328125,
"learning_rate": 0.000497958869766314,
"loss": 5.736,
"mean_token_accuracy": 0.15451715439558028,
"num_tokens": 11191790.0,
"step": 6065
},
{
"entropy": 5.775505256652832,
"epoch": 0.5099768956101659,
"grad_norm": 1.15625,
"learning_rate": 0.0004979548432288543,
"loss": 5.6589,
"mean_token_accuracy": 0.15639978721737863,
"num_tokens": 11201104.0,
"step": 6070
},
{
"entropy": 5.7940943241119385,
"epoch": 0.5103969754253308,
"grad_norm": 1.1875,
"learning_rate": 0.0004979508127418643,
"loss": 5.6467,
"mean_token_accuracy": 0.156558046489954,
"num_tokens": 11209578.0,
"step": 6075
},
{
"entropy": 5.7526520729064945,
"epoch": 0.5108170552404957,
"grad_norm": 1.1875,
"learning_rate": 0.0004979467783054155,
"loss": 5.6138,
"mean_token_accuracy": 0.15932658165693284,
"num_tokens": 11218380.0,
"step": 6080
},
{
"entropy": 5.653487491607666,
"epoch": 0.5112371350556606,
"grad_norm": 1.1640625,
"learning_rate": 0.0004979427399195793,
"loss": 5.6167,
"mean_token_accuracy": 0.15166352689266205,
"num_tokens": 11227810.0,
"step": 6085
},
{
"entropy": 5.715474700927734,
"epoch": 0.5116572148708255,
"grad_norm": 1.2265625,
"learning_rate": 0.0004979386975844274,
"loss": 5.6336,
"mean_token_accuracy": 0.1557892754673958,
"num_tokens": 11236631.0,
"step": 6090
},
{
"entropy": 5.7959397315979,
"epoch": 0.5120772946859904,
"grad_norm": 0.9765625,
"learning_rate": 0.0004979346513000311,
"loss": 5.7021,
"mean_token_accuracy": 0.1493688315153122,
"num_tokens": 11247418.0,
"step": 6095
},
{
"entropy": 5.743378114700318,
"epoch": 0.5124973745011552,
"grad_norm": 1.1484375,
"learning_rate": 0.0004979306010664623,
"loss": 5.6043,
"mean_token_accuracy": 0.15673113763332366,
"num_tokens": 11256246.0,
"step": 6100
},
{
"entropy": 5.6460973739624025,
"epoch": 0.5129174543163201,
"grad_norm": 1.078125,
"learning_rate": 0.0004979265468837927,
"loss": 5.6109,
"mean_token_accuracy": 0.15803674310445787,
"num_tokens": 11265980.0,
"step": 6105
},
{
"entropy": 5.7585508823394775,
"epoch": 0.513337534131485,
"grad_norm": 1.15625,
"learning_rate": 0.000497922488752094,
"loss": 5.6456,
"mean_token_accuracy": 0.15313291698694229,
"num_tokens": 11276158.0,
"step": 6110
},
{
"entropy": 5.75591721534729,
"epoch": 0.5137576139466499,
"grad_norm": 1.109375,
"learning_rate": 0.0004979184266714383,
"loss": 5.5551,
"mean_token_accuracy": 0.1576843872666359,
"num_tokens": 11284957.0,
"step": 6115
},
{
"entropy": 5.691437435150147,
"epoch": 0.5141776937618148,
"grad_norm": 1.2109375,
"learning_rate": 0.0004979143606418974,
"loss": 5.5873,
"mean_token_accuracy": 0.15532443746924401,
"num_tokens": 11294340.0,
"step": 6120
},
{
"entropy": 5.817380475997925,
"epoch": 0.5145977735769797,
"grad_norm": 1.1171875,
"learning_rate": 0.0004979102906635435,
"loss": 5.8399,
"mean_token_accuracy": 0.14829942509531974,
"num_tokens": 11303344.0,
"step": 6125
},
{
"entropy": 5.825004005432129,
"epoch": 0.5150178533921445,
"grad_norm": 1.3671875,
"learning_rate": 0.0004979062167364486,
"loss": 5.7016,
"mean_token_accuracy": 0.15979470312595367,
"num_tokens": 11311338.0,
"step": 6130
},
{
"entropy": 5.708187103271484,
"epoch": 0.5154379332073094,
"grad_norm": 1.2265625,
"learning_rate": 0.0004979021388606847,
"loss": 5.5348,
"mean_token_accuracy": 0.15981798246502876,
"num_tokens": 11320194.0,
"step": 6135
},
{
"entropy": 5.763344478607178,
"epoch": 0.5158580130224742,
"grad_norm": 1.0703125,
"learning_rate": 0.0004978980570363243,
"loss": 5.7105,
"mean_token_accuracy": 0.15382457673549652,
"num_tokens": 11329952.0,
"step": 6140
},
{
"entropy": 5.736899137496948,
"epoch": 0.5162780928376391,
"grad_norm": 1.21875,
"learning_rate": 0.0004978939712634396,
"loss": 5.6586,
"mean_token_accuracy": 0.15457842200994493,
"num_tokens": 11339384.0,
"step": 6145
},
{
"entropy": 5.811904716491699,
"epoch": 0.516698172652804,
"grad_norm": 1.234375,
"learning_rate": 0.0004978898815421029,
"loss": 5.8281,
"mean_token_accuracy": 0.1484919786453247,
"num_tokens": 11348409.0,
"step": 6150
},
{
"entropy": 5.865073776245117,
"epoch": 0.5171182524679689,
"grad_norm": 1.203125,
"learning_rate": 0.0004978857878723867,
"loss": 5.7251,
"mean_token_accuracy": 0.15440079271793367,
"num_tokens": 11357478.0,
"step": 6155
},
{
"entropy": 5.813732147216797,
"epoch": 0.5175383322831338,
"grad_norm": 1.1484375,
"learning_rate": 0.0004978816902543636,
"loss": 5.7241,
"mean_token_accuracy": 0.15630923956632614,
"num_tokens": 11366379.0,
"step": 6160
},
{
"entropy": 5.804885578155518,
"epoch": 0.5179584120982986,
"grad_norm": 1.125,
"learning_rate": 0.0004978775886881062,
"loss": 5.7586,
"mean_token_accuracy": 0.15422743335366249,
"num_tokens": 11376357.0,
"step": 6165
},
{
"entropy": 5.754732990264893,
"epoch": 0.5183784919134635,
"grad_norm": 1.1484375,
"learning_rate": 0.000497873483173687,
"loss": 5.6229,
"mean_token_accuracy": 0.15853857696056367,
"num_tokens": 11384995.0,
"step": 6170
},
{
"entropy": 5.748101091384887,
"epoch": 0.5187985717286284,
"grad_norm": 1.09375,
"learning_rate": 0.0004978693737111787,
"loss": 5.635,
"mean_token_accuracy": 0.1528910756111145,
"num_tokens": 11395363.0,
"step": 6175
},
{
"entropy": 5.736073970794678,
"epoch": 0.5192186515437933,
"grad_norm": 1.078125,
"learning_rate": 0.0004978652603006543,
"loss": 5.628,
"mean_token_accuracy": 0.15332882851362228,
"num_tokens": 11404511.0,
"step": 6180
},
{
"entropy": 5.778553915023804,
"epoch": 0.5196387313589582,
"grad_norm": 1.1015625,
"learning_rate": 0.0004978611429421866,
"loss": 5.6697,
"mean_token_accuracy": 0.15048506557941438,
"num_tokens": 11413400.0,
"step": 6185
},
{
"entropy": 5.794227361679077,
"epoch": 0.5200588111741231,
"grad_norm": 1.1015625,
"learning_rate": 0.0004978570216358485,
"loss": 5.7234,
"mean_token_accuracy": 0.14682584255933762,
"num_tokens": 11423693.0,
"step": 6190
},
{
"entropy": 5.818920612335205,
"epoch": 0.520478890989288,
"grad_norm": 1.09375,
"learning_rate": 0.000497852896381713,
"loss": 5.6873,
"mean_token_accuracy": 0.1458378776907921,
"num_tokens": 11433195.0,
"step": 6195
},
{
"entropy": 5.760408115386963,
"epoch": 0.5208989708044528,
"grad_norm": 1.1484375,
"learning_rate": 0.0004978487671798531,
"loss": 5.8299,
"mean_token_accuracy": 0.141814424097538,
"num_tokens": 11443416.0,
"step": 6200
},
{
"entropy": 5.829377174377441,
"epoch": 0.5213190506196177,
"grad_norm": 1.15625,
"learning_rate": 0.0004978446340303422,
"loss": 5.6827,
"mean_token_accuracy": 0.1550835467875004,
"num_tokens": 11452487.0,
"step": 6205
},
{
"entropy": 5.777486085891724,
"epoch": 0.5217391304347826,
"grad_norm": 1.140625,
"learning_rate": 0.0004978404969332533,
"loss": 5.6976,
"mean_token_accuracy": 0.1613258346915245,
"num_tokens": 11461893.0,
"step": 6210
},
{
"entropy": 5.6914146900177,
"epoch": 0.5221592102499475,
"grad_norm": 1.125,
"learning_rate": 0.0004978363558886597,
"loss": 5.6347,
"mean_token_accuracy": 0.14582654014229773,
"num_tokens": 11471238.0,
"step": 6215
},
{
"entropy": 5.780894374847412,
"epoch": 0.5225792900651124,
"grad_norm": 1.140625,
"learning_rate": 0.0004978322108966348,
"loss": 5.7322,
"mean_token_accuracy": 0.14352654069662094,
"num_tokens": 11480571.0,
"step": 6220
},
{
"entropy": 5.752029228210449,
"epoch": 0.5229993698802773,
"grad_norm": 1.1328125,
"learning_rate": 0.0004978280619572521,
"loss": 5.7121,
"mean_token_accuracy": 0.15060520470142363,
"num_tokens": 11489552.0,
"step": 6225
},
{
"entropy": 5.808607912063598,
"epoch": 0.5234194496954422,
"grad_norm": 1.203125,
"learning_rate": 0.000497823909070585,
"loss": 5.7575,
"mean_token_accuracy": 0.14814567118883132,
"num_tokens": 11498715.0,
"step": 6230
},
{
"entropy": 5.850186061859131,
"epoch": 0.523839529510607,
"grad_norm": 1.140625,
"learning_rate": 0.0004978197522367071,
"loss": 5.6975,
"mean_token_accuracy": 0.1479704961180687,
"num_tokens": 11508472.0,
"step": 6235
},
{
"entropy": 5.831798458099366,
"epoch": 0.5242596093257719,
"grad_norm": 1.140625,
"learning_rate": 0.0004978155914556919,
"loss": 5.6532,
"mean_token_accuracy": 0.15581255853176118,
"num_tokens": 11517620.0,
"step": 6240
},
{
"entropy": 5.682782220840454,
"epoch": 0.5246796891409368,
"grad_norm": 1.1015625,
"learning_rate": 0.0004978114267276134,
"loss": 5.6977,
"mean_token_accuracy": 0.15499856919050217,
"num_tokens": 11526106.0,
"step": 6245
},
{
"entropy": 5.819548797607422,
"epoch": 0.5250997689561017,
"grad_norm": 1.171875,
"learning_rate": 0.0004978072580525451,
"loss": 5.7264,
"mean_token_accuracy": 0.15035497546195983,
"num_tokens": 11535840.0,
"step": 6250
},
{
"entropy": 5.8192667961120605,
"epoch": 0.5255198487712666,
"grad_norm": 1.15625,
"learning_rate": 0.000497803085430561,
"loss": 5.703,
"mean_token_accuracy": 0.1543534591794014,
"num_tokens": 11545110.0,
"step": 6255
},
{
"entropy": 5.857662773132324,
"epoch": 0.5259399285864315,
"grad_norm": 1.15625,
"learning_rate": 0.0004977989088617349,
"loss": 5.7236,
"mean_token_accuracy": 0.1509743146598339,
"num_tokens": 11554382.0,
"step": 6260
},
{
"entropy": 5.714362955093383,
"epoch": 0.5263600084015964,
"grad_norm": 1.1875,
"learning_rate": 0.000497794728346141,
"loss": 5.5827,
"mean_token_accuracy": 0.162162946164608,
"num_tokens": 11562821.0,
"step": 6265
},
{
"entropy": 5.852323722839356,
"epoch": 0.5267800882167611,
"grad_norm": 1.125,
"learning_rate": 0.0004977905438838531,
"loss": 5.8009,
"mean_token_accuracy": 0.13918681740760802,
"num_tokens": 11571705.0,
"step": 6270
},
{
"entropy": 5.715672159194947,
"epoch": 0.527200168031926,
"grad_norm": 1.1484375,
"learning_rate": 0.0004977863554749453,
"loss": 5.6274,
"mean_token_accuracy": 0.1484605222940445,
"num_tokens": 11580692.0,
"step": 6275
},
{
"entropy": 5.748901891708374,
"epoch": 0.5276202478470909,
"grad_norm": 1.0859375,
"learning_rate": 0.0004977821631194922,
"loss": 5.6336,
"mean_token_accuracy": 0.14991371780633928,
"num_tokens": 11589966.0,
"step": 6280
},
{
"entropy": 5.8247438907623295,
"epoch": 0.5280403276622558,
"grad_norm": 1.046875,
"learning_rate": 0.0004977779668175677,
"loss": 5.6979,
"mean_token_accuracy": 0.15071138143539428,
"num_tokens": 11599627.0,
"step": 6285
},
{
"entropy": 5.786518621444702,
"epoch": 0.5284604074774207,
"grad_norm": 1.09375,
"learning_rate": 0.0004977737665692461,
"loss": 5.6958,
"mean_token_accuracy": 0.15588323771953583,
"num_tokens": 11608431.0,
"step": 6290
},
{
"entropy": 5.720681619644165,
"epoch": 0.5288804872925856,
"grad_norm": 1.203125,
"learning_rate": 0.0004977695623746021,
"loss": 5.5589,
"mean_token_accuracy": 0.156508731842041,
"num_tokens": 11617552.0,
"step": 6295
},
{
"entropy": 5.7268609523773195,
"epoch": 0.5293005671077504,
"grad_norm": 1.15625,
"learning_rate": 0.0004977653542337099,
"loss": 5.6048,
"mean_token_accuracy": 0.15938956588506697,
"num_tokens": 11626828.0,
"step": 6300
},
{
"entropy": 5.753752660751343,
"epoch": 0.5297206469229153,
"grad_norm": 1.1796875,
"learning_rate": 0.0004977611421466443,
"loss": 5.6933,
"mean_token_accuracy": 0.14879160374403,
"num_tokens": 11635867.0,
"step": 6305
},
{
"entropy": 5.821589660644531,
"epoch": 0.5301407267380802,
"grad_norm": 1.09375,
"learning_rate": 0.0004977569261134797,
"loss": 5.6119,
"mean_token_accuracy": 0.16307313442230226,
"num_tokens": 11644711.0,
"step": 6310
},
{
"entropy": 5.776092433929444,
"epoch": 0.5305608065532451,
"grad_norm": 1.1640625,
"learning_rate": 0.0004977527061342908,
"loss": 5.699,
"mean_token_accuracy": 0.15243458598852158,
"num_tokens": 11653320.0,
"step": 6315
},
{
"entropy": 5.739018869400025,
"epoch": 0.53098088636841,
"grad_norm": 1.0390625,
"learning_rate": 0.0004977484822091524,
"loss": 5.6362,
"mean_token_accuracy": 0.1549768939614296,
"num_tokens": 11662753.0,
"step": 6320
},
{
"entropy": 5.790743827819824,
"epoch": 0.5314009661835749,
"grad_norm": 1.3515625,
"learning_rate": 0.0004977442543381394,
"loss": 5.6776,
"mean_token_accuracy": 0.14615621492266656,
"num_tokens": 11671622.0,
"step": 6325
},
{
"entropy": 5.81527943611145,
"epoch": 0.5318210459987398,
"grad_norm": 1.2109375,
"learning_rate": 0.0004977400225213266,
"loss": 5.6697,
"mean_token_accuracy": 0.14917083755135535,
"num_tokens": 11679964.0,
"step": 6330
},
{
"entropy": 5.709684419631958,
"epoch": 0.5322411258139046,
"grad_norm": 1.1640625,
"learning_rate": 0.000497735786758789,
"loss": 5.6387,
"mean_token_accuracy": 0.1544922798871994,
"num_tokens": 11688700.0,
"step": 6335
},
{
"entropy": 5.761388683319092,
"epoch": 0.5326612056290695,
"grad_norm": 1.1484375,
"learning_rate": 0.0004977315470506016,
"loss": 5.7514,
"mean_token_accuracy": 0.15335165113210678,
"num_tokens": 11698425.0,
"step": 6340
},
{
"entropy": 5.918869924545288,
"epoch": 0.5330812854442344,
"grad_norm": 1.1328125,
"learning_rate": 0.0004977273033968397,
"loss": 5.7348,
"mean_token_accuracy": 0.14367725551128388,
"num_tokens": 11707705.0,
"step": 6345
},
{
"entropy": 5.790050745010376,
"epoch": 0.5335013652593993,
"grad_norm": 1.2109375,
"learning_rate": 0.0004977230557975782,
"loss": 5.6271,
"mean_token_accuracy": 0.1547234535217285,
"num_tokens": 11717079.0,
"step": 6350
},
{
"entropy": 5.737287473678589,
"epoch": 0.5339214450745642,
"grad_norm": 1.2421875,
"learning_rate": 0.0004977188042528923,
"loss": 5.6199,
"mean_token_accuracy": 0.15220877379179001,
"num_tokens": 11725504.0,
"step": 6355
},
{
"entropy": 5.763228845596314,
"epoch": 0.5343415248897291,
"grad_norm": 1.125,
"learning_rate": 0.0004977145487628576,
"loss": 5.7038,
"mean_token_accuracy": 0.14911844879388808,
"num_tokens": 11735282.0,
"step": 6360
},
{
"entropy": 5.766521883010864,
"epoch": 0.534761604704894,
"grad_norm": 1.078125,
"learning_rate": 0.0004977102893275494,
"loss": 5.6834,
"mean_token_accuracy": 0.1509793907403946,
"num_tokens": 11744827.0,
"step": 6365
},
{
"entropy": 5.80914888381958,
"epoch": 0.5351816845200588,
"grad_norm": 1.203125,
"learning_rate": 0.000497706025947043,
"loss": 5.6602,
"mean_token_accuracy": 0.15433914735913276,
"num_tokens": 11753066.0,
"step": 6370
},
{
"entropy": 5.779637861251831,
"epoch": 0.5356017643352237,
"grad_norm": 1.203125,
"learning_rate": 0.0004977017586214142,
"loss": 5.6734,
"mean_token_accuracy": 0.1484221376478672,
"num_tokens": 11761190.0,
"step": 6375
},
{
"entropy": 5.748640012741089,
"epoch": 0.5360218441503886,
"grad_norm": 1.09375,
"learning_rate": 0.0004976974873507382,
"loss": 5.6372,
"mean_token_accuracy": 0.1559848442673683,
"num_tokens": 11770321.0,
"step": 6380
},
{
"entropy": 5.738561058044434,
"epoch": 0.5364419239655535,
"grad_norm": 1.0859375,
"learning_rate": 0.000497693212135091,
"loss": 5.699,
"mean_token_accuracy": 0.1514401003718376,
"num_tokens": 11778388.0,
"step": 6385
},
{
"entropy": 5.765327787399292,
"epoch": 0.5368620037807184,
"grad_norm": 1.1484375,
"learning_rate": 0.0004976889329745482,
"loss": 5.5673,
"mean_token_accuracy": 0.15486714467406273,
"num_tokens": 11786250.0,
"step": 6390
},
{
"entropy": 5.621909475326538,
"epoch": 0.5372820835958833,
"grad_norm": 1.140625,
"learning_rate": 0.0004976846498691857,
"loss": 5.5364,
"mean_token_accuracy": 0.15950624793767929,
"num_tokens": 11794831.0,
"step": 6395
},
{
"entropy": 5.68565411567688,
"epoch": 0.5377021634110482,
"grad_norm": 1.21875,
"learning_rate": 0.0004976803628190792,
"loss": 5.5888,
"mean_token_accuracy": 0.16010595262050628,
"num_tokens": 11803550.0,
"step": 6400
},
{
"entropy": 5.74244441986084,
"epoch": 0.5381222432262129,
"grad_norm": 1.1796875,
"learning_rate": 0.0004976760718243047,
"loss": 5.651,
"mean_token_accuracy": 0.15274949222803116,
"num_tokens": 11812478.0,
"step": 6405
},
{
"entropy": 5.778332471847534,
"epoch": 0.5385423230413778,
"grad_norm": 1.15625,
"learning_rate": 0.0004976717768849383,
"loss": 5.6248,
"mean_token_accuracy": 0.14805713966488837,
"num_tokens": 11822463.0,
"step": 6410
},
{
"entropy": 5.742837858200073,
"epoch": 0.5389624028565427,
"grad_norm": 1.1796875,
"learning_rate": 0.0004976674780010561,
"loss": 5.6795,
"mean_token_accuracy": 0.14474959224462508,
"num_tokens": 11831853.0,
"step": 6415
},
{
"entropy": 5.798117733001709,
"epoch": 0.5393824826717076,
"grad_norm": 1.1015625,
"learning_rate": 0.000497663175172734,
"loss": 5.6862,
"mean_token_accuracy": 0.14788996651768685,
"num_tokens": 11841574.0,
"step": 6420
},
{
"entropy": 5.805143690109253,
"epoch": 0.5398025624868725,
"grad_norm": 1.140625,
"learning_rate": 0.0004976588684000486,
"loss": 5.7683,
"mean_token_accuracy": 0.13789285868406295,
"num_tokens": 11852489.0,
"step": 6425
},
{
"entropy": 5.778584146499634,
"epoch": 0.5402226423020374,
"grad_norm": 1.171875,
"learning_rate": 0.0004976545576830759,
"loss": 5.6469,
"mean_token_accuracy": 0.14674668833613397,
"num_tokens": 11861499.0,
"step": 6430
},
{
"entropy": 5.825410175323486,
"epoch": 0.5406427221172023,
"grad_norm": 1.1015625,
"learning_rate": 0.0004976502430218924,
"loss": 5.7217,
"mean_token_accuracy": 0.14696074053645133,
"num_tokens": 11871685.0,
"step": 6435
},
{
"entropy": 5.72649827003479,
"epoch": 0.5410628019323671,
"grad_norm": 1.0703125,
"learning_rate": 0.0004976459244165744,
"loss": 5.6456,
"mean_token_accuracy": 0.15092769861221314,
"num_tokens": 11881340.0,
"step": 6440
},
{
"entropy": 5.727269411087036,
"epoch": 0.541482881747532,
"grad_norm": 1.1015625,
"learning_rate": 0.0004976416018671986,
"loss": 5.6649,
"mean_token_accuracy": 0.14953715354204178,
"num_tokens": 11890700.0,
"step": 6445
},
{
"entropy": 5.7685645580291744,
"epoch": 0.5419029615626969,
"grad_norm": 1.171875,
"learning_rate": 0.0004976372753738415,
"loss": 5.6509,
"mean_token_accuracy": 0.14911282658576966,
"num_tokens": 11900329.0,
"step": 6450
},
{
"entropy": 5.83752589225769,
"epoch": 0.5423230413778618,
"grad_norm": 1.171875,
"learning_rate": 0.0004976329449365795,
"loss": 5.6857,
"mean_token_accuracy": 0.14632721096277237,
"num_tokens": 11909915.0,
"step": 6455
},
{
"entropy": 5.7468969345092775,
"epoch": 0.5427431211930267,
"grad_norm": 1.1640625,
"learning_rate": 0.0004976286105554897,
"loss": 5.6999,
"mean_token_accuracy": 0.15292935222387313,
"num_tokens": 11918302.0,
"step": 6460
},
{
"entropy": 5.746342515945434,
"epoch": 0.5431632010081916,
"grad_norm": 1.140625,
"learning_rate": 0.0004976242722306487,
"loss": 5.6634,
"mean_token_accuracy": 0.1509156569838524,
"num_tokens": 11927794.0,
"step": 6465
},
{
"entropy": 5.770822763442993,
"epoch": 0.5435832808233564,
"grad_norm": 1.0234375,
"learning_rate": 0.0004976199299621333,
"loss": 5.6778,
"mean_token_accuracy": 0.15434709787368775,
"num_tokens": 11937701.0,
"step": 6470
},
{
"entropy": 5.691930055618286,
"epoch": 0.5440033606385213,
"grad_norm": 1.25,
"learning_rate": 0.0004976155837500205,
"loss": 5.5987,
"mean_token_accuracy": 0.15657489597797394,
"num_tokens": 11946106.0,
"step": 6475
},
{
"entropy": 5.728176116943359,
"epoch": 0.5444234404536862,
"grad_norm": 1.1875,
"learning_rate": 0.0004976112335943872,
"loss": 5.5326,
"mean_token_accuracy": 0.15170824974775315,
"num_tokens": 11954604.0,
"step": 6480
},
{
"entropy": 5.6393060207366945,
"epoch": 0.5448435202688511,
"grad_norm": 1.3515625,
"learning_rate": 0.0004976068794953106,
"loss": 5.589,
"mean_token_accuracy": 0.15737116038799287,
"num_tokens": 11963664.0,
"step": 6485
},
{
"entropy": 5.765772342681885,
"epoch": 0.545263600084016,
"grad_norm": 1.0546875,
"learning_rate": 0.0004976025214528677,
"loss": 5.5934,
"mean_token_accuracy": 0.15527967661619185,
"num_tokens": 11973426.0,
"step": 6490
},
{
"entropy": 5.675690031051635,
"epoch": 0.5456836798991809,
"grad_norm": 1.15625,
"learning_rate": 0.0004975981594671359,
"loss": 5.6445,
"mean_token_accuracy": 0.1517896443605423,
"num_tokens": 11982339.0,
"step": 6495
},
{
"entropy": 5.829104852676392,
"epoch": 0.5461037597143458,
"grad_norm": 1.1796875,
"learning_rate": 0.0004975937935381921,
"loss": 5.6875,
"mean_token_accuracy": 0.15777059495449067,
"num_tokens": 11992016.0,
"step": 6500
},
{
"entropy": 5.725285243988037,
"epoch": 0.5465238395295106,
"grad_norm": 1.359375,
"learning_rate": 0.000497589423666114,
"loss": 5.6948,
"mean_token_accuracy": 0.1507769599556923,
"num_tokens": 12000616.0,
"step": 6505
},
{
"entropy": 5.61439094543457,
"epoch": 0.5469439193446755,
"grad_norm": 1.09375,
"learning_rate": 0.0004975850498509789,
"loss": 5.5739,
"mean_token_accuracy": 0.1537862613797188,
"num_tokens": 12009717.0,
"step": 6510
},
{
"entropy": 5.743680143356324,
"epoch": 0.5473639991598404,
"grad_norm": 1.21875,
"learning_rate": 0.0004975806720928642,
"loss": 5.6559,
"mean_token_accuracy": 0.15329572409391404,
"num_tokens": 12018020.0,
"step": 6515
},
{
"entropy": 5.767218780517578,
"epoch": 0.5477840789750053,
"grad_norm": 1.1796875,
"learning_rate": 0.0004975762903918475,
"loss": 5.652,
"mean_token_accuracy": 0.150393944978714,
"num_tokens": 12027119.0,
"step": 6520
},
{
"entropy": 5.75718412399292,
"epoch": 0.5482041587901701,
"grad_norm": 1.1640625,
"learning_rate": 0.0004975719047480064,
"loss": 5.6343,
"mean_token_accuracy": 0.15850069671869277,
"num_tokens": 12035566.0,
"step": 6525
},
{
"entropy": 5.718412923812866,
"epoch": 0.548624238605335,
"grad_norm": 1.125,
"learning_rate": 0.0004975675151614187,
"loss": 5.5588,
"mean_token_accuracy": 0.15670061558485032,
"num_tokens": 12044505.0,
"step": 6530
},
{
"entropy": 5.666616725921631,
"epoch": 0.5490443184204999,
"grad_norm": 1.390625,
"learning_rate": 0.000497563121632162,
"loss": 5.6353,
"mean_token_accuracy": 0.15498950183391572,
"num_tokens": 12053338.0,
"step": 6535
},
{
"entropy": 5.739244508743286,
"epoch": 0.5494643982356647,
"grad_norm": 1.1328125,
"learning_rate": 0.0004975587241603142,
"loss": 5.624,
"mean_token_accuracy": 0.15466095507144928,
"num_tokens": 12063235.0,
"step": 6540
},
{
"entropy": 5.815131378173828,
"epoch": 0.5498844780508296,
"grad_norm": 1.2109375,
"learning_rate": 0.0004975543227459533,
"loss": 5.6884,
"mean_token_accuracy": 0.1545080229640007,
"num_tokens": 12072490.0,
"step": 6545
},
{
"entropy": 5.773799467086792,
"epoch": 0.5503045578659945,
"grad_norm": 1.125,
"learning_rate": 0.0004975499173891571,
"loss": 5.755,
"mean_token_accuracy": 0.1480020761489868,
"num_tokens": 12081474.0,
"step": 6550
},
{
"entropy": 5.755259323120117,
"epoch": 0.5507246376811594,
"grad_norm": 1.09375,
"learning_rate": 0.0004975455080900037,
"loss": 5.6168,
"mean_token_accuracy": 0.15658100992441176,
"num_tokens": 12090963.0,
"step": 6555
},
{
"entropy": 5.762655639648438,
"epoch": 0.5511447174963243,
"grad_norm": 1.1640625,
"learning_rate": 0.0004975410948485713,
"loss": 5.621,
"mean_token_accuracy": 0.1500764697790146,
"num_tokens": 12099786.0,
"step": 6560
},
{
"entropy": 5.7010783672332765,
"epoch": 0.5515647973114892,
"grad_norm": 1.1015625,
"learning_rate": 0.0004975366776649379,
"loss": 5.6427,
"mean_token_accuracy": 0.15350330322980882,
"num_tokens": 12108469.0,
"step": 6565
},
{
"entropy": 5.758830261230469,
"epoch": 0.5519848771266541,
"grad_norm": 1.1875,
"learning_rate": 0.0004975322565391818,
"loss": 5.6122,
"mean_token_accuracy": 0.15436404794454575,
"num_tokens": 12118287.0,
"step": 6570
},
{
"entropy": 5.858336734771728,
"epoch": 0.5524049569418189,
"grad_norm": 1.171875,
"learning_rate": 0.0004975278314713814,
"loss": 5.7823,
"mean_token_accuracy": 0.14279426410794258,
"num_tokens": 12127122.0,
"step": 6575
},
{
"entropy": 5.782270908355713,
"epoch": 0.5528250367569838,
"grad_norm": 1.1953125,
"learning_rate": 0.0004975234024616152,
"loss": 5.6667,
"mean_token_accuracy": 0.15639008581638336,
"num_tokens": 12136395.0,
"step": 6580
},
{
"entropy": 5.687395429611206,
"epoch": 0.5532451165721487,
"grad_norm": 1.1640625,
"learning_rate": 0.0004975189695099613,
"loss": 5.6468,
"mean_token_accuracy": 0.15870906561613082,
"num_tokens": 12145025.0,
"step": 6585
},
{
"entropy": 5.795616292953492,
"epoch": 0.5536651963873136,
"grad_norm": 1.125,
"learning_rate": 0.0004975145326164985,
"loss": 5.6945,
"mean_token_accuracy": 0.14933544173836708,
"num_tokens": 12154352.0,
"step": 6590
},
{
"entropy": 5.753207206726074,
"epoch": 0.5540852762024785,
"grad_norm": 1.1640625,
"learning_rate": 0.0004975100917813055,
"loss": 5.6076,
"mean_token_accuracy": 0.14919200241565705,
"num_tokens": 12163802.0,
"step": 6595
},
{
"entropy": 5.691126537322998,
"epoch": 0.5545053560176434,
"grad_norm": 1.1015625,
"learning_rate": 0.0004975056470044606,
"loss": 5.6249,
"mean_token_accuracy": 0.15280142724514006,
"num_tokens": 12173111.0,
"step": 6600
},
{
"entropy": 5.721021461486816,
"epoch": 0.5549254358328082,
"grad_norm": 1.1875,
"learning_rate": 0.0004975011982860428,
"loss": 5.6579,
"mean_token_accuracy": 0.14891051650047302,
"num_tokens": 12182048.0,
"step": 6605
},
{
"entropy": 5.743900537490845,
"epoch": 0.5553455156479731,
"grad_norm": 1.140625,
"learning_rate": 0.0004974967456261309,
"loss": 5.6508,
"mean_token_accuracy": 0.1532616786658764,
"num_tokens": 12191501.0,
"step": 6610
},
{
"entropy": 5.782261896133423,
"epoch": 0.555765595463138,
"grad_norm": 1.1640625,
"learning_rate": 0.0004974922890248036,
"loss": 5.6774,
"mean_token_accuracy": 0.1570911407470703,
"num_tokens": 12201132.0,
"step": 6615
},
{
"entropy": 5.863410520553589,
"epoch": 0.5561856752783029,
"grad_norm": 1.2421875,
"learning_rate": 0.00049748782848214,
"loss": 5.8015,
"mean_token_accuracy": 0.14694183841347694,
"num_tokens": 12211082.0,
"step": 6620
},
{
"entropy": 5.847155141830444,
"epoch": 0.5566057550934678,
"grad_norm": 1.09375,
"learning_rate": 0.0004974833639982192,
"loss": 5.6247,
"mean_token_accuracy": 0.15586238205432892,
"num_tokens": 12219946.0,
"step": 6625
},
{
"entropy": 5.8018636226654055,
"epoch": 0.5570258349086327,
"grad_norm": 1.1640625,
"learning_rate": 0.00049747889557312,
"loss": 5.7296,
"mean_token_accuracy": 0.15069587752223015,
"num_tokens": 12229668.0,
"step": 6630
},
{
"entropy": 5.767895221710205,
"epoch": 0.5574459147237976,
"grad_norm": 1.1640625,
"learning_rate": 0.0004974744232069219,
"loss": 5.713,
"mean_token_accuracy": 0.15386377424001693,
"num_tokens": 12238750.0,
"step": 6635
},
{
"entropy": 5.755981302261352,
"epoch": 0.5578659945389624,
"grad_norm": 1.2578125,
"learning_rate": 0.0004974699468997038,
"loss": 5.6613,
"mean_token_accuracy": 0.15173935890197754,
"num_tokens": 12246825.0,
"step": 6640
},
{
"entropy": 5.728928136825561,
"epoch": 0.5582860743541272,
"grad_norm": 1.1796875,
"learning_rate": 0.0004974654666515452,
"loss": 5.5973,
"mean_token_accuracy": 0.1545877903699875,
"num_tokens": 12256413.0,
"step": 6645
},
{
"entropy": 5.749405002593994,
"epoch": 0.5587061541692921,
"grad_norm": 1.3125,
"learning_rate": 0.0004974609824625254,
"loss": 5.629,
"mean_token_accuracy": 0.16152739375829697,
"num_tokens": 12265458.0,
"step": 6650
},
{
"entropy": 5.596953678131103,
"epoch": 0.559126233984457,
"grad_norm": 1.171875,
"learning_rate": 0.0004974564943327239,
"loss": 5.5653,
"mean_token_accuracy": 0.15982334166765214,
"num_tokens": 12274124.0,
"step": 6655
},
{
"entropy": 5.590590953826904,
"epoch": 0.5595463137996219,
"grad_norm": 1.1171875,
"learning_rate": 0.00049745200226222,
"loss": 5.5282,
"mean_token_accuracy": 0.16951307207345961,
"num_tokens": 12283513.0,
"step": 6660
},
{
"entropy": 5.7416705131530765,
"epoch": 0.5599663936147868,
"grad_norm": 1.3828125,
"learning_rate": 0.0004974475062510936,
"loss": 5.6747,
"mean_token_accuracy": 0.1532605156302452,
"num_tokens": 12292396.0,
"step": 6665
},
{
"entropy": 5.736781597137451,
"epoch": 0.5603864734299517,
"grad_norm": 1.2265625,
"learning_rate": 0.0004974430062994242,
"loss": 5.6779,
"mean_token_accuracy": 0.15212651938199998,
"num_tokens": 12301604.0,
"step": 6670
},
{
"entropy": 5.842722940444946,
"epoch": 0.5608065532451165,
"grad_norm": 1.15625,
"learning_rate": 0.0004974385024072912,
"loss": 5.7193,
"mean_token_accuracy": 0.1500556856393814,
"num_tokens": 12310458.0,
"step": 6675
},
{
"entropy": 5.8713582992553714,
"epoch": 0.5612266330602814,
"grad_norm": 1.1875,
"learning_rate": 0.000497433994574775,
"loss": 5.716,
"mean_token_accuracy": 0.1482469394803047,
"num_tokens": 12319620.0,
"step": 6680
},
{
"entropy": 5.79235725402832,
"epoch": 0.5616467128754463,
"grad_norm": 1.1328125,
"learning_rate": 0.000497429482801955,
"loss": 5.782,
"mean_token_accuracy": 0.14732127711176873,
"num_tokens": 12329518.0,
"step": 6685
},
{
"entropy": 5.753377771377563,
"epoch": 0.5620667926906112,
"grad_norm": 1.0703125,
"learning_rate": 0.0004974249670889111,
"loss": 5.5939,
"mean_token_accuracy": 0.15525755882263184,
"num_tokens": 12338244.0,
"step": 6690
},
{
"entropy": 5.862080144882202,
"epoch": 0.5624868725057761,
"grad_norm": 1.265625,
"learning_rate": 0.0004974204474357237,
"loss": 5.7698,
"mean_token_accuracy": 0.15021772235631942,
"num_tokens": 12347962.0,
"step": 6695
},
{
"entropy": 5.7907792091369625,
"epoch": 0.562906952320941,
"grad_norm": 1.25,
"learning_rate": 0.0004974159238424723,
"loss": 5.6812,
"mean_token_accuracy": 0.15491234064102172,
"num_tokens": 12357020.0,
"step": 6700
},
{
"entropy": 5.760762023925781,
"epoch": 0.5633270321361059,
"grad_norm": 1.1796875,
"learning_rate": 0.0004974113963092376,
"loss": 5.6569,
"mean_token_accuracy": 0.1541620060801506,
"num_tokens": 12366108.0,
"step": 6705
},
{
"entropy": 5.802693128585815,
"epoch": 0.5637471119512707,
"grad_norm": 1.21875,
"learning_rate": 0.0004974068648360995,
"loss": 5.5724,
"mean_token_accuracy": 0.16243264824151993,
"num_tokens": 12374508.0,
"step": 6710
},
{
"entropy": 5.705340957641601,
"epoch": 0.5641671917664356,
"grad_norm": 1.28125,
"learning_rate": 0.0004974023294231383,
"loss": 5.5986,
"mean_token_accuracy": 0.16032086312770844,
"num_tokens": 12383555.0,
"step": 6715
},
{
"entropy": 5.718349361419678,
"epoch": 0.5645872715816005,
"grad_norm": 1.15625,
"learning_rate": 0.0004973977900704342,
"loss": 5.6873,
"mean_token_accuracy": 0.1504265695810318,
"num_tokens": 12392680.0,
"step": 6720
},
{
"entropy": 5.824889087677002,
"epoch": 0.5650073513967654,
"grad_norm": 1.0703125,
"learning_rate": 0.0004973932467780679,
"loss": 5.7349,
"mean_token_accuracy": 0.14873759895563127,
"num_tokens": 12401881.0,
"step": 6725
},
{
"entropy": 5.804216718673706,
"epoch": 0.5654274312119303,
"grad_norm": 1.078125,
"learning_rate": 0.0004973886995461197,
"loss": 5.7209,
"mean_token_accuracy": 0.1467141777276993,
"num_tokens": 12411487.0,
"step": 6730
},
{
"entropy": 5.693055963516235,
"epoch": 0.5658475110270952,
"grad_norm": 1.1171875,
"learning_rate": 0.0004973841483746703,
"loss": 5.5329,
"mean_token_accuracy": 0.16629059463739396,
"num_tokens": 12420376.0,
"step": 6735
},
{
"entropy": 5.600977802276612,
"epoch": 0.5662675908422601,
"grad_norm": 1.125,
"learning_rate": 0.0004973795932638001,
"loss": 5.582,
"mean_token_accuracy": 0.15954945236444473,
"num_tokens": 12429518.0,
"step": 6740
},
{
"entropy": 5.7596508979797365,
"epoch": 0.5666876706574249,
"grad_norm": 1.1171875,
"learning_rate": 0.00049737503421359,
"loss": 5.5539,
"mean_token_accuracy": 0.15875059515237808,
"num_tokens": 12438952.0,
"step": 6745
},
{
"entropy": 5.718114280700684,
"epoch": 0.5671077504725898,
"grad_norm": 1.2265625,
"learning_rate": 0.0004973704712241206,
"loss": 5.5653,
"mean_token_accuracy": 0.1561713308095932,
"num_tokens": 12448576.0,
"step": 6750
},
{
"entropy": 5.647132396697998,
"epoch": 0.5675278302877547,
"grad_norm": 1.1484375,
"learning_rate": 0.0004973659042954729,
"loss": 5.607,
"mean_token_accuracy": 0.1554710254073143,
"num_tokens": 12458166.0,
"step": 6755
},
{
"entropy": 5.595485591888428,
"epoch": 0.5679479101029196,
"grad_norm": 1.203125,
"learning_rate": 0.0004973613334277277,
"loss": 5.5506,
"mean_token_accuracy": 0.16030601412057877,
"num_tokens": 12467271.0,
"step": 6760
},
{
"entropy": 5.731112432479859,
"epoch": 0.5683679899180845,
"grad_norm": 1.171875,
"learning_rate": 0.0004973567586209658,
"loss": 5.7021,
"mean_token_accuracy": 0.1484995201230049,
"num_tokens": 12476255.0,
"step": 6765
},
{
"entropy": 5.772892475128174,
"epoch": 0.5687880697332494,
"grad_norm": 1.140625,
"learning_rate": 0.0004973521798752686,
"loss": 5.6618,
"mean_token_accuracy": 0.15195122808218003,
"num_tokens": 12485096.0,
"step": 6770
},
{
"entropy": 5.885549306869507,
"epoch": 0.5692081495484141,
"grad_norm": 1.1953125,
"learning_rate": 0.000497347597190717,
"loss": 5.6869,
"mean_token_accuracy": 0.15596108734607697,
"num_tokens": 12494405.0,
"step": 6775
},
{
"entropy": 5.71028151512146,
"epoch": 0.569628229363579,
"grad_norm": 1.1484375,
"learning_rate": 0.0004973430105673921,
"loss": 5.6126,
"mean_token_accuracy": 0.1530092716217041,
"num_tokens": 12503349.0,
"step": 6780
},
{
"entropy": 5.719080543518066,
"epoch": 0.5700483091787439,
"grad_norm": 1.1484375,
"learning_rate": 0.0004973384200053754,
"loss": 5.7078,
"mean_token_accuracy": 0.15315269976854323,
"num_tokens": 12513122.0,
"step": 6785
},
{
"entropy": 5.743679618835449,
"epoch": 0.5704683889939088,
"grad_norm": 1.296875,
"learning_rate": 0.000497333825504748,
"loss": 5.6455,
"mean_token_accuracy": 0.1548440471291542,
"num_tokens": 12523614.0,
"step": 6790
},
{
"entropy": 5.7131969928741455,
"epoch": 0.5708884688090737,
"grad_norm": 1.3125,
"learning_rate": 0.0004973292270655914,
"loss": 5.6936,
"mean_token_accuracy": 0.14896309226751328,
"num_tokens": 12532031.0,
"step": 6795
},
{
"entropy": 5.802944564819336,
"epoch": 0.5713085486242386,
"grad_norm": 1.2265625,
"learning_rate": 0.000497324624687987,
"loss": 5.7597,
"mean_token_accuracy": 0.14592752158641814,
"num_tokens": 12542239.0,
"step": 6800
},
{
"entropy": 5.848610973358154,
"epoch": 0.5717286284394035,
"grad_norm": 1.1953125,
"learning_rate": 0.0004973200183720164,
"loss": 5.6903,
"mean_token_accuracy": 0.1458892785012722,
"num_tokens": 12552608.0,
"step": 6805
},
{
"entropy": 5.62038254737854,
"epoch": 0.5721487082545683,
"grad_norm": 1.1640625,
"learning_rate": 0.0004973154081177611,
"loss": 5.5269,
"mean_token_accuracy": 0.15161003023386002,
"num_tokens": 12562020.0,
"step": 6810
},
{
"entropy": 5.707475709915161,
"epoch": 0.5725687880697332,
"grad_norm": 1.25,
"learning_rate": 0.0004973107939253027,
"loss": 5.5979,
"mean_token_accuracy": 0.1621535986661911,
"num_tokens": 12570519.0,
"step": 6815
},
{
"entropy": 5.6555585861206055,
"epoch": 0.5729888678848981,
"grad_norm": 1.1484375,
"learning_rate": 0.0004973061757947233,
"loss": 5.6092,
"mean_token_accuracy": 0.15764508694410323,
"num_tokens": 12579324.0,
"step": 6820
},
{
"entropy": 5.704854106903076,
"epoch": 0.573408947700063,
"grad_norm": 1.2109375,
"learning_rate": 0.0004973015537261043,
"loss": 5.6644,
"mean_token_accuracy": 0.15810555219650269,
"num_tokens": 12588014.0,
"step": 6825
},
{
"entropy": 5.835838747024536,
"epoch": 0.5738290275152279,
"grad_norm": 1.2265625,
"learning_rate": 0.0004972969277195279,
"loss": 5.6713,
"mean_token_accuracy": 0.15615084320306777,
"num_tokens": 12596882.0,
"step": 6830
},
{
"entropy": 5.745213270187378,
"epoch": 0.5742491073303928,
"grad_norm": 1.171875,
"learning_rate": 0.0004972922977750757,
"loss": 5.5888,
"mean_token_accuracy": 0.15589642524719238,
"num_tokens": 12606069.0,
"step": 6835
},
{
"entropy": 5.724167776107788,
"epoch": 0.5746691871455577,
"grad_norm": 1.953125,
"learning_rate": 0.00049728766389283,
"loss": 5.6297,
"mean_token_accuracy": 0.1498618669807911,
"num_tokens": 12615167.0,
"step": 6840
},
{
"entropy": 5.698711585998535,
"epoch": 0.5750892669607225,
"grad_norm": 1.234375,
"learning_rate": 0.0004972830260728729,
"loss": 5.6586,
"mean_token_accuracy": 0.15492099523544312,
"num_tokens": 12624230.0,
"step": 6845
},
{
"entropy": 5.763922691345215,
"epoch": 0.5755093467758874,
"grad_norm": 1.2578125,
"learning_rate": 0.0004972783843152863,
"loss": 5.6375,
"mean_token_accuracy": 0.15997321605682374,
"num_tokens": 12633158.0,
"step": 6850
},
{
"entropy": 5.726704406738281,
"epoch": 0.5759294265910523,
"grad_norm": 1.4765625,
"learning_rate": 0.0004972737386201527,
"loss": 5.5934,
"mean_token_accuracy": 0.15616918057203294,
"num_tokens": 12641465.0,
"step": 6855
},
{
"entropy": 5.6682007789611815,
"epoch": 0.5763495064062172,
"grad_norm": 1.21875,
"learning_rate": 0.0004972690889875541,
"loss": 5.557,
"mean_token_accuracy": 0.15953669995069503,
"num_tokens": 12650437.0,
"step": 6860
},
{
"entropy": 5.859883642196655,
"epoch": 0.5767695862213821,
"grad_norm": 1.1796875,
"learning_rate": 0.0004972644354175732,
"loss": 5.7609,
"mean_token_accuracy": 0.14930831044912338,
"num_tokens": 12660072.0,
"step": 6865
},
{
"entropy": 5.833072233200073,
"epoch": 0.577189666036547,
"grad_norm": 1.1796875,
"learning_rate": 0.0004972597779102922,
"loss": 5.7682,
"mean_token_accuracy": 0.14977988600730896,
"num_tokens": 12670405.0,
"step": 6870
},
{
"entropy": 5.733975601196289,
"epoch": 0.5776097458517119,
"grad_norm": 1.234375,
"learning_rate": 0.0004972551164657937,
"loss": 5.6639,
"mean_token_accuracy": 0.15277623534202575,
"num_tokens": 12679992.0,
"step": 6875
},
{
"entropy": 5.823847150802612,
"epoch": 0.5780298256668767,
"grad_norm": 1.2421875,
"learning_rate": 0.0004972504510841602,
"loss": 5.7063,
"mean_token_accuracy": 0.1545391082763672,
"num_tokens": 12690289.0,
"step": 6880
},
{
"entropy": 5.863948345184326,
"epoch": 0.5784499054820416,
"grad_norm": 1.15625,
"learning_rate": 0.0004972457817654745,
"loss": 5.7185,
"mean_token_accuracy": 0.15025423765182494,
"num_tokens": 12700518.0,
"step": 6885
},
{
"entropy": 5.788507699966431,
"epoch": 0.5788699852972065,
"grad_norm": 1.1875,
"learning_rate": 0.0004972411085098191,
"loss": 5.7685,
"mean_token_accuracy": 0.14289451837539674,
"num_tokens": 12710603.0,
"step": 6890
},
{
"entropy": 5.803801727294922,
"epoch": 0.5792900651123714,
"grad_norm": 1.1640625,
"learning_rate": 0.000497236431317277,
"loss": 5.6505,
"mean_token_accuracy": 0.15858056545257568,
"num_tokens": 12719298.0,
"step": 6895
},
{
"entropy": 5.794863271713257,
"epoch": 0.5797101449275363,
"grad_norm": 1.2421875,
"learning_rate": 0.000497231750187931,
"loss": 5.6389,
"mean_token_accuracy": 0.15869482904672622,
"num_tokens": 12728368.0,
"step": 6900
},
{
"entropy": 5.842878913879394,
"epoch": 0.5801302247427012,
"grad_norm": 1.203125,
"learning_rate": 0.0004972270651218638,
"loss": 5.7213,
"mean_token_accuracy": 0.15775972306728364,
"num_tokens": 12737898.0,
"step": 6905
},
{
"entropy": 5.82655348777771,
"epoch": 0.580550304557866,
"grad_norm": 1.421875,
"learning_rate": 0.0004972223761191587,
"loss": 5.6457,
"mean_token_accuracy": 0.15020321011543275,
"num_tokens": 12746761.0,
"step": 6910
},
{
"entropy": 5.683896875381469,
"epoch": 0.5809703843730308,
"grad_norm": 1.21875,
"learning_rate": 0.0004972176831798986,
"loss": 5.5597,
"mean_token_accuracy": 0.1623480200767517,
"num_tokens": 12755128.0,
"step": 6915
},
{
"entropy": 5.7375232696533205,
"epoch": 0.5813904641881957,
"grad_norm": 1.140625,
"learning_rate": 0.0004972129863041667,
"loss": 5.7634,
"mean_token_accuracy": 0.14732126891613007,
"num_tokens": 12764727.0,
"step": 6920
},
{
"entropy": 5.8072254180908205,
"epoch": 0.5818105440033606,
"grad_norm": 1.1171875,
"learning_rate": 0.0004972082854920462,
"loss": 5.6257,
"mean_token_accuracy": 0.15929616689682008,
"num_tokens": 12773557.0,
"step": 6925
},
{
"entropy": 5.735633420944214,
"epoch": 0.5822306238185255,
"grad_norm": 1.1875,
"learning_rate": 0.0004972035807436203,
"loss": 5.6056,
"mean_token_accuracy": 0.16261496543884277,
"num_tokens": 12782525.0,
"step": 6930
},
{
"entropy": 5.823388385772705,
"epoch": 0.5826507036336904,
"grad_norm": 1.21875,
"learning_rate": 0.0004971988720589723,
"loss": 5.7055,
"mean_token_accuracy": 0.1529671147465706,
"num_tokens": 12791534.0,
"step": 6935
},
{
"entropy": 5.790287637710572,
"epoch": 0.5830707834488553,
"grad_norm": 1.234375,
"learning_rate": 0.0004971941594381858,
"loss": 5.6007,
"mean_token_accuracy": 0.15712943971157073,
"num_tokens": 12800662.0,
"step": 6940
},
{
"entropy": 5.758783388137817,
"epoch": 0.5834908632640201,
"grad_norm": 1.1015625,
"learning_rate": 0.0004971894428813441,
"loss": 5.6603,
"mean_token_accuracy": 0.1564931422472,
"num_tokens": 12809440.0,
"step": 6945
},
{
"entropy": 5.827863645553589,
"epoch": 0.583910943079185,
"grad_norm": 1.25,
"learning_rate": 0.000497184722388531,
"loss": 5.7252,
"mean_token_accuracy": 0.15080841034650802,
"num_tokens": 12818560.0,
"step": 6950
},
{
"entropy": 5.824147796630859,
"epoch": 0.5843310228943499,
"grad_norm": 1.171875,
"learning_rate": 0.0004971799979598297,
"loss": 5.6712,
"mean_token_accuracy": 0.15087602585554122,
"num_tokens": 12827898.0,
"step": 6955
},
{
"entropy": 5.652761316299438,
"epoch": 0.5847511027095148,
"grad_norm": 1.2109375,
"learning_rate": 0.0004971752695953243,
"loss": 5.5959,
"mean_token_accuracy": 0.1591871291399002,
"num_tokens": 12837199.0,
"step": 6960
},
{
"entropy": 5.755173635482788,
"epoch": 0.5851711825246797,
"grad_norm": 1.15625,
"learning_rate": 0.0004971705372950984,
"loss": 5.6386,
"mean_token_accuracy": 0.15210898518562316,
"num_tokens": 12846493.0,
"step": 6965
},
{
"entropy": 5.80136251449585,
"epoch": 0.5855912623398446,
"grad_norm": 1.1484375,
"learning_rate": 0.0004971658010592358,
"loss": 5.6393,
"mean_token_accuracy": 0.15254888236522673,
"num_tokens": 12855026.0,
"step": 6970
},
{
"entropy": 5.743722152709961,
"epoch": 0.5860113421550095,
"grad_norm": 1.3046875,
"learning_rate": 0.0004971610608878205,
"loss": 5.7066,
"mean_token_accuracy": 0.15148996114730834,
"num_tokens": 12864563.0,
"step": 6975
},
{
"entropy": 5.840014123916626,
"epoch": 0.5864314219701743,
"grad_norm": 1.2265625,
"learning_rate": 0.0004971563167809363,
"loss": 5.6431,
"mean_token_accuracy": 0.15958610326051711,
"num_tokens": 12874358.0,
"step": 6980
},
{
"entropy": 5.7371094703674315,
"epoch": 0.5868515017853392,
"grad_norm": 1.140625,
"learning_rate": 0.0004971515687386674,
"loss": 5.6661,
"mean_token_accuracy": 0.15151161104440689,
"num_tokens": 12883110.0,
"step": 6985
},
{
"entropy": 5.755078363418579,
"epoch": 0.5872715816005041,
"grad_norm": 1.2109375,
"learning_rate": 0.0004971468167610978,
"loss": 5.7166,
"mean_token_accuracy": 0.15706731230020524,
"num_tokens": 12892977.0,
"step": 6990
},
{
"entropy": 5.698009443283081,
"epoch": 0.587691661415669,
"grad_norm": 1.171875,
"learning_rate": 0.0004971420608483117,
"loss": 5.5308,
"mean_token_accuracy": 0.16349953562021255,
"num_tokens": 12902327.0,
"step": 6995
},
{
"entropy": 5.649787044525146,
"epoch": 0.5881117412308339,
"grad_norm": 1.2890625,
"learning_rate": 0.0004971373010003936,
"loss": 5.5487,
"mean_token_accuracy": 0.16682940125465393,
"num_tokens": 12911957.0,
"step": 7000
},
{
"entropy": 5.733649349212646,
"epoch": 0.5885318210459988,
"grad_norm": 1.4609375,
"learning_rate": 0.0004971325372174274,
"loss": 5.6528,
"mean_token_accuracy": 0.15108825862407685,
"num_tokens": 12920380.0,
"step": 7005
},
{
"entropy": 5.750072050094604,
"epoch": 0.5889519008611637,
"grad_norm": 1.2265625,
"learning_rate": 0.0004971277694994976,
"loss": 5.7134,
"mean_token_accuracy": 0.1527300551533699,
"num_tokens": 12929670.0,
"step": 7010
},
{
"entropy": 5.763653707504273,
"epoch": 0.5893719806763285,
"grad_norm": 1.2578125,
"learning_rate": 0.000497122997846689,
"loss": 5.6119,
"mean_token_accuracy": 0.16422443389892577,
"num_tokens": 12938185.0,
"step": 7015
},
{
"entropy": 5.749804496765137,
"epoch": 0.5897920604914934,
"grad_norm": 1.2109375,
"learning_rate": 0.0004971182222590857,
"loss": 5.6385,
"mean_token_accuracy": 0.15942007005214692,
"num_tokens": 12947706.0,
"step": 7020
},
{
"entropy": 5.678315114974976,
"epoch": 0.5902121403066583,
"grad_norm": 1.3984375,
"learning_rate": 0.0004971134427367725,
"loss": 5.6385,
"mean_token_accuracy": 0.15062529146671294,
"num_tokens": 12957393.0,
"step": 7025
},
{
"entropy": 5.782941627502441,
"epoch": 0.5906322201218231,
"grad_norm": 1.1484375,
"learning_rate": 0.000497108659279834,
"loss": 5.5173,
"mean_token_accuracy": 0.16111443936824799,
"num_tokens": 12967165.0,
"step": 7030
},
{
"entropy": 5.780286359786987,
"epoch": 0.591052299936988,
"grad_norm": 1.2265625,
"learning_rate": 0.0004971038718883551,
"loss": 5.6612,
"mean_token_accuracy": 0.14747030138969422,
"num_tokens": 12976490.0,
"step": 7035
},
{
"entropy": 5.7380303859710695,
"epoch": 0.5914723797521529,
"grad_norm": 1.234375,
"learning_rate": 0.0004970990805624203,
"loss": 5.6581,
"mean_token_accuracy": 0.14838391840457915,
"num_tokens": 12985423.0,
"step": 7040
},
{
"entropy": 5.707084608078003,
"epoch": 0.5918924595673178,
"grad_norm": 1.28125,
"learning_rate": 0.0004970942853021147,
"loss": 5.5599,
"mean_token_accuracy": 0.1571175068616867,
"num_tokens": 12994510.0,
"step": 7045
},
{
"entropy": 5.783777189254761,
"epoch": 0.5923125393824826,
"grad_norm": 1.078125,
"learning_rate": 0.0004970894861075232,
"loss": 5.6727,
"mean_token_accuracy": 0.15406235307455063,
"num_tokens": 13003383.0,
"step": 7050
},
{
"entropy": 5.771741485595703,
"epoch": 0.5927326191976475,
"grad_norm": 1.1484375,
"learning_rate": 0.0004970846829787309,
"loss": 5.6122,
"mean_token_accuracy": 0.15840743184089662,
"num_tokens": 13012550.0,
"step": 7055
},
{
"entropy": 5.776666688919067,
"epoch": 0.5931526990128124,
"grad_norm": 1.234375,
"learning_rate": 0.0004970798759158227,
"loss": 5.6561,
"mean_token_accuracy": 0.1523322269320488,
"num_tokens": 13022066.0,
"step": 7060
},
{
"entropy": 5.756951570510864,
"epoch": 0.5935727788279773,
"grad_norm": 1.2734375,
"learning_rate": 0.0004970750649188839,
"loss": 5.6481,
"mean_token_accuracy": 0.15815414711833,
"num_tokens": 13031008.0,
"step": 7065
},
{
"entropy": 5.679452228546142,
"epoch": 0.5939928586431422,
"grad_norm": 1.1953125,
"learning_rate": 0.0004970702499879998,
"loss": 5.6268,
"mean_token_accuracy": 0.15596338957548142,
"num_tokens": 13040366.0,
"step": 7070
},
{
"entropy": 5.706829309463501,
"epoch": 0.5944129384583071,
"grad_norm": 1.1015625,
"learning_rate": 0.0004970654311232554,
"loss": 5.6516,
"mean_token_accuracy": 0.15320228338241576,
"num_tokens": 13051140.0,
"step": 7075
},
{
"entropy": 5.750200080871582,
"epoch": 0.594833018273472,
"grad_norm": 1.2734375,
"learning_rate": 0.0004970606083247362,
"loss": 5.5899,
"mean_token_accuracy": 0.15294858068227768,
"num_tokens": 13059835.0,
"step": 7080
},
{
"entropy": 5.651260042190552,
"epoch": 0.5952530980886368,
"grad_norm": 1.1640625,
"learning_rate": 0.0004970557815925278,
"loss": 5.5402,
"mean_token_accuracy": 0.1561323568224907,
"num_tokens": 13068909.0,
"step": 7085
},
{
"entropy": 5.683264064788818,
"epoch": 0.5956731779038017,
"grad_norm": 1.1640625,
"learning_rate": 0.0004970509509267155,
"loss": 5.5946,
"mean_token_accuracy": 0.15663747787475585,
"num_tokens": 13078380.0,
"step": 7090
},
{
"entropy": 5.806794452667236,
"epoch": 0.5960932577189666,
"grad_norm": 1.1796875,
"learning_rate": 0.0004970461163273849,
"loss": 5.6274,
"mean_token_accuracy": 0.15849297642707824,
"num_tokens": 13087774.0,
"step": 7095
},
{
"entropy": 5.651058006286621,
"epoch": 0.5965133375341315,
"grad_norm": 1.234375,
"learning_rate": 0.0004970412777946219,
"loss": 5.4687,
"mean_token_accuracy": 0.1637790635228157,
"num_tokens": 13095938.0,
"step": 7100
},
{
"entropy": 5.640749740600586,
"epoch": 0.5969334173492964,
"grad_norm": 1.203125,
"learning_rate": 0.0004970364353285117,
"loss": 5.6397,
"mean_token_accuracy": 0.1581492930650711,
"num_tokens": 13104661.0,
"step": 7105
},
{
"entropy": 5.792786121368408,
"epoch": 0.5973534971644613,
"grad_norm": 1.2578125,
"learning_rate": 0.0004970315889291405,
"loss": 5.5999,
"mean_token_accuracy": 0.15228829234838487,
"num_tokens": 13114505.0,
"step": 7110
},
{
"entropy": 5.620942783355713,
"epoch": 0.5977735769796261,
"grad_norm": 1.140625,
"learning_rate": 0.0004970267385965941,
"loss": 5.5679,
"mean_token_accuracy": 0.15517654567956923,
"num_tokens": 13124590.0,
"step": 7115
},
{
"entropy": 5.6427405834197994,
"epoch": 0.598193656794791,
"grad_norm": 1.296875,
"learning_rate": 0.0004970218843309583,
"loss": 5.549,
"mean_token_accuracy": 0.16136947572231292,
"num_tokens": 13134026.0,
"step": 7120
},
{
"entropy": 5.7891843795776365,
"epoch": 0.5986137366099559,
"grad_norm": 1.25,
"learning_rate": 0.0004970170261323192,
"loss": 5.7066,
"mean_token_accuracy": 0.1564835265278816,
"num_tokens": 13142654.0,
"step": 7125
},
{
"entropy": 5.692693042755127,
"epoch": 0.5990338164251208,
"grad_norm": 1.2109375,
"learning_rate": 0.0004970121640007627,
"loss": 5.6146,
"mean_token_accuracy": 0.15288418233394624,
"num_tokens": 13151177.0,
"step": 7130
},
{
"entropy": 5.707670879364014,
"epoch": 0.5994538962402857,
"grad_norm": 1.140625,
"learning_rate": 0.0004970072979363751,
"loss": 5.6171,
"mean_token_accuracy": 0.15406128615140915,
"num_tokens": 13159689.0,
"step": 7135
},
{
"entropy": 5.676187992095947,
"epoch": 0.5998739760554506,
"grad_norm": 1.2109375,
"learning_rate": 0.0004970024279392425,
"loss": 5.6408,
"mean_token_accuracy": 0.15284743309020996,
"num_tokens": 13168601.0,
"step": 7140
},
{
"entropy": 5.714297485351563,
"epoch": 0.6002940558706155,
"grad_norm": 1.2578125,
"learning_rate": 0.0004969975540094513,
"loss": 5.6171,
"mean_token_accuracy": 0.1539945885539055,
"num_tokens": 13177035.0,
"step": 7145
},
{
"entropy": 5.777990198135376,
"epoch": 0.6007141356857802,
"grad_norm": 1.2265625,
"learning_rate": 0.0004969926761470876,
"loss": 5.5848,
"mean_token_accuracy": 0.16266828924417495,
"num_tokens": 13185444.0,
"step": 7150
},
{
"entropy": 5.732035493850708,
"epoch": 0.6011342155009451,
"grad_norm": 1.1875,
"learning_rate": 0.000496987794352238,
"loss": 5.5854,
"mean_token_accuracy": 0.16005024313926697,
"num_tokens": 13194987.0,
"step": 7155
},
{
"entropy": 5.668026494979858,
"epoch": 0.60155429531611,
"grad_norm": 1.140625,
"learning_rate": 0.0004969829086249889,
"loss": 5.6295,
"mean_token_accuracy": 0.15632065534591674,
"num_tokens": 13203807.0,
"step": 7160
},
{
"entropy": 5.78542275428772,
"epoch": 0.6019743751312749,
"grad_norm": 1.1953125,
"learning_rate": 0.000496978018965427,
"loss": 5.7228,
"mean_token_accuracy": 0.1500210165977478,
"num_tokens": 13214362.0,
"step": 7165
},
{
"entropy": 5.81239013671875,
"epoch": 0.6023944549464398,
"grad_norm": 1.2421875,
"learning_rate": 0.0004969731253736387,
"loss": 5.7303,
"mean_token_accuracy": 0.15099334120750427,
"num_tokens": 13224192.0,
"step": 7170
},
{
"entropy": 5.714849805831909,
"epoch": 0.6028145347616047,
"grad_norm": 1.140625,
"learning_rate": 0.0004969682278497109,
"loss": 5.694,
"mean_token_accuracy": 0.15593211725354195,
"num_tokens": 13234430.0,
"step": 7175
},
{
"entropy": 5.6827106952667235,
"epoch": 0.6032346145767696,
"grad_norm": 1.1875,
"learning_rate": 0.0004969633263937301,
"loss": 5.5785,
"mean_token_accuracy": 0.15635734647512436,
"num_tokens": 13243681.0,
"step": 7180
},
{
"entropy": 5.8973612785339355,
"epoch": 0.6036546943919344,
"grad_norm": 1.1640625,
"learning_rate": 0.0004969584210057832,
"loss": 5.8472,
"mean_token_accuracy": 0.14447207152843475,
"num_tokens": 13254334.0,
"step": 7185
},
{
"entropy": 5.908036136627198,
"epoch": 0.6040747742070993,
"grad_norm": 1.09375,
"learning_rate": 0.0004969535116859573,
"loss": 5.669,
"mean_token_accuracy": 0.1561342217028141,
"num_tokens": 13263781.0,
"step": 7190
},
{
"entropy": 5.671984529495239,
"epoch": 0.6044948540222642,
"grad_norm": 1.2109375,
"learning_rate": 0.0004969485984343392,
"loss": 5.5792,
"mean_token_accuracy": 0.155980384349823,
"num_tokens": 13272831.0,
"step": 7195
},
{
"entropy": 5.727685928344727,
"epoch": 0.6049149338374291,
"grad_norm": 1.359375,
"learning_rate": 0.000496943681251016,
"loss": 5.6261,
"mean_token_accuracy": 0.15507230162620544,
"num_tokens": 13281621.0,
"step": 7200
},
{
"entropy": 5.654034233093261,
"epoch": 0.605335013652594,
"grad_norm": 1.21875,
"learning_rate": 0.0004969387601360747,
"loss": 5.6026,
"mean_token_accuracy": 0.14952811896800994,
"num_tokens": 13291021.0,
"step": 7205
},
{
"entropy": 5.738556241989135,
"epoch": 0.6057550934677589,
"grad_norm": 1.2890625,
"learning_rate": 0.0004969338350896026,
"loss": 5.6241,
"mean_token_accuracy": 0.15897882282733916,
"num_tokens": 13299752.0,
"step": 7210
},
{
"entropy": 5.826848030090332,
"epoch": 0.6061751732829238,
"grad_norm": 1.2109375,
"learning_rate": 0.0004969289061116869,
"loss": 5.6755,
"mean_token_accuracy": 0.14820482730865478,
"num_tokens": 13309112.0,
"step": 7215
},
{
"entropy": 5.764469861984253,
"epoch": 0.6065952530980886,
"grad_norm": 1.2578125,
"learning_rate": 0.0004969239732024148,
"loss": 5.6566,
"mean_token_accuracy": 0.15721848756074905,
"num_tokens": 13318328.0,
"step": 7220
},
{
"entropy": 5.5925215721130375,
"epoch": 0.6070153329132535,
"grad_norm": 1.125,
"learning_rate": 0.0004969190363618739,
"loss": 5.5457,
"mean_token_accuracy": 0.1585030034184456,
"num_tokens": 13328940.0,
"step": 7225
},
{
"entropy": 5.665217399597168,
"epoch": 0.6074354127284184,
"grad_norm": 1.28125,
"learning_rate": 0.0004969140955901516,
"loss": 5.5687,
"mean_token_accuracy": 0.16284290254116057,
"num_tokens": 13337829.0,
"step": 7230
},
{
"entropy": 5.850957870483398,
"epoch": 0.6078554925435833,
"grad_norm": 1.09375,
"learning_rate": 0.0004969091508873352,
"loss": 5.7334,
"mean_token_accuracy": 0.14851107820868492,
"num_tokens": 13348289.0,
"step": 7235
},
{
"entropy": 5.759013700485229,
"epoch": 0.6082755723587482,
"grad_norm": 1.1875,
"learning_rate": 0.0004969042022535126,
"loss": 5.653,
"mean_token_accuracy": 0.15454121828079223,
"num_tokens": 13357292.0,
"step": 7240
},
{
"entropy": 5.779410743713379,
"epoch": 0.6086956521739131,
"grad_norm": 1.25,
"learning_rate": 0.0004968992496887713,
"loss": 5.687,
"mean_token_accuracy": 0.15323319733142854,
"num_tokens": 13366640.0,
"step": 7245
},
{
"entropy": 5.730707216262817,
"epoch": 0.609115731989078,
"grad_norm": 1.0703125,
"learning_rate": 0.0004968942931931989,
"loss": 5.5924,
"mean_token_accuracy": 0.16110291406512262,
"num_tokens": 13377509.0,
"step": 7250
},
{
"entropy": 5.706106567382813,
"epoch": 0.6095358118042428,
"grad_norm": 1.28125,
"learning_rate": 0.0004968893327668835,
"loss": 5.6956,
"mean_token_accuracy": 0.14979534447193146,
"num_tokens": 13386573.0,
"step": 7255
},
{
"entropy": 5.660001468658447,
"epoch": 0.6099558916194077,
"grad_norm": 1.1015625,
"learning_rate": 0.0004968843684099128,
"loss": 5.5449,
"mean_token_accuracy": 0.15726414173841477,
"num_tokens": 13395790.0,
"step": 7260
},
{
"entropy": 5.721255302429199,
"epoch": 0.6103759714345726,
"grad_norm": 1.328125,
"learning_rate": 0.0004968794001223747,
"loss": 5.5967,
"mean_token_accuracy": 0.15050944983959197,
"num_tokens": 13405265.0,
"step": 7265
},
{
"entropy": 5.687171363830567,
"epoch": 0.6107960512497375,
"grad_norm": 1.265625,
"learning_rate": 0.0004968744279043574,
"loss": 5.5956,
"mean_token_accuracy": 0.1621822014451027,
"num_tokens": 13413796.0,
"step": 7270
},
{
"entropy": 5.776734972000122,
"epoch": 0.6112161310649024,
"grad_norm": 1.234375,
"learning_rate": 0.0004968694517559488,
"loss": 5.6438,
"mean_token_accuracy": 0.15677839443087577,
"num_tokens": 13423299.0,
"step": 7275
},
{
"entropy": 5.688075065612793,
"epoch": 0.6116362108800673,
"grad_norm": 1.21875,
"learning_rate": 0.0004968644716772371,
"loss": 5.5675,
"mean_token_accuracy": 0.16029704958200455,
"num_tokens": 13432267.0,
"step": 7280
},
{
"entropy": 5.672435188293457,
"epoch": 0.612056290695232,
"grad_norm": 1.265625,
"learning_rate": 0.0004968594876683105,
"loss": 5.6545,
"mean_token_accuracy": 0.151273912191391,
"num_tokens": 13442332.0,
"step": 7285
},
{
"entropy": 5.711434841156006,
"epoch": 0.6124763705103969,
"grad_norm": 1.21875,
"learning_rate": 0.0004968544997292572,
"loss": 5.6178,
"mean_token_accuracy": 0.15658531337976456,
"num_tokens": 13451700.0,
"step": 7290
},
{
"entropy": 5.742845582962036,
"epoch": 0.6128964503255618,
"grad_norm": 1.203125,
"learning_rate": 0.0004968495078601659,
"loss": 5.7153,
"mean_token_accuracy": 0.14965228438377381,
"num_tokens": 13461009.0,
"step": 7295
},
{
"entropy": 5.795819425582886,
"epoch": 0.6133165301407267,
"grad_norm": 1.2265625,
"learning_rate": 0.0004968445120611247,
"loss": 5.7035,
"mean_token_accuracy": 0.1513729974627495,
"num_tokens": 13470341.0,
"step": 7300
},
{
"entropy": 5.769411659240722,
"epoch": 0.6137366099558916,
"grad_norm": 1.1328125,
"learning_rate": 0.0004968395123322223,
"loss": 5.6378,
"mean_token_accuracy": 0.15624384433031083,
"num_tokens": 13479898.0,
"step": 7305
},
{
"entropy": 5.633978939056396,
"epoch": 0.6141566897710565,
"grad_norm": 1.1875,
"learning_rate": 0.000496834508673547,
"loss": 5.5368,
"mean_token_accuracy": 0.1608336254954338,
"num_tokens": 13488116.0,
"step": 7310
},
{
"entropy": 5.718822479248047,
"epoch": 0.6145767695862214,
"grad_norm": 1.171875,
"learning_rate": 0.0004968295010851877,
"loss": 5.6043,
"mean_token_accuracy": 0.15767939537763595,
"num_tokens": 13497814.0,
"step": 7315
},
{
"entropy": 5.695955896377564,
"epoch": 0.6149968494013862,
"grad_norm": 1.28125,
"learning_rate": 0.0004968244895672331,
"loss": 5.5826,
"mean_token_accuracy": 0.15185445845127105,
"num_tokens": 13506617.0,
"step": 7320
},
{
"entropy": 5.676547336578369,
"epoch": 0.6154169292165511,
"grad_norm": 1.171875,
"learning_rate": 0.0004968194741197718,
"loss": 5.7503,
"mean_token_accuracy": 0.14818976521492006,
"num_tokens": 13516632.0,
"step": 7325
},
{
"entropy": 5.856671953201294,
"epoch": 0.615837009031716,
"grad_norm": 1.2421875,
"learning_rate": 0.0004968144547428927,
"loss": 5.6583,
"mean_token_accuracy": 0.15702558159828187,
"num_tokens": 13526452.0,
"step": 7330
},
{
"entropy": 5.799902677536011,
"epoch": 0.6162570888468809,
"grad_norm": 1.328125,
"learning_rate": 0.0004968094314366848,
"loss": 5.5676,
"mean_token_accuracy": 0.15817629396915436,
"num_tokens": 13535663.0,
"step": 7335
},
{
"entropy": 5.622874593734741,
"epoch": 0.6166771686620458,
"grad_norm": 1.1328125,
"learning_rate": 0.000496804404201237,
"loss": 5.5007,
"mean_token_accuracy": 0.17091045528650284,
"num_tokens": 13544574.0,
"step": 7340
},
{
"entropy": 5.760069704055786,
"epoch": 0.6170972484772107,
"grad_norm": 1.328125,
"learning_rate": 0.0004967993730366385,
"loss": 5.6752,
"mean_token_accuracy": 0.15604218244552612,
"num_tokens": 13553041.0,
"step": 7345
},
{
"entropy": 5.671512126922607,
"epoch": 0.6175173282923756,
"grad_norm": 1.21875,
"learning_rate": 0.0004967943379429781,
"loss": 5.6417,
"mean_token_accuracy": 0.15232662558555604,
"num_tokens": 13562108.0,
"step": 7350
},
{
"entropy": 5.906947946548462,
"epoch": 0.6179374081075404,
"grad_norm": 1.1484375,
"learning_rate": 0.0004967892989203454,
"loss": 5.7739,
"mean_token_accuracy": 0.14764633178710937,
"num_tokens": 13571500.0,
"step": 7355
},
{
"entropy": 5.8263520240783695,
"epoch": 0.6183574879227053,
"grad_norm": 1.1640625,
"learning_rate": 0.0004967842559688295,
"loss": 5.6844,
"mean_token_accuracy": 0.1520024761557579,
"num_tokens": 13581304.0,
"step": 7360
},
{
"entropy": 5.683232069015503,
"epoch": 0.6187775677378702,
"grad_norm": 1.15625,
"learning_rate": 0.0004967792090885195,
"loss": 5.5609,
"mean_token_accuracy": 0.16256952211260794,
"num_tokens": 13590734.0,
"step": 7365
},
{
"entropy": 5.66082911491394,
"epoch": 0.6191976475530351,
"grad_norm": 1.171875,
"learning_rate": 0.0004967741582795052,
"loss": 5.6304,
"mean_token_accuracy": 0.1554405942559242,
"num_tokens": 13600486.0,
"step": 7370
},
{
"entropy": 5.834723377227784,
"epoch": 0.6196177273682,
"grad_norm": 1.125,
"learning_rate": 0.0004967691035418758,
"loss": 5.6532,
"mean_token_accuracy": 0.14801207035779954,
"num_tokens": 13610542.0,
"step": 7375
},
{
"entropy": 5.717353200912475,
"epoch": 0.6200378071833649,
"grad_norm": 1.2265625,
"learning_rate": 0.000496764044875721,
"loss": 5.5897,
"mean_token_accuracy": 0.160822394490242,
"num_tokens": 13619431.0,
"step": 7380
},
{
"entropy": 5.72810926437378,
"epoch": 0.6204578869985298,
"grad_norm": 1.125,
"learning_rate": 0.0004967589822811303,
"loss": 5.6448,
"mean_token_accuracy": 0.15230367332696915,
"num_tokens": 13629930.0,
"step": 7385
},
{
"entropy": 5.810457468032837,
"epoch": 0.6208779668136946,
"grad_norm": 1.1484375,
"learning_rate": 0.0004967539157581934,
"loss": 5.7607,
"mean_token_accuracy": 0.1508558511734009,
"num_tokens": 13639439.0,
"step": 7390
},
{
"entropy": 5.810669803619385,
"epoch": 0.6212980466288595,
"grad_norm": 1.1953125,
"learning_rate": 0.000496748845307,
"loss": 5.681,
"mean_token_accuracy": 0.15527973473072051,
"num_tokens": 13648548.0,
"step": 7395
},
{
"entropy": 5.788920211791992,
"epoch": 0.6217181264440244,
"grad_norm": 1.1953125,
"learning_rate": 0.0004967437709276401,
"loss": 5.7221,
"mean_token_accuracy": 0.15710752308368683,
"num_tokens": 13657658.0,
"step": 7400
},
{
"entropy": 5.674347543716431,
"epoch": 0.6221382062591893,
"grad_norm": 1.171875,
"learning_rate": 0.0004967386926202034,
"loss": 5.4981,
"mean_token_accuracy": 0.1621797114610672,
"num_tokens": 13666763.0,
"step": 7405
},
{
"entropy": 5.782182407379151,
"epoch": 0.6225582860743542,
"grad_norm": 1.2421875,
"learning_rate": 0.00049673361038478,
"loss": 5.7373,
"mean_token_accuracy": 0.14591761454939842,
"num_tokens": 13676527.0,
"step": 7410
},
{
"entropy": 5.726846313476562,
"epoch": 0.622978365889519,
"grad_norm": 1.1640625,
"learning_rate": 0.0004967285242214599,
"loss": 5.711,
"mean_token_accuracy": 0.15441259145736694,
"num_tokens": 13685404.0,
"step": 7415
},
{
"entropy": 5.775293207168579,
"epoch": 0.6233984457046838,
"grad_norm": 1.3046875,
"learning_rate": 0.000496723434130333,
"loss": 5.5312,
"mean_token_accuracy": 0.15585086047649382,
"num_tokens": 13693118.0,
"step": 7420
},
{
"entropy": 5.682599878311157,
"epoch": 0.6238185255198487,
"grad_norm": 1.1796875,
"learning_rate": 0.0004967183401114898,
"loss": 5.5973,
"mean_token_accuracy": 0.15298613160848618,
"num_tokens": 13702015.0,
"step": 7425
},
{
"entropy": 5.709433698654175,
"epoch": 0.6242386053350136,
"grad_norm": 1.96875,
"learning_rate": 0.0004967132421650203,
"loss": 5.6205,
"mean_token_accuracy": 0.1487925611436367,
"num_tokens": 13711658.0,
"step": 7430
},
{
"entropy": 5.640313625335693,
"epoch": 0.6246586851501785,
"grad_norm": 1.25,
"learning_rate": 0.0004967081402910149,
"loss": 5.6281,
"mean_token_accuracy": 0.15437382012605666,
"num_tokens": 13720718.0,
"step": 7435
},
{
"entropy": 5.696998929977417,
"epoch": 0.6250787649653434,
"grad_norm": 1.3203125,
"learning_rate": 0.000496703034489564,
"loss": 5.4852,
"mean_token_accuracy": 0.16457302123308182,
"num_tokens": 13729364.0,
"step": 7440
},
{
"entropy": 5.812005949020386,
"epoch": 0.6254988447805083,
"grad_norm": 1.2421875,
"learning_rate": 0.0004966979247607579,
"loss": 5.7952,
"mean_token_accuracy": 0.1459818609058857,
"num_tokens": 13739436.0,
"step": 7445
},
{
"entropy": 5.809390115737915,
"epoch": 0.6259189245956732,
"grad_norm": 1.171875,
"learning_rate": 0.0004966928111046873,
"loss": 5.7157,
"mean_token_accuracy": 0.1619449883699417,
"num_tokens": 13749196.0,
"step": 7450
},
{
"entropy": 5.722574186325073,
"epoch": 0.626339004410838,
"grad_norm": 1.1328125,
"learning_rate": 0.0004966876935214426,
"loss": 5.5536,
"mean_token_accuracy": 0.15739037543535234,
"num_tokens": 13758414.0,
"step": 7455
},
{
"entropy": 5.696356010437012,
"epoch": 0.6267590842260029,
"grad_norm": 1.265625,
"learning_rate": 0.0004966825720111147,
"loss": 5.599,
"mean_token_accuracy": 0.15576763302087784,
"num_tokens": 13767496.0,
"step": 7460
},
{
"entropy": 5.731720113754273,
"epoch": 0.6271791640411678,
"grad_norm": 1.328125,
"learning_rate": 0.0004966774465737942,
"loss": 5.7364,
"mean_token_accuracy": 0.15644533336162567,
"num_tokens": 13777033.0,
"step": 7465
},
{
"entropy": 5.762928485870361,
"epoch": 0.6275992438563327,
"grad_norm": 1.140625,
"learning_rate": 0.0004966723172095717,
"loss": 5.6998,
"mean_token_accuracy": 0.15392984971404075,
"num_tokens": 13786313.0,
"step": 7470
},
{
"entropy": 5.734139251708984,
"epoch": 0.6280193236714976,
"grad_norm": 1.234375,
"learning_rate": 0.0004966671839185384,
"loss": 5.6214,
"mean_token_accuracy": 0.16077139377593994,
"num_tokens": 13795257.0,
"step": 7475
},
{
"entropy": 5.636916065216065,
"epoch": 0.6284394034866625,
"grad_norm": 1.2890625,
"learning_rate": 0.0004966620467007851,
"loss": 5.546,
"mean_token_accuracy": 0.16158626824617386,
"num_tokens": 13804582.0,
"step": 7480
},
{
"entropy": 5.686999893188476,
"epoch": 0.6288594833018274,
"grad_norm": 1.140625,
"learning_rate": 0.0004966569055564027,
"loss": 5.5142,
"mean_token_accuracy": 0.1591956153512001,
"num_tokens": 13813248.0,
"step": 7485
},
{
"entropy": 5.8127374172210695,
"epoch": 0.6292795631169922,
"grad_norm": 1.3046875,
"learning_rate": 0.0004966517604854823,
"loss": 5.7932,
"mean_token_accuracy": 0.14702995121479034,
"num_tokens": 13823301.0,
"step": 7490
},
{
"entropy": 5.716786479949951,
"epoch": 0.6296996429321571,
"grad_norm": 1.2265625,
"learning_rate": 0.0004966466114881152,
"loss": 5.527,
"mean_token_accuracy": 0.1616852879524231,
"num_tokens": 13832040.0,
"step": 7495
},
{
"entropy": 5.789406156539917,
"epoch": 0.630119722747322,
"grad_norm": 1.1796875,
"learning_rate": 0.0004966414585643925,
"loss": 5.708,
"mean_token_accuracy": 0.15371926724910737,
"num_tokens": 13841874.0,
"step": 7500
},
{
"entropy": 5.673316860198975,
"epoch": 0.6305398025624869,
"grad_norm": 1.21875,
"learning_rate": 0.0004966363017144055,
"loss": 5.5343,
"mean_token_accuracy": 0.16494474560022354,
"num_tokens": 13850755.0,
"step": 7505
},
{
"entropy": 5.633390045166015,
"epoch": 0.6309598823776518,
"grad_norm": 1.1953125,
"learning_rate": 0.0004966311409382455,
"loss": 5.6069,
"mean_token_accuracy": 0.1526731699705124,
"num_tokens": 13860009.0,
"step": 7510
},
{
"entropy": 5.645658397674561,
"epoch": 0.6313799621928167,
"grad_norm": 1.2890625,
"learning_rate": 0.0004966259762360039,
"loss": 5.5308,
"mean_token_accuracy": 0.15903031826019287,
"num_tokens": 13868476.0,
"step": 7515
},
{
"entropy": 5.6507611751556395,
"epoch": 0.6318000420079816,
"grad_norm": 1.2109375,
"learning_rate": 0.0004966208076077723,
"loss": 5.5521,
"mean_token_accuracy": 0.16062566936016082,
"num_tokens": 13877367.0,
"step": 7520
},
{
"entropy": 5.700343370437622,
"epoch": 0.6322201218231464,
"grad_norm": 1.2265625,
"learning_rate": 0.0004966156350536422,
"loss": 5.6312,
"mean_token_accuracy": 0.15362192541360856,
"num_tokens": 13885985.0,
"step": 7525
},
{
"entropy": 5.673612928390503,
"epoch": 0.6326402016383113,
"grad_norm": 1.1640625,
"learning_rate": 0.0004966104585737054,
"loss": 5.5463,
"mean_token_accuracy": 0.1552409939467907,
"num_tokens": 13895059.0,
"step": 7530
},
{
"entropy": 5.7044392108917235,
"epoch": 0.6330602814534761,
"grad_norm": 1.3828125,
"learning_rate": 0.0004966052781680534,
"loss": 5.6238,
"mean_token_accuracy": 0.15182910710573197,
"num_tokens": 13903789.0,
"step": 7535
},
{
"entropy": 5.762989282608032,
"epoch": 0.633480361268641,
"grad_norm": 1.1953125,
"learning_rate": 0.0004966000938367778,
"loss": 5.5906,
"mean_token_accuracy": 0.1539550706744194,
"num_tokens": 13913377.0,
"step": 7540
},
{
"entropy": 5.600185346603394,
"epoch": 0.6339004410838059,
"grad_norm": 1.1953125,
"learning_rate": 0.0004965949055799708,
"loss": 5.5409,
"mean_token_accuracy": 0.16593605130910874,
"num_tokens": 13922141.0,
"step": 7545
},
{
"entropy": 5.727922582626343,
"epoch": 0.6343205208989708,
"grad_norm": 1.2109375,
"learning_rate": 0.0004965897133977241,
"loss": 5.5956,
"mean_token_accuracy": 0.1511126011610031,
"num_tokens": 13930717.0,
"step": 7550
},
{
"entropy": 5.748683834075928,
"epoch": 0.6347406007141357,
"grad_norm": 1.1640625,
"learning_rate": 0.0004965845172901298,
"loss": 5.6662,
"mean_token_accuracy": 0.15031958371400833,
"num_tokens": 13940344.0,
"step": 7555
},
{
"entropy": 5.686314058303833,
"epoch": 0.6351606805293005,
"grad_norm": 1.375,
"learning_rate": 0.0004965793172572798,
"loss": 5.5218,
"mean_token_accuracy": 0.15951916426420212,
"num_tokens": 13948400.0,
"step": 7560
},
{
"entropy": 5.66354660987854,
"epoch": 0.6355807603444654,
"grad_norm": 1.1953125,
"learning_rate": 0.0004965741132992663,
"loss": 5.6187,
"mean_token_accuracy": 0.1530763328075409,
"num_tokens": 13957939.0,
"step": 7565
},
{
"entropy": 5.766050386428833,
"epoch": 0.6360008401596303,
"grad_norm": 1.21875,
"learning_rate": 0.0004965689054161814,
"loss": 5.6048,
"mean_token_accuracy": 0.15733802318572998,
"num_tokens": 13966943.0,
"step": 7570
},
{
"entropy": 5.686775827407837,
"epoch": 0.6364209199747952,
"grad_norm": 1.21875,
"learning_rate": 0.0004965636936081176,
"loss": 5.5227,
"mean_token_accuracy": 0.1530359521508217,
"num_tokens": 13975850.0,
"step": 7575
},
{
"entropy": 5.774266147613526,
"epoch": 0.6368409997899601,
"grad_norm": 1.2109375,
"learning_rate": 0.000496558477875167,
"loss": 5.6063,
"mean_token_accuracy": 0.16522209197282792,
"num_tokens": 13985059.0,
"step": 7580
},
{
"entropy": 5.737274599075318,
"epoch": 0.637261079605125,
"grad_norm": 1.203125,
"learning_rate": 0.000496553258217422,
"loss": 5.6641,
"mean_token_accuracy": 0.1459086686372757,
"num_tokens": 13993571.0,
"step": 7585
},
{
"entropy": 5.7590330123901365,
"epoch": 0.6376811594202898,
"grad_norm": 1.1640625,
"learning_rate": 0.0004965480346349751,
"loss": 5.6473,
"mean_token_accuracy": 0.15470026284456254,
"num_tokens": 14002326.0,
"step": 7590
},
{
"entropy": 5.850418996810913,
"epoch": 0.6381012392354547,
"grad_norm": 1.203125,
"learning_rate": 0.000496542807127919,
"loss": 5.7856,
"mean_token_accuracy": 0.14904989078640937,
"num_tokens": 14012002.0,
"step": 7595
},
{
"entropy": 5.688018846511841,
"epoch": 0.6385213190506196,
"grad_norm": 1.2265625,
"learning_rate": 0.000496537575696346,
"loss": 5.6618,
"mean_token_accuracy": 0.1517901375889778,
"num_tokens": 14022085.0,
"step": 7600
},
{
"entropy": 5.700750541687012,
"epoch": 0.6389413988657845,
"grad_norm": 1.296875,
"learning_rate": 0.0004965323403403488,
"loss": 5.5531,
"mean_token_accuracy": 0.15710408240556717,
"num_tokens": 14030706.0,
"step": 7605
},
{
"entropy": 5.692938947677613,
"epoch": 0.6393614786809494,
"grad_norm": 1.1875,
"learning_rate": 0.0004965271010600205,
"loss": 5.5622,
"mean_token_accuracy": 0.15845440477132797,
"num_tokens": 14039520.0,
"step": 7610
},
{
"entropy": 5.7362024784088135,
"epoch": 0.6397815584961143,
"grad_norm": 1.2421875,
"learning_rate": 0.0004965218578554535,
"loss": 5.6609,
"mean_token_accuracy": 0.1563500553369522,
"num_tokens": 14048407.0,
"step": 7615
},
{
"entropy": 5.681648254394531,
"epoch": 0.6402016383112792,
"grad_norm": 1.3125,
"learning_rate": 0.000496516610726741,
"loss": 5.5922,
"mean_token_accuracy": 0.16066278517246246,
"num_tokens": 14057534.0,
"step": 7620
},
{
"entropy": 5.708992671966553,
"epoch": 0.640621718126444,
"grad_norm": 1.2421875,
"learning_rate": 0.0004965113596739759,
"loss": 5.5528,
"mean_token_accuracy": 0.16508279591798783,
"num_tokens": 14065992.0,
"step": 7625
},
{
"entropy": 5.664807271957398,
"epoch": 0.6410417979416089,
"grad_norm": 1.2734375,
"learning_rate": 0.0004965061046972508,
"loss": 5.5339,
"mean_token_accuracy": 0.15689536631107331,
"num_tokens": 14074806.0,
"step": 7630
},
{
"entropy": 5.65765414237976,
"epoch": 0.6414618777567738,
"grad_norm": 1.1171875,
"learning_rate": 0.0004965008457966594,
"loss": 5.5964,
"mean_token_accuracy": 0.15954618155956268,
"num_tokens": 14083813.0,
"step": 7635
},
{
"entropy": 5.694348669052124,
"epoch": 0.6418819575719387,
"grad_norm": 1.2734375,
"learning_rate": 0.0004964955829722945,
"loss": 5.5392,
"mean_token_accuracy": 0.1576619863510132,
"num_tokens": 14092193.0,
"step": 7640
},
{
"entropy": 5.807347249984741,
"epoch": 0.6423020373871036,
"grad_norm": 1.1796875,
"learning_rate": 0.0004964903162242493,
"loss": 5.7404,
"mean_token_accuracy": 0.14879937767982482,
"num_tokens": 14102797.0,
"step": 7645
},
{
"entropy": 5.7141985416412355,
"epoch": 0.6427221172022685,
"grad_norm": 1.15625,
"learning_rate": 0.0004964850455526173,
"loss": 5.611,
"mean_token_accuracy": 0.15806291848421097,
"num_tokens": 14112226.0,
"step": 7650
},
{
"entropy": 5.599700927734375,
"epoch": 0.6431421970174334,
"grad_norm": 1.2265625,
"learning_rate": 0.0004964797709574917,
"loss": 5.5215,
"mean_token_accuracy": 0.157790507376194,
"num_tokens": 14121775.0,
"step": 7655
},
{
"entropy": 5.666961145401001,
"epoch": 0.6435622768325981,
"grad_norm": 1.09375,
"learning_rate": 0.000496474492438966,
"loss": 5.5204,
"mean_token_accuracy": 0.16159009486436843,
"num_tokens": 14130415.0,
"step": 7660
},
{
"entropy": 5.703153944015503,
"epoch": 0.643982356647763,
"grad_norm": 1.125,
"learning_rate": 0.0004964692099971338,
"loss": 5.5545,
"mean_token_accuracy": 0.16274037957191467,
"num_tokens": 14140204.0,
"step": 7665
},
{
"entropy": 5.666165065765381,
"epoch": 0.6444024364629279,
"grad_norm": 1.15625,
"learning_rate": 0.0004964639236320885,
"loss": 5.4914,
"mean_token_accuracy": 0.15845234841108322,
"num_tokens": 14149595.0,
"step": 7670
},
{
"entropy": 5.6308153629302975,
"epoch": 0.6448225162780928,
"grad_norm": 1.2421875,
"learning_rate": 0.0004964586333439239,
"loss": 5.5805,
"mean_token_accuracy": 0.15749624222517014,
"num_tokens": 14158865.0,
"step": 7675
},
{
"entropy": 5.667867422103882,
"epoch": 0.6452425960932577,
"grad_norm": 1.2890625,
"learning_rate": 0.0004964533391327335,
"loss": 5.5375,
"mean_token_accuracy": 0.16526372283697127,
"num_tokens": 14167962.0,
"step": 7680
},
{
"entropy": 5.699340200424194,
"epoch": 0.6456626759084226,
"grad_norm": 1.2578125,
"learning_rate": 0.0004964480409986113,
"loss": 5.5873,
"mean_token_accuracy": 0.16477292776107788,
"num_tokens": 14176479.0,
"step": 7685
},
{
"entropy": 5.758196306228638,
"epoch": 0.6460827557235875,
"grad_norm": 1.2109375,
"learning_rate": 0.0004964427389416512,
"loss": 5.5979,
"mean_token_accuracy": 0.1566400408744812,
"num_tokens": 14185408.0,
"step": 7690
},
{
"entropy": 5.606453227996826,
"epoch": 0.6465028355387523,
"grad_norm": 1.3203125,
"learning_rate": 0.000496437432961947,
"loss": 5.6113,
"mean_token_accuracy": 0.15897119492292405,
"num_tokens": 14194155.0,
"step": 7695
},
{
"entropy": 5.640574026107788,
"epoch": 0.6469229153539172,
"grad_norm": 1.1875,
"learning_rate": 0.0004964321230595925,
"loss": 5.6261,
"mean_token_accuracy": 0.1550075277686119,
"num_tokens": 14202779.0,
"step": 7700
},
{
"entropy": 5.836891317367554,
"epoch": 0.6473429951690821,
"grad_norm": 1.1796875,
"learning_rate": 0.0004964268092346821,
"loss": 5.8207,
"mean_token_accuracy": 0.146051287651062,
"num_tokens": 14212552.0,
"step": 7705
},
{
"entropy": 5.879545545578003,
"epoch": 0.647763074984247,
"grad_norm": 1.1484375,
"learning_rate": 0.0004964214914873098,
"loss": 5.6016,
"mean_token_accuracy": 0.15281028002500535,
"num_tokens": 14222783.0,
"step": 7710
},
{
"entropy": 5.548879718780517,
"epoch": 0.6481831547994119,
"grad_norm": 1.3046875,
"learning_rate": 0.0004964161698175697,
"loss": 5.4932,
"mean_token_accuracy": 0.1592596337199211,
"num_tokens": 14232085.0,
"step": 7715
},
{
"entropy": 5.6611487865448,
"epoch": 0.6486032346145768,
"grad_norm": 1.234375,
"learning_rate": 0.0004964108442255562,
"loss": 5.6532,
"mean_token_accuracy": 0.15136753022670746,
"num_tokens": 14241969.0,
"step": 7720
},
{
"entropy": 5.700356197357178,
"epoch": 0.6490233144297417,
"grad_norm": 1.3359375,
"learning_rate": 0.0004964055147113637,
"loss": 5.5627,
"mean_token_accuracy": 0.1610928788781166,
"num_tokens": 14251012.0,
"step": 7725
},
{
"entropy": 5.784811544418335,
"epoch": 0.6494433942449065,
"grad_norm": 1.3046875,
"learning_rate": 0.0004964001812750864,
"loss": 5.6746,
"mean_token_accuracy": 0.1543006032705307,
"num_tokens": 14261110.0,
"step": 7730
},
{
"entropy": 5.703812551498413,
"epoch": 0.6498634740600714,
"grad_norm": 1.234375,
"learning_rate": 0.000496394843916819,
"loss": 5.6434,
"mean_token_accuracy": 0.15608806014060975,
"num_tokens": 14270869.0,
"step": 7735
},
{
"entropy": 5.744104099273682,
"epoch": 0.6502835538752363,
"grad_norm": 1.25,
"learning_rate": 0.0004963895026366558,
"loss": 5.5992,
"mean_token_accuracy": 0.15321880877017974,
"num_tokens": 14279607.0,
"step": 7740
},
{
"entropy": 5.670469808578491,
"epoch": 0.6507036336904012,
"grad_norm": 1.1796875,
"learning_rate": 0.0004963841574346917,
"loss": 5.584,
"mean_token_accuracy": 0.15689192116260528,
"num_tokens": 14289282.0,
"step": 7745
},
{
"entropy": 5.666816091537475,
"epoch": 0.6511237135055661,
"grad_norm": 1.3125,
"learning_rate": 0.0004963788083110212,
"loss": 5.5329,
"mean_token_accuracy": 0.16109126657247544,
"num_tokens": 14298658.0,
"step": 7750
},
{
"entropy": 5.7850751876831055,
"epoch": 0.651543793320731,
"grad_norm": 1.2421875,
"learning_rate": 0.000496373455265739,
"loss": 5.6023,
"mean_token_accuracy": 0.15321561843156814,
"num_tokens": 14307832.0,
"step": 7755
},
{
"entropy": 5.6633306503295895,
"epoch": 0.6519638731358958,
"grad_norm": 1.25,
"learning_rate": 0.0004963680982989402,
"loss": 5.5331,
"mean_token_accuracy": 0.15990484803915023,
"num_tokens": 14317122.0,
"step": 7760
},
{
"entropy": 5.641975736618042,
"epoch": 0.6523839529510607,
"grad_norm": 1.3359375,
"learning_rate": 0.0004963627374107195,
"loss": 5.5617,
"mean_token_accuracy": 0.1605705052614212,
"num_tokens": 14326069.0,
"step": 7765
},
{
"entropy": 5.6536908626556395,
"epoch": 0.6528040327662256,
"grad_norm": 1.171875,
"learning_rate": 0.0004963573726011717,
"loss": 5.5586,
"mean_token_accuracy": 0.15782028138637544,
"num_tokens": 14335260.0,
"step": 7770
},
{
"entropy": 5.781147193908692,
"epoch": 0.6532241125813905,
"grad_norm": 1.25,
"learning_rate": 0.0004963520038703922,
"loss": 5.6589,
"mean_token_accuracy": 0.14450628608465194,
"num_tokens": 14345823.0,
"step": 7775
},
{
"entropy": 5.680127048492432,
"epoch": 0.6536441923965554,
"grad_norm": 1.2890625,
"learning_rate": 0.000496346631218476,
"loss": 5.5181,
"mean_token_accuracy": 0.15935117304325103,
"num_tokens": 14354316.0,
"step": 7780
},
{
"entropy": 5.658060073852539,
"epoch": 0.6540642722117203,
"grad_norm": 1.171875,
"learning_rate": 0.000496341254645518,
"loss": 5.5774,
"mean_token_accuracy": 0.1578880801796913,
"num_tokens": 14364539.0,
"step": 7785
},
{
"entropy": 5.6646044731140135,
"epoch": 0.6544843520268852,
"grad_norm": 1.15625,
"learning_rate": 0.0004963358741516138,
"loss": 5.6768,
"mean_token_accuracy": 0.14892476946115493,
"num_tokens": 14374081.0,
"step": 7790
},
{
"entropy": 5.715250015258789,
"epoch": 0.6549044318420499,
"grad_norm": 1.296875,
"learning_rate": 0.0004963304897368585,
"loss": 5.5744,
"mean_token_accuracy": 0.15301866233348846,
"num_tokens": 14383255.0,
"step": 7795
},
{
"entropy": 5.781266593933106,
"epoch": 0.6553245116572148,
"grad_norm": 1.3984375,
"learning_rate": 0.0004963251014013475,
"loss": 5.7162,
"mean_token_accuracy": 0.15105995386838914,
"num_tokens": 14392417.0,
"step": 7800
},
{
"entropy": 5.869361925125122,
"epoch": 0.6557445914723797,
"grad_norm": 1.546875,
"learning_rate": 0.0004963197091451763,
"loss": 5.7623,
"mean_token_accuracy": 0.14551858603954315,
"num_tokens": 14401899.0,
"step": 7805
},
{
"entropy": 5.800676774978638,
"epoch": 0.6561646712875446,
"grad_norm": 1.1875,
"learning_rate": 0.0004963143129684405,
"loss": 5.7137,
"mean_token_accuracy": 0.15205856338143348,
"num_tokens": 14411245.0,
"step": 7810
},
{
"entropy": 5.656564712524414,
"epoch": 0.6565847511027095,
"grad_norm": 1.265625,
"learning_rate": 0.0004963089128712355,
"loss": 5.5745,
"mean_token_accuracy": 0.1596169352531433,
"num_tokens": 14419710.0,
"step": 7815
},
{
"entropy": 5.685669898986816,
"epoch": 0.6570048309178744,
"grad_norm": 1.1953125,
"learning_rate": 0.0004963035088536571,
"loss": 5.541,
"mean_token_accuracy": 0.16597671955823898,
"num_tokens": 14430266.0,
"step": 7820
},
{
"entropy": 5.71679105758667,
"epoch": 0.6574249107330393,
"grad_norm": 1.2109375,
"learning_rate": 0.0004962981009158012,
"loss": 5.5222,
"mean_token_accuracy": 0.15457252264022828,
"num_tokens": 14439515.0,
"step": 7825
},
{
"entropy": 5.671058607101441,
"epoch": 0.6578449905482041,
"grad_norm": 1.453125,
"learning_rate": 0.0004962926890577632,
"loss": 5.6006,
"mean_token_accuracy": 0.15761883705854415,
"num_tokens": 14448091.0,
"step": 7830
},
{
"entropy": 5.678781509399414,
"epoch": 0.658265070363369,
"grad_norm": 1.1953125,
"learning_rate": 0.000496287273279639,
"loss": 5.6241,
"mean_token_accuracy": 0.15393756926059723,
"num_tokens": 14457744.0,
"step": 7835
},
{
"entropy": 5.75106201171875,
"epoch": 0.6586851501785339,
"grad_norm": 1.3359375,
"learning_rate": 0.000496281853581525,
"loss": 5.6013,
"mean_token_accuracy": 0.16268085986375808,
"num_tokens": 14467597.0,
"step": 7840
},
{
"entropy": 5.71776933670044,
"epoch": 0.6591052299936988,
"grad_norm": 1.3359375,
"learning_rate": 0.0004962764299635168,
"loss": 5.59,
"mean_token_accuracy": 0.158500038087368,
"num_tokens": 14476662.0,
"step": 7845
},
{
"entropy": 5.780879735946655,
"epoch": 0.6595253098088637,
"grad_norm": 1.2109375,
"learning_rate": 0.0004962710024257105,
"loss": 5.6777,
"mean_token_accuracy": 0.15187399610877036,
"num_tokens": 14486583.0,
"step": 7850
},
{
"entropy": 5.816821718215943,
"epoch": 0.6599453896240286,
"grad_norm": 1.171875,
"learning_rate": 0.0004962655709682025,
"loss": 5.6847,
"mean_token_accuracy": 0.15380629003047944,
"num_tokens": 14496528.0,
"step": 7855
},
{
"entropy": 5.759185695648194,
"epoch": 0.6603654694391935,
"grad_norm": 1.0859375,
"learning_rate": 0.0004962601355910887,
"loss": 5.6504,
"mean_token_accuracy": 0.14890926629304885,
"num_tokens": 14507026.0,
"step": 7860
},
{
"entropy": 5.616134738922119,
"epoch": 0.6607855492543583,
"grad_norm": 1.171875,
"learning_rate": 0.0004962546962944656,
"loss": 5.5158,
"mean_token_accuracy": 0.1625874564051628,
"num_tokens": 14516480.0,
"step": 7865
},
{
"entropy": 5.6247007846832275,
"epoch": 0.6612056290695232,
"grad_norm": 1.1875,
"learning_rate": 0.0004962492530784295,
"loss": 5.4622,
"mean_token_accuracy": 0.16726566851139069,
"num_tokens": 14525068.0,
"step": 7870
},
{
"entropy": 5.687610340118408,
"epoch": 0.6616257088846881,
"grad_norm": 1.3203125,
"learning_rate": 0.0004962438059430768,
"loss": 5.5961,
"mean_token_accuracy": 0.15713810473680495,
"num_tokens": 14534441.0,
"step": 7875
},
{
"entropy": 5.733529424667358,
"epoch": 0.662045788699853,
"grad_norm": 1.2734375,
"learning_rate": 0.0004962383548885039,
"loss": 5.6936,
"mean_token_accuracy": 0.15146275758743286,
"num_tokens": 14543026.0,
"step": 7880
},
{
"entropy": 5.70049557685852,
"epoch": 0.6624658685150179,
"grad_norm": 1.265625,
"learning_rate": 0.0004962328999148075,
"loss": 5.5372,
"mean_token_accuracy": 0.162800632417202,
"num_tokens": 14552068.0,
"step": 7885
},
{
"entropy": 5.766546583175659,
"epoch": 0.6628859483301828,
"grad_norm": 1.2421875,
"learning_rate": 0.0004962274410220842,
"loss": 5.6896,
"mean_token_accuracy": 0.15465587973594666,
"num_tokens": 14561587.0,
"step": 7890
},
{
"entropy": 5.830715274810791,
"epoch": 0.6633060281453477,
"grad_norm": 1.28125,
"learning_rate": 0.0004962219782104308,
"loss": 5.6782,
"mean_token_accuracy": 0.15767197906970978,
"num_tokens": 14571020.0,
"step": 7895
},
{
"entropy": 5.752756977081299,
"epoch": 0.6637261079605125,
"grad_norm": 1.265625,
"learning_rate": 0.0004962165114799439,
"loss": 5.6361,
"mean_token_accuracy": 0.15070140063762666,
"num_tokens": 14580638.0,
"step": 7900
},
{
"entropy": 5.657642221450805,
"epoch": 0.6641461877756774,
"grad_norm": 1.2578125,
"learning_rate": 0.0004962110408307204,
"loss": 5.5659,
"mean_token_accuracy": 0.1519331306219101,
"num_tokens": 14590173.0,
"step": 7905
},
{
"entropy": 5.627839040756226,
"epoch": 0.6645662675908423,
"grad_norm": 1.1796875,
"learning_rate": 0.0004962055662628571,
"loss": 5.5544,
"mean_token_accuracy": 0.1591130867600441,
"num_tokens": 14598635.0,
"step": 7910
},
{
"entropy": 5.728003406524659,
"epoch": 0.6649863474060071,
"grad_norm": 1.1796875,
"learning_rate": 0.0004962000877764513,
"loss": 5.5941,
"mean_token_accuracy": 0.15895481109619142,
"num_tokens": 14607233.0,
"step": 7915
},
{
"entropy": 5.8495190143585205,
"epoch": 0.665406427221172,
"grad_norm": 1.34375,
"learning_rate": 0.0004961946053715998,
"loss": 5.7522,
"mean_token_accuracy": 0.1442479744553566,
"num_tokens": 14617483.0,
"step": 7920
},
{
"entropy": 5.6968982219696045,
"epoch": 0.665826507036337,
"grad_norm": 1.265625,
"learning_rate": 0.0004961891190483997,
"loss": 5.5639,
"mean_token_accuracy": 0.15327939689159392,
"num_tokens": 14625805.0,
"step": 7925
},
{
"entropy": 5.619145631790161,
"epoch": 0.6662465868515017,
"grad_norm": 1.15625,
"learning_rate": 0.0004961836288069483,
"loss": 5.4968,
"mean_token_accuracy": 0.16205275803804398,
"num_tokens": 14634605.0,
"step": 7930
},
{
"entropy": 5.767789077758789,
"epoch": 0.6666666666666666,
"grad_norm": 1.1484375,
"learning_rate": 0.0004961781346473428,
"loss": 5.6948,
"mean_token_accuracy": 0.1465153157711029,
"num_tokens": 14644970.0,
"step": 7935
},
{
"entropy": 5.756932115554809,
"epoch": 0.6670867464818315,
"grad_norm": 1.203125,
"learning_rate": 0.0004961726365696805,
"loss": 5.573,
"mean_token_accuracy": 0.15876282155513763,
"num_tokens": 14655043.0,
"step": 7940
},
{
"entropy": 5.705471324920654,
"epoch": 0.6675068262969964,
"grad_norm": 1.203125,
"learning_rate": 0.0004961671345740589,
"loss": 5.5511,
"mean_token_accuracy": 0.15569487810134888,
"num_tokens": 14663994.0,
"step": 7945
},
{
"entropy": 5.643215370178223,
"epoch": 0.6679269061121613,
"grad_norm": 1.1796875,
"learning_rate": 0.0004961616286605753,
"loss": 5.5418,
"mean_token_accuracy": 0.1519337624311447,
"num_tokens": 14674101.0,
"step": 7950
},
{
"entropy": 5.652733421325683,
"epoch": 0.6683469859273262,
"grad_norm": 1.2578125,
"learning_rate": 0.0004961561188293273,
"loss": 5.6378,
"mean_token_accuracy": 0.15260846465826033,
"num_tokens": 14684156.0,
"step": 7955
},
{
"entropy": 5.650334310531616,
"epoch": 0.6687670657424911,
"grad_norm": 1.1796875,
"learning_rate": 0.0004961506050804126,
"loss": 5.5486,
"mean_token_accuracy": 0.16360556036233903,
"num_tokens": 14693223.0,
"step": 7960
},
{
"entropy": 5.747977447509766,
"epoch": 0.6691871455576559,
"grad_norm": 1.1640625,
"learning_rate": 0.000496145087413929,
"loss": 5.5514,
"mean_token_accuracy": 0.1563424676656723,
"num_tokens": 14702959.0,
"step": 7965
},
{
"entropy": 5.758147382736206,
"epoch": 0.6696072253728208,
"grad_norm": 1.21875,
"learning_rate": 0.0004961395658299737,
"loss": 5.6716,
"mean_token_accuracy": 0.15093171894550322,
"num_tokens": 14712146.0,
"step": 7970
},
{
"entropy": 5.668249607086182,
"epoch": 0.6700273051879857,
"grad_norm": 1.296875,
"learning_rate": 0.0004961340403286451,
"loss": 5.6011,
"mean_token_accuracy": 0.1509583607316017,
"num_tokens": 14721932.0,
"step": 7975
},
{
"entropy": 5.666338872909546,
"epoch": 0.6704473850031506,
"grad_norm": 1.1875,
"learning_rate": 0.0004961285109100408,
"loss": 5.5256,
"mean_token_accuracy": 0.16514174044132232,
"num_tokens": 14731080.0,
"step": 7980
},
{
"entropy": 5.632850456237793,
"epoch": 0.6708674648183155,
"grad_norm": 1.421875,
"learning_rate": 0.0004961229775742587,
"loss": 5.5381,
"mean_token_accuracy": 0.16417475491762162,
"num_tokens": 14740057.0,
"step": 7985
},
{
"entropy": 5.78202714920044,
"epoch": 0.6712875446334804,
"grad_norm": 1.296875,
"learning_rate": 0.000496117440321397,
"loss": 5.6093,
"mean_token_accuracy": 0.16053422838449477,
"num_tokens": 14748399.0,
"step": 7990
},
{
"entropy": 5.70674409866333,
"epoch": 0.6717076244486453,
"grad_norm": 1.2890625,
"learning_rate": 0.0004961118991515537,
"loss": 5.6082,
"mean_token_accuracy": 0.1542074903845787,
"num_tokens": 14757215.0,
"step": 7995
},
{
"entropy": 5.65750560760498,
"epoch": 0.6721277042638101,
"grad_norm": 1.203125,
"learning_rate": 0.000496106354064827,
"loss": 5.6398,
"mean_token_accuracy": 0.15751723647117616,
"num_tokens": 14766191.0,
"step": 8000
},
{
"entropy": 5.8624285697937015,
"epoch": 0.672547784078975,
"grad_norm": 1.125,
"learning_rate": 0.0004961008050613149,
"loss": 5.6822,
"mean_token_accuracy": 0.1490050807595253,
"num_tokens": 14775220.0,
"step": 8005
},
{
"entropy": 5.722703504562378,
"epoch": 0.6729678638941399,
"grad_norm": 1.2265625,
"learning_rate": 0.0004960952521411161,
"loss": 5.6323,
"mean_token_accuracy": 0.15520244240760803,
"num_tokens": 14784287.0,
"step": 8010
},
{
"entropy": 5.772007274627685,
"epoch": 0.6733879437093048,
"grad_norm": 1.1953125,
"learning_rate": 0.0004960896953043287,
"loss": 5.7014,
"mean_token_accuracy": 0.15136762484908103,
"num_tokens": 14794219.0,
"step": 8015
},
{
"entropy": 5.756883382797241,
"epoch": 0.6738080235244697,
"grad_norm": 1.328125,
"learning_rate": 0.0004960841345510511,
"loss": 5.6006,
"mean_token_accuracy": 0.15511505603790282,
"num_tokens": 14803324.0,
"step": 8020
},
{
"entropy": 5.709602546691895,
"epoch": 0.6742281033396346,
"grad_norm": 1.3125,
"learning_rate": 0.000496078569881382,
"loss": 5.6139,
"mean_token_accuracy": 0.15611841082572936,
"num_tokens": 14811963.0,
"step": 8025
},
{
"entropy": 5.684782934188843,
"epoch": 0.6746481831547995,
"grad_norm": 1.2734375,
"learning_rate": 0.0004960730012954198,
"loss": 5.5822,
"mean_token_accuracy": 0.1514737568795681,
"num_tokens": 14821903.0,
"step": 8030
},
{
"entropy": 5.651682996749878,
"epoch": 0.6750682629699643,
"grad_norm": 1.2578125,
"learning_rate": 0.0004960674287932634,
"loss": 5.5603,
"mean_token_accuracy": 0.15362193435430527,
"num_tokens": 14831215.0,
"step": 8035
},
{
"entropy": 5.724930715560913,
"epoch": 0.6754883427851291,
"grad_norm": 1.265625,
"learning_rate": 0.0004960618523750111,
"loss": 5.4999,
"mean_token_accuracy": 0.15814183801412582,
"num_tokens": 14840354.0,
"step": 8040
},
{
"entropy": 5.705191946029663,
"epoch": 0.675908422600294,
"grad_norm": 1.2265625,
"learning_rate": 0.000496056272040762,
"loss": 5.6671,
"mean_token_accuracy": 0.15655403584241867,
"num_tokens": 14849660.0,
"step": 8045
},
{
"entropy": 5.735822105407715,
"epoch": 0.6763285024154589,
"grad_norm": 1.234375,
"learning_rate": 0.0004960506877906149,
"loss": 5.5847,
"mean_token_accuracy": 0.15051980167627335,
"num_tokens": 14859819.0,
"step": 8050
},
{
"entropy": 5.742122793197632,
"epoch": 0.6767485822306238,
"grad_norm": 1.203125,
"learning_rate": 0.0004960450996246686,
"loss": 5.5912,
"mean_token_accuracy": 0.1597781151533127,
"num_tokens": 14869260.0,
"step": 8055
},
{
"entropy": 5.650514125823975,
"epoch": 0.6771686620457887,
"grad_norm": 1.2421875,
"learning_rate": 0.0004960395075430222,
"loss": 5.5622,
"mean_token_accuracy": 0.15572320222854613,
"num_tokens": 14878685.0,
"step": 8060
},
{
"entropy": 5.661490488052368,
"epoch": 0.6775887418609536,
"grad_norm": 1.1328125,
"learning_rate": 0.0004960339115457748,
"loss": 5.5594,
"mean_token_accuracy": 0.15551790744066238,
"num_tokens": 14888456.0,
"step": 8065
},
{
"entropy": 5.721035194396973,
"epoch": 0.6780088216761184,
"grad_norm": 1.40625,
"learning_rate": 0.0004960283116330255,
"loss": 5.6613,
"mean_token_accuracy": 0.1521642178297043,
"num_tokens": 14897401.0,
"step": 8070
},
{
"entropy": 5.722047758102417,
"epoch": 0.6784289014912833,
"grad_norm": 1.21875,
"learning_rate": 0.0004960227078048735,
"loss": 5.5854,
"mean_token_accuracy": 0.15904034078121185,
"num_tokens": 14906741.0,
"step": 8075
},
{
"entropy": 5.669973230361938,
"epoch": 0.6788489813064482,
"grad_norm": 1.328125,
"learning_rate": 0.0004960171000614179,
"loss": 5.4744,
"mean_token_accuracy": 0.16626458019018173,
"num_tokens": 14916002.0,
"step": 8080
},
{
"entropy": 5.575302028656006,
"epoch": 0.6792690611216131,
"grad_norm": 1.2421875,
"learning_rate": 0.0004960114884027583,
"loss": 5.4191,
"mean_token_accuracy": 0.17047906070947647,
"num_tokens": 14925247.0,
"step": 8085
},
{
"entropy": 5.6553647994995115,
"epoch": 0.679689140936778,
"grad_norm": 1.359375,
"learning_rate": 0.0004960058728289939,
"loss": 5.542,
"mean_token_accuracy": 0.1531473934650421,
"num_tokens": 14933925.0,
"step": 8090
},
{
"entropy": 5.774356412887573,
"epoch": 0.6801092207519429,
"grad_norm": 1.515625,
"learning_rate": 0.0004960002533402243,
"loss": 5.5971,
"mean_token_accuracy": 0.15796921104192735,
"num_tokens": 14943368.0,
"step": 8095
},
{
"entropy": 5.718150043487549,
"epoch": 0.6805293005671077,
"grad_norm": 1.125,
"learning_rate": 0.0004959946299365491,
"loss": 5.6373,
"mean_token_accuracy": 0.15140645951032639,
"num_tokens": 14953710.0,
"step": 8100
},
{
"entropy": 5.776417064666748,
"epoch": 0.6809493803822726,
"grad_norm": 1.2109375,
"learning_rate": 0.0004959890026180677,
"loss": 5.642,
"mean_token_accuracy": 0.15366872251033784,
"num_tokens": 14962814.0,
"step": 8105
},
{
"entropy": 5.610153675079346,
"epoch": 0.6813694601974375,
"grad_norm": 1.21875,
"learning_rate": 0.00049598337138488,
"loss": 5.5258,
"mean_token_accuracy": 0.16162641048431398,
"num_tokens": 14971631.0,
"step": 8110
},
{
"entropy": 5.704393625259399,
"epoch": 0.6817895400126024,
"grad_norm": 1.3671875,
"learning_rate": 0.0004959777362370855,
"loss": 5.5272,
"mean_token_accuracy": 0.15709181278944015,
"num_tokens": 14980528.0,
"step": 8115
},
{
"entropy": 5.752010297775269,
"epoch": 0.6822096198277673,
"grad_norm": 1.1640625,
"learning_rate": 0.0004959720971747843,
"loss": 5.5463,
"mean_token_accuracy": 0.15931246876716615,
"num_tokens": 14989331.0,
"step": 8120
},
{
"entropy": 5.641499423980713,
"epoch": 0.6826296996429322,
"grad_norm": 1.1796875,
"learning_rate": 0.0004959664541980762,
"loss": 5.5448,
"mean_token_accuracy": 0.16116672456264497,
"num_tokens": 14999403.0,
"step": 8125
},
{
"entropy": 5.716615200042725,
"epoch": 0.6830497794580971,
"grad_norm": 1.28125,
"learning_rate": 0.0004959608073070612,
"loss": 5.6204,
"mean_token_accuracy": 0.15285489484667777,
"num_tokens": 15009388.0,
"step": 8130
},
{
"entropy": 5.761466312408447,
"epoch": 0.6834698592732619,
"grad_norm": 1.15625,
"learning_rate": 0.0004959551565018392,
"loss": 5.551,
"mean_token_accuracy": 0.1617701143026352,
"num_tokens": 15018586.0,
"step": 8135
},
{
"entropy": 5.675375652313233,
"epoch": 0.6838899390884268,
"grad_norm": 1.265625,
"learning_rate": 0.0004959495017825104,
"loss": 5.5729,
"mean_token_accuracy": 0.1646643817424774,
"num_tokens": 15027982.0,
"step": 8140
},
{
"entropy": 5.645266532897949,
"epoch": 0.6843100189035917,
"grad_norm": 1.2734375,
"learning_rate": 0.0004959438431491749,
"loss": 5.5486,
"mean_token_accuracy": 0.16084384471178054,
"num_tokens": 15037103.0,
"step": 8145
},
{
"entropy": 5.652230548858642,
"epoch": 0.6847300987187566,
"grad_norm": 1.21875,
"learning_rate": 0.000495938180601933,
"loss": 5.6586,
"mean_token_accuracy": 0.15451920107007028,
"num_tokens": 15046739.0,
"step": 8150
},
{
"entropy": 5.771371221542358,
"epoch": 0.6851501785339215,
"grad_norm": 1.1953125,
"learning_rate": 0.0004959325141408851,
"loss": 5.6031,
"mean_token_accuracy": 0.15792788416147233,
"num_tokens": 15056586.0,
"step": 8155
},
{
"entropy": 5.683104991912842,
"epoch": 0.6855702583490864,
"grad_norm": 1.2734375,
"learning_rate": 0.0004959268437661313,
"loss": 5.5742,
"mean_token_accuracy": 0.15851471424102784,
"num_tokens": 15066622.0,
"step": 8160
},
{
"entropy": 5.7034577369689945,
"epoch": 0.6859903381642513,
"grad_norm": 1.375,
"learning_rate": 0.0004959211694777724,
"loss": 5.5704,
"mean_token_accuracy": 0.160592782497406,
"num_tokens": 15075415.0,
"step": 8165
},
{
"entropy": 5.628656339645386,
"epoch": 0.686410417979416,
"grad_norm": 1.1328125,
"learning_rate": 0.0004959154912759086,
"loss": 5.5469,
"mean_token_accuracy": 0.16037230640649797,
"num_tokens": 15085087.0,
"step": 8170
},
{
"entropy": 5.674224853515625,
"epoch": 0.6868304977945809,
"grad_norm": 1.3046875,
"learning_rate": 0.0004959098091606406,
"loss": 5.5436,
"mean_token_accuracy": 0.1610500007867813,
"num_tokens": 15093580.0,
"step": 8175
},
{
"entropy": 5.649624395370483,
"epoch": 0.6872505776097458,
"grad_norm": 1.7421875,
"learning_rate": 0.0004959041231320692,
"loss": 5.5309,
"mean_token_accuracy": 0.16328124403953553,
"num_tokens": 15104033.0,
"step": 8180
},
{
"entropy": 5.689769697189331,
"epoch": 0.6876706574249107,
"grad_norm": 1.40625,
"learning_rate": 0.0004958984331902951,
"loss": 5.6276,
"mean_token_accuracy": 0.15330591350793837,
"num_tokens": 15113164.0,
"step": 8185
},
{
"entropy": 5.6513279438018795,
"epoch": 0.6880907372400756,
"grad_norm": 1.21875,
"learning_rate": 0.0004958927393354188,
"loss": 5.5384,
"mean_token_accuracy": 0.16279968321323396,
"num_tokens": 15122215.0,
"step": 8190
},
{
"entropy": 5.661105632781982,
"epoch": 0.6885108170552405,
"grad_norm": 1.2578125,
"learning_rate": 0.0004958870415675415,
"loss": 5.5442,
"mean_token_accuracy": 0.1579482913017273,
"num_tokens": 15130877.0,
"step": 8195
},
{
"entropy": 5.671978092193603,
"epoch": 0.6889308968704054,
"grad_norm": 1.296875,
"learning_rate": 0.0004958813398867639,
"loss": 5.5246,
"mean_token_accuracy": 0.16374406665563584,
"num_tokens": 15140227.0,
"step": 8200
},
{
"entropy": 5.849607276916504,
"epoch": 0.6893509766855702,
"grad_norm": 1.1796875,
"learning_rate": 0.0004958756342931872,
"loss": 5.6957,
"mean_token_accuracy": 0.14763638526201248,
"num_tokens": 15150006.0,
"step": 8205
},
{
"entropy": 5.712654685974121,
"epoch": 0.6897710565007351,
"grad_norm": 1.2734375,
"learning_rate": 0.0004958699247869122,
"loss": 5.5825,
"mean_token_accuracy": 0.15836212635040284,
"num_tokens": 15160032.0,
"step": 8210
},
{
"entropy": 5.662709379196167,
"epoch": 0.6901911363159,
"grad_norm": 1.1796875,
"learning_rate": 0.0004958642113680404,
"loss": 5.5373,
"mean_token_accuracy": 0.1649177625775337,
"num_tokens": 15168966.0,
"step": 8215
},
{
"entropy": 5.824816131591797,
"epoch": 0.6906112161310649,
"grad_norm": 1.5703125,
"learning_rate": 0.0004958584940366727,
"loss": 5.7236,
"mean_token_accuracy": 0.1509198695421219,
"num_tokens": 15179337.0,
"step": 8220
},
{
"entropy": 5.735790348052978,
"epoch": 0.6910312959462298,
"grad_norm": 1.1953125,
"learning_rate": 0.0004958527727929106,
"loss": 5.6149,
"mean_token_accuracy": 0.15628711581230165,
"num_tokens": 15188395.0,
"step": 8225
},
{
"entropy": 5.690367364883423,
"epoch": 0.6914513757613947,
"grad_norm": 1.1953125,
"learning_rate": 0.0004958470476368552,
"loss": 5.5466,
"mean_token_accuracy": 0.16225921884179115,
"num_tokens": 15198669.0,
"step": 8230
},
{
"entropy": 5.666697883605957,
"epoch": 0.6918714555765595,
"grad_norm": 1.2578125,
"learning_rate": 0.0004958413185686082,
"loss": 5.5682,
"mean_token_accuracy": 0.15779572129249572,
"num_tokens": 15207371.0,
"step": 8235
},
{
"entropy": 5.701167821884155,
"epoch": 0.6922915353917244,
"grad_norm": 1.1875,
"learning_rate": 0.0004958355855882709,
"loss": 5.5849,
"mean_token_accuracy": 0.15898908376693727,
"num_tokens": 15215694.0,
"step": 8240
},
{
"entropy": 5.754236125946045,
"epoch": 0.6927116152068893,
"grad_norm": 1.3125,
"learning_rate": 0.000495829848695945,
"loss": 5.5756,
"mean_token_accuracy": 0.15326517820358276,
"num_tokens": 15224963.0,
"step": 8245
},
{
"entropy": 5.597766494750976,
"epoch": 0.6931316950220542,
"grad_norm": 1.234375,
"learning_rate": 0.000495824107891732,
"loss": 5.3993,
"mean_token_accuracy": 0.16624577939510346,
"num_tokens": 15233569.0,
"step": 8250
},
{
"entropy": 5.594980192184448,
"epoch": 0.6935517748372191,
"grad_norm": 1.375,
"learning_rate": 0.0004958183631757336,
"loss": 5.5792,
"mean_token_accuracy": 0.15748120099306107,
"num_tokens": 15242671.0,
"step": 8255
},
{
"entropy": 5.651359605789184,
"epoch": 0.693971854652384,
"grad_norm": 1.2265625,
"learning_rate": 0.0004958126145480517,
"loss": 5.5311,
"mean_token_accuracy": 0.16182665377855301,
"num_tokens": 15251698.0,
"step": 8260
},
{
"entropy": 5.7826704502105715,
"epoch": 0.6943919344675489,
"grad_norm": 1.3359375,
"learning_rate": 0.0004958068620087879,
"loss": 5.6492,
"mean_token_accuracy": 0.1562219113111496,
"num_tokens": 15260608.0,
"step": 8265
},
{
"entropy": 5.599528217315674,
"epoch": 0.6948120142827137,
"grad_norm": 1.3203125,
"learning_rate": 0.0004958011055580443,
"loss": 5.5184,
"mean_token_accuracy": 0.1627250775694847,
"num_tokens": 15268866.0,
"step": 8270
},
{
"entropy": 5.586710405349732,
"epoch": 0.6952320940978786,
"grad_norm": 1.21875,
"learning_rate": 0.0004957953451959229,
"loss": 5.4797,
"mean_token_accuracy": 0.1718572720885277,
"num_tokens": 15277600.0,
"step": 8275
},
{
"entropy": 5.653484869003296,
"epoch": 0.6956521739130435,
"grad_norm": 1.1640625,
"learning_rate": 0.0004957895809225254,
"loss": 5.4978,
"mean_token_accuracy": 0.1609959051012993,
"num_tokens": 15286016.0,
"step": 8280
},
{
"entropy": 5.688267993927002,
"epoch": 0.6960722537282084,
"grad_norm": 1.2578125,
"learning_rate": 0.0004957838127379544,
"loss": 5.5499,
"mean_token_accuracy": 0.1622077226638794,
"num_tokens": 15294676.0,
"step": 8285
},
{
"entropy": 5.697055292129517,
"epoch": 0.6964923335433733,
"grad_norm": 1.1796875,
"learning_rate": 0.0004957780406423118,
"loss": 5.5451,
"mean_token_accuracy": 0.15666759461164476,
"num_tokens": 15304084.0,
"step": 8290
},
{
"entropy": 5.6490225315094,
"epoch": 0.6969124133585382,
"grad_norm": 1.296875,
"learning_rate": 0.0004957722646356999,
"loss": 5.5443,
"mean_token_accuracy": 0.15901953727006912,
"num_tokens": 15314182.0,
"step": 8295
},
{
"entropy": 5.695853900909424,
"epoch": 0.697332493173703,
"grad_norm": 1.1875,
"learning_rate": 0.0004957664847182209,
"loss": 5.6596,
"mean_token_accuracy": 0.15351271778345107,
"num_tokens": 15324213.0,
"step": 8300
},
{
"entropy": 5.770386505126953,
"epoch": 0.6977525729888678,
"grad_norm": 1.3359375,
"learning_rate": 0.0004957607008899774,
"loss": 5.5987,
"mean_token_accuracy": 0.15388550460338593,
"num_tokens": 15333122.0,
"step": 8305
},
{
"entropy": 5.790099620819092,
"epoch": 0.6981726528040327,
"grad_norm": 1.234375,
"learning_rate": 0.0004957549131510717,
"loss": 5.6877,
"mean_token_accuracy": 0.146911858022213,
"num_tokens": 15342199.0,
"step": 8310
},
{
"entropy": 5.784045934677124,
"epoch": 0.6985927326191976,
"grad_norm": 1.171875,
"learning_rate": 0.0004957491215016065,
"loss": 5.6357,
"mean_token_accuracy": 0.14943675845861434,
"num_tokens": 15352463.0,
"step": 8315
},
{
"entropy": 5.574797677993774,
"epoch": 0.6990128124343625,
"grad_norm": 1.25,
"learning_rate": 0.0004957433259416841,
"loss": 5.4843,
"mean_token_accuracy": 0.15912050753831863,
"num_tokens": 15361815.0,
"step": 8320
},
{
"entropy": 5.761810731887818,
"epoch": 0.6994328922495274,
"grad_norm": 1.2734375,
"learning_rate": 0.0004957375264714075,
"loss": 5.5941,
"mean_token_accuracy": 0.14885254427790642,
"num_tokens": 15371773.0,
"step": 8325
},
{
"entropy": 5.640265989303589,
"epoch": 0.6998529720646923,
"grad_norm": 1.171875,
"learning_rate": 0.0004957317230908792,
"loss": 5.5488,
"mean_token_accuracy": 0.15723545998334884,
"num_tokens": 15380881.0,
"step": 8330
},
{
"entropy": 5.614818906784057,
"epoch": 0.7002730518798572,
"grad_norm": 1.2578125,
"learning_rate": 0.0004957259158002022,
"loss": 5.4189,
"mean_token_accuracy": 0.1693311810493469,
"num_tokens": 15389310.0,
"step": 8335
},
{
"entropy": 5.604830265045166,
"epoch": 0.700693131695022,
"grad_norm": 1.21875,
"learning_rate": 0.0004957201045994791,
"loss": 5.5215,
"mean_token_accuracy": 0.16204681545495986,
"num_tokens": 15398584.0,
"step": 8340
},
{
"entropy": 5.675758123397827,
"epoch": 0.7011132115101869,
"grad_norm": 1.21875,
"learning_rate": 0.0004957142894888131,
"loss": 5.5447,
"mean_token_accuracy": 0.1662557229399681,
"num_tokens": 15407208.0,
"step": 8345
},
{
"entropy": 5.706802701950073,
"epoch": 0.7015332913253518,
"grad_norm": 1.296875,
"learning_rate": 0.0004957084704683071,
"loss": 5.585,
"mean_token_accuracy": 0.16073613464832306,
"num_tokens": 15416474.0,
"step": 8350
},
{
"entropy": 5.709591579437256,
"epoch": 0.7019533711405167,
"grad_norm": 1.296875,
"learning_rate": 0.0004957026475380642,
"loss": 5.5815,
"mean_token_accuracy": 0.15872790813446044,
"num_tokens": 15426101.0,
"step": 8355
},
{
"entropy": 5.750075244903565,
"epoch": 0.7023734509556816,
"grad_norm": 1.3125,
"learning_rate": 0.0004956968206981875,
"loss": 5.6179,
"mean_token_accuracy": 0.15739136934280396,
"num_tokens": 15435910.0,
"step": 8360
},
{
"entropy": 5.7634851932525635,
"epoch": 0.7027935307708465,
"grad_norm": 1.1796875,
"learning_rate": 0.0004956909899487803,
"loss": 5.6603,
"mean_token_accuracy": 0.1546674281358719,
"num_tokens": 15445494.0,
"step": 8365
},
{
"entropy": 5.677435827255249,
"epoch": 0.7032136105860114,
"grad_norm": 1.1875,
"learning_rate": 0.0004956851552899459,
"loss": 5.5525,
"mean_token_accuracy": 0.1608722448348999,
"num_tokens": 15455332.0,
"step": 8370
},
{
"entropy": 5.684817934036255,
"epoch": 0.7036336904011762,
"grad_norm": 1.15625,
"learning_rate": 0.0004956793167217874,
"loss": 5.6122,
"mean_token_accuracy": 0.1539962038397789,
"num_tokens": 15464241.0,
"step": 8375
},
{
"entropy": 5.77572021484375,
"epoch": 0.7040537702163411,
"grad_norm": 1.3125,
"learning_rate": 0.0004956734742444087,
"loss": 5.6124,
"mean_token_accuracy": 0.156563501060009,
"num_tokens": 15473473.0,
"step": 8380
},
{
"entropy": 5.676117277145385,
"epoch": 0.704473850031506,
"grad_norm": 1.265625,
"learning_rate": 0.0004956676278579129,
"loss": 5.4876,
"mean_token_accuracy": 0.16209144443273543,
"num_tokens": 15482494.0,
"step": 8385
},
{
"entropy": 5.5847352027893065,
"epoch": 0.7048939298466709,
"grad_norm": 1.1875,
"learning_rate": 0.0004956617775624037,
"loss": 5.5162,
"mean_token_accuracy": 0.15996210426092147,
"num_tokens": 15491180.0,
"step": 8390
},
{
"entropy": 5.684952878952027,
"epoch": 0.7053140096618358,
"grad_norm": 1.2734375,
"learning_rate": 0.0004956559233579848,
"loss": 5.5644,
"mean_token_accuracy": 0.1575160101056099,
"num_tokens": 15501035.0,
"step": 8395
},
{
"entropy": 5.660862255096435,
"epoch": 0.7057340894770007,
"grad_norm": 1.234375,
"learning_rate": 0.0004956500652447598,
"loss": 5.5498,
"mean_token_accuracy": 0.15877256616950036,
"num_tokens": 15510191.0,
"step": 8400
},
{
"entropy": 5.616110467910767,
"epoch": 0.7061541692921655,
"grad_norm": 1.2109375,
"learning_rate": 0.0004956442032228324,
"loss": 5.6257,
"mean_token_accuracy": 0.1581491820514202,
"num_tokens": 15519253.0,
"step": 8405
},
{
"entropy": 5.665469741821289,
"epoch": 0.7065742491073304,
"grad_norm": 1.2578125,
"learning_rate": 0.0004956383372923067,
"loss": 5.5804,
"mean_token_accuracy": 0.15870503187179566,
"num_tokens": 15528348.0,
"step": 8410
},
{
"entropy": 5.852564477920533,
"epoch": 0.7069943289224953,
"grad_norm": 1.15625,
"learning_rate": 0.0004956324674532864,
"loss": 5.6792,
"mean_token_accuracy": 0.15142991095781327,
"num_tokens": 15537557.0,
"step": 8415
},
{
"entropy": 5.765557336807251,
"epoch": 0.7074144087376601,
"grad_norm": 1.1171875,
"learning_rate": 0.0004956265937058757,
"loss": 5.5959,
"mean_token_accuracy": 0.15625424385070802,
"num_tokens": 15546745.0,
"step": 8420
},
{
"entropy": 5.6664046287536625,
"epoch": 0.707834488552825,
"grad_norm": 1.2734375,
"learning_rate": 0.0004956207160501784,
"loss": 5.4974,
"mean_token_accuracy": 0.16058537662029265,
"num_tokens": 15555532.0,
"step": 8425
},
{
"entropy": 5.695961809158325,
"epoch": 0.70825456836799,
"grad_norm": 1.1796875,
"learning_rate": 0.0004956148344862987,
"loss": 5.5334,
"mean_token_accuracy": 0.16354536563158034,
"num_tokens": 15564189.0,
"step": 8430
},
{
"entropy": 5.604541540145874,
"epoch": 0.7086746481831548,
"grad_norm": 1.1640625,
"learning_rate": 0.0004956089490143408,
"loss": 5.5702,
"mean_token_accuracy": 0.15514277219772338,
"num_tokens": 15574116.0,
"step": 8435
},
{
"entropy": 5.74328932762146,
"epoch": 0.7090947279983196,
"grad_norm": 1.2734375,
"learning_rate": 0.0004956030596344089,
"loss": 5.5655,
"mean_token_accuracy": 0.15882139503955842,
"num_tokens": 15583031.0,
"step": 8440
},
{
"entropy": 5.730233430862427,
"epoch": 0.7095148078134845,
"grad_norm": 1.1484375,
"learning_rate": 0.0004955971663466075,
"loss": 5.7008,
"mean_token_accuracy": 0.1583823412656784,
"num_tokens": 15592576.0,
"step": 8445
},
{
"entropy": 5.7435754299163815,
"epoch": 0.7099348876286494,
"grad_norm": 1.3671875,
"learning_rate": 0.0004955912691510407,
"loss": 5.6197,
"mean_token_accuracy": 0.15949044972658158,
"num_tokens": 15601065.0,
"step": 8450
},
{
"entropy": 5.7207683563232425,
"epoch": 0.7103549674438143,
"grad_norm": 1.25,
"learning_rate": 0.0004955853680478134,
"loss": 5.5574,
"mean_token_accuracy": 0.1536167934536934,
"num_tokens": 15610112.0,
"step": 8455
},
{
"entropy": 5.724563407897949,
"epoch": 0.7107750472589792,
"grad_norm": 1.2265625,
"learning_rate": 0.0004955794630370297,
"loss": 5.5492,
"mean_token_accuracy": 0.1562636002898216,
"num_tokens": 15618890.0,
"step": 8460
},
{
"entropy": 5.628248786926269,
"epoch": 0.7111951270741441,
"grad_norm": 1.2578125,
"learning_rate": 0.0004955735541187945,
"loss": 5.5698,
"mean_token_accuracy": 0.1557060018181801,
"num_tokens": 15627678.0,
"step": 8465
},
{
"entropy": 5.7201728343963625,
"epoch": 0.711615206889309,
"grad_norm": 1.359375,
"learning_rate": 0.0004955676412932124,
"loss": 5.5602,
"mean_token_accuracy": 0.1623851999640465,
"num_tokens": 15636833.0,
"step": 8470
},
{
"entropy": 5.690943670272827,
"epoch": 0.7120352867044738,
"grad_norm": 1.4296875,
"learning_rate": 0.0004955617245603881,
"loss": 5.587,
"mean_token_accuracy": 0.15263200998306276,
"num_tokens": 15646571.0,
"step": 8475
},
{
"entropy": 5.6829156398773195,
"epoch": 0.7124553665196387,
"grad_norm": 1.3359375,
"learning_rate": 0.0004955558039204263,
"loss": 5.6182,
"mean_token_accuracy": 0.16115371286869049,
"num_tokens": 15654907.0,
"step": 8480
},
{
"entropy": 5.7532580375671385,
"epoch": 0.7128754463348036,
"grad_norm": 1.3671875,
"learning_rate": 0.0004955498793734321,
"loss": 5.5462,
"mean_token_accuracy": 0.1611197918653488,
"num_tokens": 15664336.0,
"step": 8485
},
{
"entropy": 5.745222282409668,
"epoch": 0.7132955261499685,
"grad_norm": 1.3671875,
"learning_rate": 0.0004955439509195103,
"loss": 5.6056,
"mean_token_accuracy": 0.16001220643520356,
"num_tokens": 15674000.0,
"step": 8490
},
{
"entropy": 5.7138604640960695,
"epoch": 0.7137156059651334,
"grad_norm": 1.875,
"learning_rate": 0.0004955380185587661,
"loss": 5.6066,
"mean_token_accuracy": 0.15749419778585433,
"num_tokens": 15684214.0,
"step": 8495
},
{
"entropy": 5.710150194168091,
"epoch": 0.7141356857802983,
"grad_norm": 1.4375,
"learning_rate": 0.0004955320822913043,
"loss": 5.6195,
"mean_token_accuracy": 0.15598317384719848,
"num_tokens": 15693546.0,
"step": 8500
},
{
"entropy": 5.677717828750611,
"epoch": 0.7145557655954632,
"grad_norm": 1.2109375,
"learning_rate": 0.0004955261421172302,
"loss": 5.523,
"mean_token_accuracy": 0.15616966933012008,
"num_tokens": 15702310.0,
"step": 8505
},
{
"entropy": 5.698915433883667,
"epoch": 0.714975845410628,
"grad_norm": 1.2578125,
"learning_rate": 0.0004955201980366493,
"loss": 5.5748,
"mean_token_accuracy": 0.1571685291826725,
"num_tokens": 15711544.0,
"step": 8510
},
{
"entropy": 5.5956744194030765,
"epoch": 0.7153959252257929,
"grad_norm": 1.46875,
"learning_rate": 0.0004955142500496665,
"loss": 5.4703,
"mean_token_accuracy": 0.15791433602571486,
"num_tokens": 15720914.0,
"step": 8515
},
{
"entropy": 5.749522113800049,
"epoch": 0.7158160050409578,
"grad_norm": 1.2578125,
"learning_rate": 0.0004955082981563872,
"loss": 5.5593,
"mean_token_accuracy": 0.1538071796298027,
"num_tokens": 15729825.0,
"step": 8520
},
{
"entropy": 5.651506567001343,
"epoch": 0.7162360848561227,
"grad_norm": 1.28125,
"learning_rate": 0.000495502342356917,
"loss": 5.5774,
"mean_token_accuracy": 0.15711765587329865,
"num_tokens": 15739649.0,
"step": 8525
},
{
"entropy": 5.69700608253479,
"epoch": 0.7166561646712876,
"grad_norm": 1.3046875,
"learning_rate": 0.0004954963826513614,
"loss": 5.471,
"mean_token_accuracy": 0.15691360533237458,
"num_tokens": 15747805.0,
"step": 8530
},
{
"entropy": 5.759174013137818,
"epoch": 0.7170762444864525,
"grad_norm": 1.2109375,
"learning_rate": 0.000495490419039826,
"loss": 5.6066,
"mean_token_accuracy": 0.15306852161884307,
"num_tokens": 15757267.0,
"step": 8535
},
{
"entropy": 5.656875848770142,
"epoch": 0.7174963243016174,
"grad_norm": 1.2734375,
"learning_rate": 0.0004954844515224162,
"loss": 5.5701,
"mean_token_accuracy": 0.16100525110960007,
"num_tokens": 15767412.0,
"step": 8540
},
{
"entropy": 5.617850732803345,
"epoch": 0.7179164041167821,
"grad_norm": 1.25,
"learning_rate": 0.0004954784800992379,
"loss": 5.5648,
"mean_token_accuracy": 0.15756135284900666,
"num_tokens": 15776813.0,
"step": 8545
},
{
"entropy": 5.7305539608001705,
"epoch": 0.718336483931947,
"grad_norm": 1.171875,
"learning_rate": 0.0004954725047703969,
"loss": 5.619,
"mean_token_accuracy": 0.15379863306879998,
"num_tokens": 15786258.0,
"step": 8550
},
{
"entropy": 5.710029125213623,
"epoch": 0.7187565637471119,
"grad_norm": 1.21875,
"learning_rate": 0.000495466525535999,
"loss": 5.576,
"mean_token_accuracy": 0.15841218307614327,
"num_tokens": 15795673.0,
"step": 8555
},
{
"entropy": 5.743064737319946,
"epoch": 0.7191766435622768,
"grad_norm": 1.28125,
"learning_rate": 0.0004954605423961501,
"loss": 5.5714,
"mean_token_accuracy": 0.15423453375697135,
"num_tokens": 15805050.0,
"step": 8560
},
{
"entropy": 5.599371862411499,
"epoch": 0.7195967233774417,
"grad_norm": 1.2421875,
"learning_rate": 0.0004954545553509562,
"loss": 5.5315,
"mean_token_accuracy": 0.16561046838760377,
"num_tokens": 15813347.0,
"step": 8565
},
{
"entropy": 5.735318899154663,
"epoch": 0.7200168031926066,
"grad_norm": 1.1953125,
"learning_rate": 0.0004954485644005235,
"loss": 5.6455,
"mean_token_accuracy": 0.15517944395542144,
"num_tokens": 15823528.0,
"step": 8570
},
{
"entropy": 5.804451417922974,
"epoch": 0.7204368830077714,
"grad_norm": 1.34375,
"learning_rate": 0.0004954425695449578,
"loss": 5.5616,
"mean_token_accuracy": 0.15588821172714235,
"num_tokens": 15832727.0,
"step": 8575
},
{
"entropy": 5.727295684814453,
"epoch": 0.7208569628229363,
"grad_norm": 1.1796875,
"learning_rate": 0.0004954365707843657,
"loss": 5.6252,
"mean_token_accuracy": 0.1505603663623333,
"num_tokens": 15842402.0,
"step": 8580
},
{
"entropy": 5.609046983718872,
"epoch": 0.7212770426381012,
"grad_norm": 1.2109375,
"learning_rate": 0.0004954305681188531,
"loss": 5.4919,
"mean_token_accuracy": 0.15758911669254302,
"num_tokens": 15850886.0,
"step": 8585
},
{
"entropy": 5.874528598785401,
"epoch": 0.7216971224532661,
"grad_norm": 1.4609375,
"learning_rate": 0.0004954245615485265,
"loss": 5.775,
"mean_token_accuracy": 0.15257195830345155,
"num_tokens": 15860093.0,
"step": 8590
},
{
"entropy": 5.730541467666626,
"epoch": 0.722117202268431,
"grad_norm": 1.265625,
"learning_rate": 0.0004954185510734924,
"loss": 5.4951,
"mean_token_accuracy": 0.16199183613061904,
"num_tokens": 15868681.0,
"step": 8595
},
{
"entropy": 5.63291220664978,
"epoch": 0.7225372820835959,
"grad_norm": 1.2109375,
"learning_rate": 0.0004954125366938571,
"loss": 5.5715,
"mean_token_accuracy": 0.1619466871023178,
"num_tokens": 15878041.0,
"step": 8600
},
{
"entropy": 5.672195911407471,
"epoch": 0.7229573618987608,
"grad_norm": 1.234375,
"learning_rate": 0.0004954065184097271,
"loss": 5.5729,
"mean_token_accuracy": 0.15789103657007217,
"num_tokens": 15887562.0,
"step": 8605
},
{
"entropy": 5.6754150390625,
"epoch": 0.7233774417139256,
"grad_norm": 1.359375,
"learning_rate": 0.0004954004962212092,
"loss": 5.4812,
"mean_token_accuracy": 0.17005416005849838,
"num_tokens": 15896480.0,
"step": 8610
},
{
"entropy": 5.799858427047729,
"epoch": 0.7237975215290905,
"grad_norm": 1.2578125,
"learning_rate": 0.0004953944701284101,
"loss": 5.7002,
"mean_token_accuracy": 0.15285454094409942,
"num_tokens": 15906743.0,
"step": 8615
},
{
"entropy": 5.687485456466675,
"epoch": 0.7242176013442554,
"grad_norm": 1.3046875,
"learning_rate": 0.0004953884401314363,
"loss": 5.657,
"mean_token_accuracy": 0.14405350238084794,
"num_tokens": 15915981.0,
"step": 8620
},
{
"entropy": 5.668266773223877,
"epoch": 0.7246376811594203,
"grad_norm": 1.90625,
"learning_rate": 0.0004953824062303949,
"loss": 5.4964,
"mean_token_accuracy": 0.15426170378923415,
"num_tokens": 15924117.0,
"step": 8625
},
{
"entropy": 5.691087102890014,
"epoch": 0.7250577609745852,
"grad_norm": 1.3046875,
"learning_rate": 0.0004953763684253926,
"loss": 5.5372,
"mean_token_accuracy": 0.16509582996368408,
"num_tokens": 15933124.0,
"step": 8630
},
{
"entropy": 5.6671497344970705,
"epoch": 0.7254778407897501,
"grad_norm": 1.2265625,
"learning_rate": 0.0004953703267165364,
"loss": 5.4093,
"mean_token_accuracy": 0.16131499111652375,
"num_tokens": 15942422.0,
"step": 8635
},
{
"entropy": 5.673836374282837,
"epoch": 0.725897920604915,
"grad_norm": 1.390625,
"learning_rate": 0.0004953642811039332,
"loss": 5.6407,
"mean_token_accuracy": 0.1549506589770317,
"num_tokens": 15950989.0,
"step": 8640
},
{
"entropy": 5.6949738502502445,
"epoch": 0.7263180004200798,
"grad_norm": 1.34375,
"learning_rate": 0.0004953582315876904,
"loss": 5.6403,
"mean_token_accuracy": 0.1533081702888012,
"num_tokens": 15959659.0,
"step": 8645
},
{
"entropy": 5.694576978683472,
"epoch": 0.7267380802352447,
"grad_norm": 1.328125,
"learning_rate": 0.000495352178167915,
"loss": 5.5041,
"mean_token_accuracy": 0.17099424004554747,
"num_tokens": 15968102.0,
"step": 8650
},
{
"entropy": 5.793760204315186,
"epoch": 0.7271581600504096,
"grad_norm": 1.28125,
"learning_rate": 0.0004953461208447143,
"loss": 5.6391,
"mean_token_accuracy": 0.15256927609443666,
"num_tokens": 15977705.0,
"step": 8655
},
{
"entropy": 5.675874042510986,
"epoch": 0.7275782398655745,
"grad_norm": 1.3828125,
"learning_rate": 0.0004953400596181953,
"loss": 5.6382,
"mean_token_accuracy": 0.15347846299409867,
"num_tokens": 15986703.0,
"step": 8660
},
{
"entropy": 5.719884777069092,
"epoch": 0.7279983196807394,
"grad_norm": 1.2109375,
"learning_rate": 0.0004953339944884657,
"loss": 5.5402,
"mean_token_accuracy": 0.1610653355717659,
"num_tokens": 15995672.0,
"step": 8665
},
{
"entropy": 5.578777265548706,
"epoch": 0.7284183994959043,
"grad_norm": 1.171875,
"learning_rate": 0.0004953279254556329,
"loss": 5.5127,
"mean_token_accuracy": 0.16606825590133667,
"num_tokens": 16004437.0,
"step": 8670
},
{
"entropy": 5.646161603927612,
"epoch": 0.7288384793110692,
"grad_norm": 1.359375,
"learning_rate": 0.0004953218525198043,
"loss": 5.5353,
"mean_token_accuracy": 0.15697001963853835,
"num_tokens": 16012847.0,
"step": 8675
},
{
"entropy": 5.735745191574097,
"epoch": 0.7292585591262339,
"grad_norm": 1.2421875,
"learning_rate": 0.0004953157756810876,
"loss": 5.5712,
"mean_token_accuracy": 0.15936386734247207,
"num_tokens": 16022213.0,
"step": 8680
},
{
"entropy": 5.667310523986816,
"epoch": 0.7296786389413988,
"grad_norm": 1.1328125,
"learning_rate": 0.0004953096949395902,
"loss": 5.6223,
"mean_token_accuracy": 0.16006928235292434,
"num_tokens": 16031411.0,
"step": 8685
},
{
"entropy": 5.7181422233581545,
"epoch": 0.7300987187565637,
"grad_norm": 1.1484375,
"learning_rate": 0.0004953036102954202,
"loss": 5.665,
"mean_token_accuracy": 0.1544424846768379,
"num_tokens": 16041227.0,
"step": 8690
},
{
"entropy": 5.656005573272705,
"epoch": 0.7305187985717286,
"grad_norm": 1.140625,
"learning_rate": 0.0004952975217486852,
"loss": 5.4861,
"mean_token_accuracy": 0.16734187602996825,
"num_tokens": 16049777.0,
"step": 8695
},
{
"entropy": 5.685440540313721,
"epoch": 0.7309388783868935,
"grad_norm": 1.125,
"learning_rate": 0.0004952914292994928,
"loss": 5.5849,
"mean_token_accuracy": 0.1611620768904686,
"num_tokens": 16059093.0,
"step": 8700
},
{
"entropy": 5.751390409469605,
"epoch": 0.7313589582020584,
"grad_norm": 1.2890625,
"learning_rate": 0.0004952853329479514,
"loss": 5.6066,
"mean_token_accuracy": 0.16480785459280015,
"num_tokens": 16068550.0,
"step": 8705
},
{
"entropy": 5.6971841812133786,
"epoch": 0.7317790380172233,
"grad_norm": 1.296875,
"learning_rate": 0.0004952792326941686,
"loss": 5.6367,
"mean_token_accuracy": 0.15641499161720276,
"num_tokens": 16078286.0,
"step": 8710
},
{
"entropy": 5.731863737106323,
"epoch": 0.7321991178323881,
"grad_norm": 1.234375,
"learning_rate": 0.0004952731285382527,
"loss": 5.5655,
"mean_token_accuracy": 0.15366139262914658,
"num_tokens": 16087560.0,
"step": 8715
},
{
"entropy": 5.676146841049194,
"epoch": 0.732619197647553,
"grad_norm": 1.328125,
"learning_rate": 0.0004952670204803118,
"loss": 5.5266,
"mean_token_accuracy": 0.1664838597178459,
"num_tokens": 16097478.0,
"step": 8720
},
{
"entropy": 5.717540884017945,
"epoch": 0.7330392774627179,
"grad_norm": 1.1171875,
"learning_rate": 0.0004952609085204539,
"loss": 5.6373,
"mean_token_accuracy": 0.16377640068531035,
"num_tokens": 16106884.0,
"step": 8725
},
{
"entropy": 5.7192158699035645,
"epoch": 0.7334593572778828,
"grad_norm": 1.328125,
"learning_rate": 0.0004952547926587876,
"loss": 5.5669,
"mean_token_accuracy": 0.15231358855962754,
"num_tokens": 16115689.0,
"step": 8730
},
{
"entropy": 5.7321789264678955,
"epoch": 0.7338794370930477,
"grad_norm": 1.3203125,
"learning_rate": 0.0004952486728954209,
"loss": 5.4923,
"mean_token_accuracy": 0.16603951752185822,
"num_tokens": 16125237.0,
"step": 8735
},
{
"entropy": 5.608639526367187,
"epoch": 0.7342995169082126,
"grad_norm": 1.3046875,
"learning_rate": 0.0004952425492304624,
"loss": 5.5186,
"mean_token_accuracy": 0.16192169040441512,
"num_tokens": 16133940.0,
"step": 8740
},
{
"entropy": 5.68213267326355,
"epoch": 0.7347195967233774,
"grad_norm": 1.328125,
"learning_rate": 0.0004952364216640207,
"loss": 5.6094,
"mean_token_accuracy": 0.1565001666545868,
"num_tokens": 16143256.0,
"step": 8745
},
{
"entropy": 5.72492618560791,
"epoch": 0.7351396765385423,
"grad_norm": 1.265625,
"learning_rate": 0.000495230290196204,
"loss": 5.4731,
"mean_token_accuracy": 0.1615516275167465,
"num_tokens": 16153259.0,
"step": 8750
},
{
"entropy": 5.719367265701294,
"epoch": 0.7355597563537072,
"grad_norm": 1.46875,
"learning_rate": 0.0004952241548271212,
"loss": 5.7241,
"mean_token_accuracy": 0.15054681450128554,
"num_tokens": 16162125.0,
"step": 8755
},
{
"entropy": 5.750476121902466,
"epoch": 0.7359798361688721,
"grad_norm": 1.265625,
"learning_rate": 0.0004952180155568809,
"loss": 5.6369,
"mean_token_accuracy": 0.15786453932523728,
"num_tokens": 16171680.0,
"step": 8760
},
{
"entropy": 5.760352325439453,
"epoch": 0.736399915984037,
"grad_norm": 1.2734375,
"learning_rate": 0.0004952118723855919,
"loss": 5.6384,
"mean_token_accuracy": 0.15803294628858566,
"num_tokens": 16181559.0,
"step": 8765
},
{
"entropy": 5.698783349990845,
"epoch": 0.7368199957992019,
"grad_norm": 1.28125,
"learning_rate": 0.0004952057253133628,
"loss": 5.5916,
"mean_token_accuracy": 0.15793971419334413,
"num_tokens": 16190611.0,
"step": 8770
},
{
"entropy": 5.748305034637451,
"epoch": 0.7372400756143668,
"grad_norm": 1.2890625,
"learning_rate": 0.0004951995743403028,
"loss": 5.6072,
"mean_token_accuracy": 0.1553585410118103,
"num_tokens": 16200156.0,
"step": 8775
},
{
"entropy": 5.690016174316407,
"epoch": 0.7376601554295316,
"grad_norm": 1.3515625,
"learning_rate": 0.0004951934194665208,
"loss": 5.5778,
"mean_token_accuracy": 0.15406155884265899,
"num_tokens": 16209808.0,
"step": 8780
},
{
"entropy": 5.6809381484985355,
"epoch": 0.7380802352446965,
"grad_norm": 1.171875,
"learning_rate": 0.0004951872606921257,
"loss": 5.5356,
"mean_token_accuracy": 0.16181282997131347,
"num_tokens": 16219243.0,
"step": 8785
},
{
"entropy": 5.642398118972778,
"epoch": 0.7385003150598614,
"grad_norm": 1.15625,
"learning_rate": 0.0004951810980172265,
"loss": 5.5469,
"mean_token_accuracy": 0.16897100061178208,
"num_tokens": 16228180.0,
"step": 8790
},
{
"entropy": 5.688885879516602,
"epoch": 0.7389203948750263,
"grad_norm": 1.2421875,
"learning_rate": 0.0004951749314419327,
"loss": 5.5564,
"mean_token_accuracy": 0.16285726577043533,
"num_tokens": 16237045.0,
"step": 8795
},
{
"entropy": 5.7178980827331545,
"epoch": 0.7393404746901912,
"grad_norm": 1.3125,
"learning_rate": 0.0004951687609663533,
"loss": 5.4986,
"mean_token_accuracy": 0.16633692383766174,
"num_tokens": 16245307.0,
"step": 8800
},
{
"entropy": 5.6679950714111325,
"epoch": 0.739760554505356,
"grad_norm": 1.3359375,
"learning_rate": 0.0004951625865905977,
"loss": 5.5264,
"mean_token_accuracy": 0.1576075181365013,
"num_tokens": 16255047.0,
"step": 8805
},
{
"entropy": 5.619790697097779,
"epoch": 0.740180634320521,
"grad_norm": 1.3125,
"learning_rate": 0.0004951564083147753,
"loss": 5.5622,
"mean_token_accuracy": 0.16550215929746628,
"num_tokens": 16264969.0,
"step": 8810
},
{
"entropy": 5.738697290420532,
"epoch": 0.7406007141356857,
"grad_norm": 1.25,
"learning_rate": 0.0004951502261389953,
"loss": 5.657,
"mean_token_accuracy": 0.15243045836687089,
"num_tokens": 16274757.0,
"step": 8815
},
{
"entropy": 5.708020639419556,
"epoch": 0.7410207939508506,
"grad_norm": 1.1953125,
"learning_rate": 0.0004951440400633677,
"loss": 5.5439,
"mean_token_accuracy": 0.17007714062929152,
"num_tokens": 16283409.0,
"step": 8820
},
{
"entropy": 5.637024974822998,
"epoch": 0.7414408737660155,
"grad_norm": 1.2890625,
"learning_rate": 0.0004951378500880015,
"loss": 5.5293,
"mean_token_accuracy": 0.1615572139620781,
"num_tokens": 16293206.0,
"step": 8825
},
{
"entropy": 5.699015426635742,
"epoch": 0.7418609535811804,
"grad_norm": 1.328125,
"learning_rate": 0.0004951316562130067,
"loss": 5.537,
"mean_token_accuracy": 0.16240676045417785,
"num_tokens": 16303121.0,
"step": 8830
},
{
"entropy": 5.673394870758057,
"epoch": 0.7422810333963453,
"grad_norm": 1.1640625,
"learning_rate": 0.000495125458438493,
"loss": 5.5209,
"mean_token_accuracy": 0.1677414059638977,
"num_tokens": 16312710.0,
"step": 8835
},
{
"entropy": 5.826586580276489,
"epoch": 0.7427011132115102,
"grad_norm": 1.2890625,
"learning_rate": 0.0004951192567645702,
"loss": 5.7156,
"mean_token_accuracy": 0.1540753185749054,
"num_tokens": 16322280.0,
"step": 8840
},
{
"entropy": 5.609132862091064,
"epoch": 0.7431211930266751,
"grad_norm": 1.265625,
"learning_rate": 0.0004951130511913481,
"loss": 5.5458,
"mean_token_accuracy": 0.1609240397810936,
"num_tokens": 16331656.0,
"step": 8845
},
{
"entropy": 5.695055818557739,
"epoch": 0.7435412728418399,
"grad_norm": 1.1171875,
"learning_rate": 0.0004951068417189366,
"loss": 5.5792,
"mean_token_accuracy": 0.16215466409921647,
"num_tokens": 16341074.0,
"step": 8850
},
{
"entropy": 5.693843412399292,
"epoch": 0.7439613526570048,
"grad_norm": 1.203125,
"learning_rate": 0.0004951006283474457,
"loss": 5.5714,
"mean_token_accuracy": 0.15623839199543,
"num_tokens": 16350097.0,
"step": 8855
},
{
"entropy": 5.533200073242187,
"epoch": 0.7443814324721697,
"grad_norm": 1.2265625,
"learning_rate": 0.0004950944110769856,
"loss": 5.4689,
"mean_token_accuracy": 0.16720343977212906,
"num_tokens": 16359274.0,
"step": 8860
},
{
"entropy": 5.591437864303589,
"epoch": 0.7448015122873346,
"grad_norm": 1.2890625,
"learning_rate": 0.0004950881899076663,
"loss": 5.4485,
"mean_token_accuracy": 0.17578994035720824,
"num_tokens": 16368445.0,
"step": 8865
},
{
"entropy": 5.731908941268921,
"epoch": 0.7452215921024995,
"grad_norm": 1.21875,
"learning_rate": 0.0004950819648395979,
"loss": 5.5514,
"mean_token_accuracy": 0.16629501432180405,
"num_tokens": 16377689.0,
"step": 8870
},
{
"entropy": 5.6472413539886475,
"epoch": 0.7456416719176644,
"grad_norm": 1.34375,
"learning_rate": 0.000495075735872891,
"loss": 5.5111,
"mean_token_accuracy": 0.15904580056667328,
"num_tokens": 16386713.0,
"step": 8875
},
{
"entropy": 5.683931541442871,
"epoch": 0.7460617517328293,
"grad_norm": 1.2890625,
"learning_rate": 0.0004950695030076557,
"loss": 5.5424,
"mean_token_accuracy": 0.15848269909620286,
"num_tokens": 16395390.0,
"step": 8880
},
{
"entropy": 5.741724491119385,
"epoch": 0.7464818315479941,
"grad_norm": 1.359375,
"learning_rate": 0.0004950632662440027,
"loss": 5.6115,
"mean_token_accuracy": 0.159981369972229,
"num_tokens": 16404531.0,
"step": 8885
},
{
"entropy": 5.599276351928711,
"epoch": 0.746901911363159,
"grad_norm": 1.2578125,
"learning_rate": 0.0004950570255820419,
"loss": 5.4983,
"mean_token_accuracy": 0.16114450246095657,
"num_tokens": 16413649.0,
"step": 8890
},
{
"entropy": 5.629710292816162,
"epoch": 0.7473219911783239,
"grad_norm": 1.21875,
"learning_rate": 0.0004950507810218843,
"loss": 5.6315,
"mean_token_accuracy": 0.1517360143363476,
"num_tokens": 16423247.0,
"step": 8895
},
{
"entropy": 5.7457269668579105,
"epoch": 0.7477420709934888,
"grad_norm": 1.3046875,
"learning_rate": 0.0004950445325636405,
"loss": 5.5523,
"mean_token_accuracy": 0.15606946051120757,
"num_tokens": 16432190.0,
"step": 8900
},
{
"entropy": 5.723924207687378,
"epoch": 0.7481621508086537,
"grad_norm": 1.1328125,
"learning_rate": 0.0004950382802074211,
"loss": 5.5128,
"mean_token_accuracy": 0.1662849009037018,
"num_tokens": 16443091.0,
"step": 8905
},
{
"entropy": 5.593891191482544,
"epoch": 0.7485822306238186,
"grad_norm": 1.1953125,
"learning_rate": 0.0004950320239533369,
"loss": 5.5523,
"mean_token_accuracy": 0.16187762469053268,
"num_tokens": 16452077.0,
"step": 8910
},
{
"entropy": 5.766776895523071,
"epoch": 0.7490023104389834,
"grad_norm": 1.328125,
"learning_rate": 0.0004950257638014986,
"loss": 5.6666,
"mean_token_accuracy": 0.1541683092713356,
"num_tokens": 16461893.0,
"step": 8915
},
{
"entropy": 5.725894832611084,
"epoch": 0.7494223902541483,
"grad_norm": 1.203125,
"learning_rate": 0.0004950194997520172,
"loss": 5.5287,
"mean_token_accuracy": 0.15639800429344178,
"num_tokens": 16470904.0,
"step": 8920
},
{
"entropy": 5.652572441101074,
"epoch": 0.7498424700693131,
"grad_norm": 1.390625,
"learning_rate": 0.0004950132318050037,
"loss": 5.5541,
"mean_token_accuracy": 0.15636360496282578,
"num_tokens": 16480130.0,
"step": 8925
},
{
"entropy": 5.686444091796875,
"epoch": 0.750262549884478,
"grad_norm": 1.4140625,
"learning_rate": 0.0004950069599605691,
"loss": 5.6189,
"mean_token_accuracy": 0.15773196816444396,
"num_tokens": 16489485.0,
"step": 8930
},
{
"entropy": 5.682436275482178,
"epoch": 0.750682629699643,
"grad_norm": 1.296875,
"learning_rate": 0.0004950006842188245,
"loss": 5.5841,
"mean_token_accuracy": 0.16362835615873336,
"num_tokens": 16498529.0,
"step": 8935
},
{
"entropy": 5.676763725280762,
"epoch": 0.7511027095148078,
"grad_norm": 1.171875,
"learning_rate": 0.000494994404579881,
"loss": 5.4927,
"mean_token_accuracy": 0.15938565880060196,
"num_tokens": 16508094.0,
"step": 8940
},
{
"entropy": 5.751111745834351,
"epoch": 0.7515227893299727,
"grad_norm": 1.265625,
"learning_rate": 0.00049498812104385,
"loss": 5.61,
"mean_token_accuracy": 0.15739443451166152,
"num_tokens": 16517620.0,
"step": 8945
},
{
"entropy": 5.664203453063965,
"epoch": 0.7519428691451375,
"grad_norm": 1.234375,
"learning_rate": 0.0004949818336108425,
"loss": 5.5822,
"mean_token_accuracy": 0.15442595928907393,
"num_tokens": 16526720.0,
"step": 8950
},
{
"entropy": 5.672812128067017,
"epoch": 0.7523629489603024,
"grad_norm": 1.171875,
"learning_rate": 0.0004949755422809703,
"loss": 5.5558,
"mean_token_accuracy": 0.15813300311565398,
"num_tokens": 16535979.0,
"step": 8955
},
{
"entropy": 5.660897350311279,
"epoch": 0.7527830287754673,
"grad_norm": 1.2734375,
"learning_rate": 0.0004949692470543446,
"loss": 5.4598,
"mean_token_accuracy": 0.17246938049793242,
"num_tokens": 16544538.0,
"step": 8960
},
{
"entropy": 5.610929727554321,
"epoch": 0.7532031085906322,
"grad_norm": 1.2421875,
"learning_rate": 0.0004949629479310769,
"loss": 5.5282,
"mean_token_accuracy": 0.16230118721723558,
"num_tokens": 16553962.0,
"step": 8965
},
{
"entropy": 5.661597442626953,
"epoch": 0.7536231884057971,
"grad_norm": 1.2421875,
"learning_rate": 0.0004949566449112788,
"loss": 5.4763,
"mean_token_accuracy": 0.16285742372274398,
"num_tokens": 16562652.0,
"step": 8970
},
{
"entropy": 5.704653215408325,
"epoch": 0.754043268220962,
"grad_norm": 1.21875,
"learning_rate": 0.0004949503379950621,
"loss": 5.5603,
"mean_token_accuracy": 0.15362354516983032,
"num_tokens": 16570887.0,
"step": 8975
},
{
"entropy": 5.722882509231567,
"epoch": 0.7544633480361269,
"grad_norm": 1.0859375,
"learning_rate": 0.0004949440271825385,
"loss": 5.6894,
"mean_token_accuracy": 0.1523263484239578,
"num_tokens": 16581469.0,
"step": 8980
},
{
"entropy": 5.761154270172119,
"epoch": 0.7548834278512917,
"grad_norm": 1.2265625,
"learning_rate": 0.0004949377124738196,
"loss": 5.5602,
"mean_token_accuracy": 0.15706332474946977,
"num_tokens": 16590213.0,
"step": 8985
},
{
"entropy": 5.696602058410645,
"epoch": 0.7553035076664566,
"grad_norm": 1.328125,
"learning_rate": 0.0004949313938690174,
"loss": 5.5492,
"mean_token_accuracy": 0.1597098231315613,
"num_tokens": 16598384.0,
"step": 8990
},
{
"entropy": 5.6125421047210695,
"epoch": 0.7557235874816215,
"grad_norm": 1.265625,
"learning_rate": 0.0004949250713682438,
"loss": 5.5277,
"mean_token_accuracy": 0.16421396732330323,
"num_tokens": 16607670.0,
"step": 8995
},
{
"entropy": 5.717169332504272,
"epoch": 0.7561436672967864,
"grad_norm": 1.25,
"learning_rate": 0.0004949187449716107,
"loss": 5.6197,
"mean_token_accuracy": 0.15434789657592773,
"num_tokens": 16617560.0,
"step": 9000
},
{
"epoch": 0.7561436672967864,
"eval_entropy": 5.550903656885421,
"eval_loss": 5.585652828216553,
"eval_mean_token_accuracy": 0.1653114039517877,
"eval_num_tokens": 16617560.0,
"eval_runtime": 27.4853,
"eval_samples_per_second": 1359.488,
"eval_steps_per_second": 169.945,
"step": 9000
},
{
"entropy": 5.676779699325562,
"epoch": 0.7565637471119513,
"grad_norm": 1.2421875,
"learning_rate": 0.0004949124146792304,
"loss": 5.5309,
"mean_token_accuracy": 0.16354483962059022,
"num_tokens": 16626038.0,
"step": 9005
},
{
"entropy": 5.657222414016724,
"epoch": 0.7569838269271162,
"grad_norm": 1.296875,
"learning_rate": 0.0004949060804912149,
"loss": 5.5512,
"mean_token_accuracy": 0.16051364243030547,
"num_tokens": 16636490.0,
"step": 9010
},
{
"entropy": 5.710636186599731,
"epoch": 0.7574039067422811,
"grad_norm": 1.2734375,
"learning_rate": 0.0004948997424076764,
"loss": 5.5464,
"mean_token_accuracy": 0.1557503804564476,
"num_tokens": 16645369.0,
"step": 9015
},
{
"entropy": 5.759248876571656,
"epoch": 0.7578239865574459,
"grad_norm": 1.203125,
"learning_rate": 0.0004948934004287272,
"loss": 5.6124,
"mean_token_accuracy": 0.1579928658902645,
"num_tokens": 16654348.0,
"step": 9020
},
{
"entropy": 5.7791351795196535,
"epoch": 0.7582440663726108,
"grad_norm": 1.21875,
"learning_rate": 0.0004948870545544796,
"loss": 5.6141,
"mean_token_accuracy": 0.1496404230594635,
"num_tokens": 16664009.0,
"step": 9025
},
{
"entropy": 5.709259700775147,
"epoch": 0.7586641461877757,
"grad_norm": 1.3359375,
"learning_rate": 0.000494880704785046,
"loss": 5.6286,
"mean_token_accuracy": 0.15393585115671157,
"num_tokens": 16674079.0,
"step": 9030
},
{
"entropy": 5.69500150680542,
"epoch": 0.7590842260029406,
"grad_norm": 1.21875,
"learning_rate": 0.0004948743511205392,
"loss": 5.559,
"mean_token_accuracy": 0.16091811507940293,
"num_tokens": 16683687.0,
"step": 9035
},
{
"entropy": 5.664481973648071,
"epoch": 0.7595043058181055,
"grad_norm": 1.1640625,
"learning_rate": 0.0004948679935610712,
"loss": 5.4712,
"mean_token_accuracy": 0.1736868515610695,
"num_tokens": 16693311.0,
"step": 9040
},
{
"entropy": 5.641069650650024,
"epoch": 0.7599243856332704,
"grad_norm": 1.25,
"learning_rate": 0.000494861632106755,
"loss": 5.5108,
"mean_token_accuracy": 0.1594437539577484,
"num_tokens": 16702121.0,
"step": 9045
},
{
"entropy": 5.643872547149658,
"epoch": 0.7603444654484351,
"grad_norm": 1.28125,
"learning_rate": 0.0004948552667577033,
"loss": 5.5391,
"mean_token_accuracy": 0.1581725984811783,
"num_tokens": 16711883.0,
"step": 9050
},
{
"entropy": 5.724424076080322,
"epoch": 0.7607645452636,
"grad_norm": 1.3125,
"learning_rate": 0.0004948488975140286,
"loss": 5.612,
"mean_token_accuracy": 0.15874697566032409,
"num_tokens": 16721449.0,
"step": 9055
},
{
"entropy": 5.686666107177734,
"epoch": 0.7611846250787649,
"grad_norm": 1.34375,
"learning_rate": 0.000494842524375844,
"loss": 5.5403,
"mean_token_accuracy": 0.1576692521572113,
"num_tokens": 16730068.0,
"step": 9060
},
{
"entropy": 5.629677724838257,
"epoch": 0.7616047048939298,
"grad_norm": 1.2265625,
"learning_rate": 0.0004948361473432623,
"loss": 5.5526,
"mean_token_accuracy": 0.158638134598732,
"num_tokens": 16739970.0,
"step": 9065
},
{
"entropy": 5.772377395629883,
"epoch": 0.7620247847090947,
"grad_norm": 1.3203125,
"learning_rate": 0.0004948297664163964,
"loss": 5.6304,
"mean_token_accuracy": 0.15675289779901505,
"num_tokens": 16749461.0,
"step": 9070
},
{
"entropy": 5.757342672348022,
"epoch": 0.7624448645242596,
"grad_norm": 1.2578125,
"learning_rate": 0.0004948233815953593,
"loss": 5.6877,
"mean_token_accuracy": 0.1533423252403736,
"num_tokens": 16758747.0,
"step": 9075
},
{
"entropy": 5.6478190422058105,
"epoch": 0.7628649443394245,
"grad_norm": 1.34375,
"learning_rate": 0.0004948169928802643,
"loss": 5.4186,
"mean_token_accuracy": 0.1680832475423813,
"num_tokens": 16767212.0,
"step": 9080
},
{
"entropy": 5.715030384063721,
"epoch": 0.7632850241545893,
"grad_norm": 1.3125,
"learning_rate": 0.0004948106002712245,
"loss": 5.5782,
"mean_token_accuracy": 0.15843361914157866,
"num_tokens": 16776514.0,
"step": 9085
},
{
"entropy": 5.709997463226318,
"epoch": 0.7637051039697542,
"grad_norm": 1.1796875,
"learning_rate": 0.0004948042037683529,
"loss": 5.5506,
"mean_token_accuracy": 0.15640008747577666,
"num_tokens": 16786310.0,
"step": 9090
},
{
"entropy": 5.718139171600342,
"epoch": 0.7641251837849191,
"grad_norm": 1.2109375,
"learning_rate": 0.0004947978033717632,
"loss": 5.581,
"mean_token_accuracy": 0.15548551678657532,
"num_tokens": 16795551.0,
"step": 9095
},
{
"entropy": 5.7015400409698485,
"epoch": 0.764545263600084,
"grad_norm": 1.1796875,
"learning_rate": 0.0004947913990815684,
"loss": 5.5491,
"mean_token_accuracy": 0.15944830179214478,
"num_tokens": 16805099.0,
"step": 9100
},
{
"entropy": 5.720749711990356,
"epoch": 0.7649653434152489,
"grad_norm": 1.140625,
"learning_rate": 0.0004947849908978824,
"loss": 5.5826,
"mean_token_accuracy": 0.15997215658426284,
"num_tokens": 16813963.0,
"step": 9105
},
{
"entropy": 5.7728959083557125,
"epoch": 0.7653854232304138,
"grad_norm": 1.1796875,
"learning_rate": 0.0004947785788208182,
"loss": 5.624,
"mean_token_accuracy": 0.15642002671957017,
"num_tokens": 16822814.0,
"step": 9110
},
{
"entropy": 5.748675060272217,
"epoch": 0.7658055030455787,
"grad_norm": 1.2421875,
"learning_rate": 0.0004947721628504898,
"loss": 5.6506,
"mean_token_accuracy": 0.15549189746379852,
"num_tokens": 16831906.0,
"step": 9115
},
{
"entropy": 5.6409765720367435,
"epoch": 0.7662255828607435,
"grad_norm": 1.359375,
"learning_rate": 0.0004947657429870108,
"loss": 5.4597,
"mean_token_accuracy": 0.1629703313112259,
"num_tokens": 16840050.0,
"step": 9120
},
{
"entropy": 5.592518949508667,
"epoch": 0.7666456626759084,
"grad_norm": 1.5859375,
"learning_rate": 0.0004947593192304946,
"loss": 5.4915,
"mean_token_accuracy": 0.16089494079351424,
"num_tokens": 16848404.0,
"step": 9125
},
{
"entropy": 5.62299108505249,
"epoch": 0.7670657424910733,
"grad_norm": 1.265625,
"learning_rate": 0.0004947528915810554,
"loss": 5.4933,
"mean_token_accuracy": 0.1622912123799324,
"num_tokens": 16856568.0,
"step": 9130
},
{
"entropy": 5.6679102897644045,
"epoch": 0.7674858223062382,
"grad_norm": 1.53125,
"learning_rate": 0.0004947464600388066,
"loss": 5.5196,
"mean_token_accuracy": 0.1620594307780266,
"num_tokens": 16864936.0,
"step": 9135
},
{
"entropy": 5.808090543746948,
"epoch": 0.7679059021214031,
"grad_norm": 1.40625,
"learning_rate": 0.0004947400246038627,
"loss": 5.658,
"mean_token_accuracy": 0.15416733771562577,
"num_tokens": 16874504.0,
"step": 9140
},
{
"entropy": 5.572278261184692,
"epoch": 0.768325981936568,
"grad_norm": 1.21875,
"learning_rate": 0.0004947335852763374,
"loss": 5.41,
"mean_token_accuracy": 0.1646368309855461,
"num_tokens": 16883365.0,
"step": 9145
},
{
"entropy": 5.67635669708252,
"epoch": 0.7687460617517329,
"grad_norm": 1.2578125,
"learning_rate": 0.0004947271420563447,
"loss": 5.6623,
"mean_token_accuracy": 0.144975683093071,
"num_tokens": 16892701.0,
"step": 9150
},
{
"entropy": 5.678815221786499,
"epoch": 0.7691661415668977,
"grad_norm": 1.2265625,
"learning_rate": 0.0004947206949439989,
"loss": 5.4911,
"mean_token_accuracy": 0.15904399305582045,
"num_tokens": 16901864.0,
"step": 9155
},
{
"entropy": 5.71064567565918,
"epoch": 0.7695862213820626,
"grad_norm": 1.265625,
"learning_rate": 0.000494714243939414,
"loss": 5.5155,
"mean_token_accuracy": 0.16552575305104256,
"num_tokens": 16910908.0,
"step": 9160
},
{
"entropy": 5.67052116394043,
"epoch": 0.7700063011972275,
"grad_norm": 1.25,
"learning_rate": 0.0004947077890427045,
"loss": 5.5671,
"mean_token_accuracy": 0.16161819100379943,
"num_tokens": 16920299.0,
"step": 9165
},
{
"entropy": 5.782629203796387,
"epoch": 0.7704263810123924,
"grad_norm": 1.25,
"learning_rate": 0.0004947013302539846,
"loss": 5.6819,
"mean_token_accuracy": 0.14940958172082902,
"num_tokens": 16930027.0,
"step": 9170
},
{
"entropy": 5.750628232955933,
"epoch": 0.7708464608275573,
"grad_norm": 1.3046875,
"learning_rate": 0.0004946948675733688,
"loss": 5.5743,
"mean_token_accuracy": 0.15798421204090118,
"num_tokens": 16939387.0,
"step": 9175
},
{
"entropy": 5.68762173652649,
"epoch": 0.7712665406427222,
"grad_norm": 1.34375,
"learning_rate": 0.0004946884010009714,
"loss": 5.5436,
"mean_token_accuracy": 0.16124205887317658,
"num_tokens": 16950024.0,
"step": 9180
},
{
"entropy": 5.638156890869141,
"epoch": 0.771686620457887,
"grad_norm": 1.2578125,
"learning_rate": 0.0004946819305369073,
"loss": 5.4453,
"mean_token_accuracy": 0.16821138560771942,
"num_tokens": 16958219.0,
"step": 9185
},
{
"entropy": 5.6072611808776855,
"epoch": 0.7721067002730518,
"grad_norm": 1.3125,
"learning_rate": 0.0004946754561812909,
"loss": 5.424,
"mean_token_accuracy": 0.16381552219390869,
"num_tokens": 16966829.0,
"step": 9190
},
{
"entropy": 5.621193552017212,
"epoch": 0.7725267800882167,
"grad_norm": 1.203125,
"learning_rate": 0.0004946689779342367,
"loss": 5.5281,
"mean_token_accuracy": 0.16035796105861663,
"num_tokens": 16975585.0,
"step": 9195
},
{
"entropy": 5.625691366195679,
"epoch": 0.7729468599033816,
"grad_norm": 1.15625,
"learning_rate": 0.0004946624957958599,
"loss": 5.5195,
"mean_token_accuracy": 0.16271020472049713,
"num_tokens": 16984848.0,
"step": 9200
},
{
"entropy": 5.610690355300903,
"epoch": 0.7733669397185465,
"grad_norm": 1.2578125,
"learning_rate": 0.000494656009766275,
"loss": 5.5278,
"mean_token_accuracy": 0.16337853372097016,
"num_tokens": 16993179.0,
"step": 9205
},
{
"entropy": 5.62993860244751,
"epoch": 0.7737870195337114,
"grad_norm": 1.2578125,
"learning_rate": 0.000494649519845597,
"loss": 5.536,
"mean_token_accuracy": 0.1587835118174553,
"num_tokens": 17002563.0,
"step": 9210
},
{
"entropy": 5.708981704711914,
"epoch": 0.7742070993488763,
"grad_norm": 1.25,
"learning_rate": 0.0004946430260339409,
"loss": 5.5556,
"mean_token_accuracy": 0.15450216233730316,
"num_tokens": 17011805.0,
"step": 9215
},
{
"entropy": 5.677693319320679,
"epoch": 0.7746271791640411,
"grad_norm": 1.203125,
"learning_rate": 0.0004946365283314216,
"loss": 5.5224,
"mean_token_accuracy": 0.15712636411190034,
"num_tokens": 17020398.0,
"step": 9220
},
{
"entropy": 5.605440521240235,
"epoch": 0.775047258979206,
"grad_norm": 1.1015625,
"learning_rate": 0.0004946300267381545,
"loss": 5.4969,
"mean_token_accuracy": 0.16217170357704164,
"num_tokens": 17030805.0,
"step": 9225
},
{
"entropy": 5.669513273239136,
"epoch": 0.7754673387943709,
"grad_norm": 1.4609375,
"learning_rate": 0.0004946235212542544,
"loss": 5.5344,
"mean_token_accuracy": 0.16208829283714293,
"num_tokens": 17040164.0,
"step": 9230
},
{
"entropy": 5.650531339645386,
"epoch": 0.7758874186095358,
"grad_norm": 1.296875,
"learning_rate": 0.0004946170118798367,
"loss": 5.5906,
"mean_token_accuracy": 0.15611371397972107,
"num_tokens": 17049519.0,
"step": 9235
},
{
"entropy": 5.731847667694092,
"epoch": 0.7763074984247007,
"grad_norm": 1.3203125,
"learning_rate": 0.0004946104986150167,
"loss": 5.5354,
"mean_token_accuracy": 0.15955014228820802,
"num_tokens": 17058042.0,
"step": 9240
},
{
"entropy": 5.645770788192749,
"epoch": 0.7767275782398656,
"grad_norm": 1.3515625,
"learning_rate": 0.0004946039814599099,
"loss": 5.5463,
"mean_token_accuracy": 0.16279122084379197,
"num_tokens": 17067107.0,
"step": 9245
},
{
"entropy": 5.659356498718262,
"epoch": 0.7771476580550305,
"grad_norm": 1.21875,
"learning_rate": 0.0004945974604146316,
"loss": 5.6383,
"mean_token_accuracy": 0.16182979196310043,
"num_tokens": 17076975.0,
"step": 9250
},
{
"entropy": 5.652160120010376,
"epoch": 0.7775677378701953,
"grad_norm": 1.1640625,
"learning_rate": 0.0004945909354792974,
"loss": 5.4849,
"mean_token_accuracy": 0.16177307814359665,
"num_tokens": 17086405.0,
"step": 9255
},
{
"entropy": 5.673442792892456,
"epoch": 0.7779878176853602,
"grad_norm": 1.2265625,
"learning_rate": 0.0004945844066540229,
"loss": 5.5607,
"mean_token_accuracy": 0.15728678107261657,
"num_tokens": 17095333.0,
"step": 9260
},
{
"entropy": 5.715294075012207,
"epoch": 0.7784078975005251,
"grad_norm": 1.296875,
"learning_rate": 0.0004945778739389236,
"loss": 5.6047,
"mean_token_accuracy": 0.15702673196792602,
"num_tokens": 17103631.0,
"step": 9265
},
{
"entropy": 5.694115066528321,
"epoch": 0.77882797731569,
"grad_norm": 1.2578125,
"learning_rate": 0.0004945713373341152,
"loss": 5.4989,
"mean_token_accuracy": 0.1604058101773262,
"num_tokens": 17112612.0,
"step": 9270
},
{
"entropy": 5.704887533187867,
"epoch": 0.7792480571308549,
"grad_norm": 1.1640625,
"learning_rate": 0.0004945647968397139,
"loss": 5.5416,
"mean_token_accuracy": 0.1582975372672081,
"num_tokens": 17121592.0,
"step": 9275
},
{
"entropy": 5.646542644500732,
"epoch": 0.7796681369460198,
"grad_norm": 1.21875,
"learning_rate": 0.0004945582524558352,
"loss": 5.5639,
"mean_token_accuracy": 0.16268820613622664,
"num_tokens": 17131003.0,
"step": 9280
},
{
"entropy": 5.76059775352478,
"epoch": 0.7800882167611847,
"grad_norm": 1.265625,
"learning_rate": 0.000494551704182595,
"loss": 5.568,
"mean_token_accuracy": 0.15795339047908782,
"num_tokens": 17140013.0,
"step": 9285
},
{
"entropy": 5.794629859924316,
"epoch": 0.7805082965763495,
"grad_norm": 1.21875,
"learning_rate": 0.0004945451520201095,
"loss": 5.7168,
"mean_token_accuracy": 0.1471349537372589,
"num_tokens": 17150406.0,
"step": 9290
},
{
"entropy": 5.697279930114746,
"epoch": 0.7809283763915144,
"grad_norm": 1.2890625,
"learning_rate": 0.0004945385959684947,
"loss": 5.5815,
"mean_token_accuracy": 0.15653786659240723,
"num_tokens": 17159757.0,
"step": 9295
},
{
"entropy": 5.733369779586792,
"epoch": 0.7813484562066793,
"grad_norm": 1.2421875,
"learning_rate": 0.0004945320360278667,
"loss": 5.5842,
"mean_token_accuracy": 0.16423405855894088,
"num_tokens": 17169317.0,
"step": 9300
},
{
"entropy": 5.745339679718017,
"epoch": 0.7817685360218442,
"grad_norm": 1.7265625,
"learning_rate": 0.0004945254721983416,
"loss": 5.5893,
"mean_token_accuracy": 0.16655531898140907,
"num_tokens": 17178410.0,
"step": 9305
},
{
"entropy": 5.705344915390015,
"epoch": 0.782188615837009,
"grad_norm": 1.34375,
"learning_rate": 0.000494518904480036,
"loss": 5.5004,
"mean_token_accuracy": 0.15776502788066865,
"num_tokens": 17186922.0,
"step": 9310
},
{
"entropy": 5.664468145370483,
"epoch": 0.782608695652174,
"grad_norm": 1.2265625,
"learning_rate": 0.0004945123328730659,
"loss": 5.5965,
"mean_token_accuracy": 0.1562838301062584,
"num_tokens": 17197125.0,
"step": 9315
},
{
"entropy": 5.67675747871399,
"epoch": 0.7830287754673388,
"grad_norm": 1.2421875,
"learning_rate": 0.000494505757377548,
"loss": 5.507,
"mean_token_accuracy": 0.15732436180114745,
"num_tokens": 17206169.0,
"step": 9320
},
{
"entropy": 5.616800355911255,
"epoch": 0.7834488552825036,
"grad_norm": 1.2890625,
"learning_rate": 0.0004944991779935985,
"loss": 5.4589,
"mean_token_accuracy": 0.16231919527053834,
"num_tokens": 17214607.0,
"step": 9325
},
{
"entropy": 5.607938623428344,
"epoch": 0.7838689350976685,
"grad_norm": 1.265625,
"learning_rate": 0.000494492594721334,
"loss": 5.4331,
"mean_token_accuracy": 0.16268193870782852,
"num_tokens": 17223616.0,
"step": 9330
},
{
"entropy": 5.702634334564209,
"epoch": 0.7842890149128334,
"grad_norm": 1.328125,
"learning_rate": 0.0004944860075608715,
"loss": 5.5245,
"mean_token_accuracy": 0.1614809066057205,
"num_tokens": 17232729.0,
"step": 9335
},
{
"entropy": 5.658515548706054,
"epoch": 0.7847090947279983,
"grad_norm": 1.234375,
"learning_rate": 0.0004944794165123272,
"loss": 5.5865,
"mean_token_accuracy": 0.16026565730571746,
"num_tokens": 17242128.0,
"step": 9340
},
{
"entropy": 5.650646591186524,
"epoch": 0.7851291745431632,
"grad_norm": 1.359375,
"learning_rate": 0.000494472821575818,
"loss": 5.4782,
"mean_token_accuracy": 0.16116299331188202,
"num_tokens": 17250806.0,
"step": 9345
},
{
"entropy": 5.776854419708252,
"epoch": 0.7855492543583281,
"grad_norm": 1.390625,
"learning_rate": 0.0004944662227514609,
"loss": 5.722,
"mean_token_accuracy": 0.14963266700506211,
"num_tokens": 17260888.0,
"step": 9350
},
{
"entropy": 5.670192575454712,
"epoch": 0.785969334173493,
"grad_norm": 1.2265625,
"learning_rate": 0.0004944596200393726,
"loss": 5.4813,
"mean_token_accuracy": 0.16274381577968597,
"num_tokens": 17270387.0,
"step": 9355
},
{
"entropy": 5.671835851669312,
"epoch": 0.7863894139886578,
"grad_norm": 1.4375,
"learning_rate": 0.0004944530134396702,
"loss": 5.5314,
"mean_token_accuracy": 0.16092900931835175,
"num_tokens": 17279866.0,
"step": 9360
},
{
"entropy": 5.621250009536743,
"epoch": 0.7868094938038227,
"grad_norm": 1.3125,
"learning_rate": 0.0004944464029524707,
"loss": 5.5052,
"mean_token_accuracy": 0.161967870593071,
"num_tokens": 17289233.0,
"step": 9365
},
{
"entropy": 5.738542604446411,
"epoch": 0.7872295736189876,
"grad_norm": 1.21875,
"learning_rate": 0.000494439788577891,
"loss": 5.5936,
"mean_token_accuracy": 0.15567109882831573,
"num_tokens": 17298705.0,
"step": 9370
},
{
"entropy": 5.732174921035766,
"epoch": 0.7876496534341525,
"grad_norm": 1.3125,
"learning_rate": 0.0004944331703160486,
"loss": 5.549,
"mean_token_accuracy": 0.1578393891453743,
"num_tokens": 17307793.0,
"step": 9375
},
{
"entropy": 5.651821613311768,
"epoch": 0.7880697332493174,
"grad_norm": 1.5859375,
"learning_rate": 0.0004944265481670605,
"loss": 5.6218,
"mean_token_accuracy": 0.15234800577163696,
"num_tokens": 17318248.0,
"step": 9380
},
{
"entropy": 5.704642677307129,
"epoch": 0.7884898130644823,
"grad_norm": 1.2421875,
"learning_rate": 0.0004944199221310441,
"loss": 5.5411,
"mean_token_accuracy": 0.15532419532537461,
"num_tokens": 17327281.0,
"step": 9385
},
{
"entropy": 5.700555324554443,
"epoch": 0.7889098928796471,
"grad_norm": 1.265625,
"learning_rate": 0.0004944132922081168,
"loss": 5.5445,
"mean_token_accuracy": 0.16321972906589508,
"num_tokens": 17336805.0,
"step": 9390
},
{
"entropy": 5.66338005065918,
"epoch": 0.789329972694812,
"grad_norm": 1.265625,
"learning_rate": 0.0004944066583983961,
"loss": 5.5073,
"mean_token_accuracy": 0.15633855164051055,
"num_tokens": 17346024.0,
"step": 9395
},
{
"entropy": 5.653223037719727,
"epoch": 0.7897500525099769,
"grad_norm": 1.3359375,
"learning_rate": 0.0004944000207019992,
"loss": 5.5732,
"mean_token_accuracy": 0.15641905665397643,
"num_tokens": 17355100.0,
"step": 9400
},
{
"entropy": 5.753649759292602,
"epoch": 0.7901701323251418,
"grad_norm": 1.1328125,
"learning_rate": 0.0004943933791190441,
"loss": 5.6113,
"mean_token_accuracy": 0.1497926726937294,
"num_tokens": 17364769.0,
"step": 9405
},
{
"entropy": 5.621674108505249,
"epoch": 0.7905902121403067,
"grad_norm": 1.2109375,
"learning_rate": 0.0004943867336496482,
"loss": 5.4727,
"mean_token_accuracy": 0.16464877128601074,
"num_tokens": 17374082.0,
"step": 9410
},
{
"entropy": 5.578705501556397,
"epoch": 0.7910102919554716,
"grad_norm": 1.2109375,
"learning_rate": 0.0004943800842939293,
"loss": 5.5156,
"mean_token_accuracy": 0.16334666460752487,
"num_tokens": 17383570.0,
"step": 9415
},
{
"entropy": 5.728313684463501,
"epoch": 0.7914303717706365,
"grad_norm": 1.3046875,
"learning_rate": 0.000494373431052005,
"loss": 5.5246,
"mean_token_accuracy": 0.1631974011659622,
"num_tokens": 17392105.0,
"step": 9420
},
{
"entropy": 5.636175203323364,
"epoch": 0.7918504515858013,
"grad_norm": 1.3203125,
"learning_rate": 0.0004943667739239935,
"loss": 5.4982,
"mean_token_accuracy": 0.15721355080604554,
"num_tokens": 17401363.0,
"step": 9425
},
{
"entropy": 5.671138334274292,
"epoch": 0.7922705314009661,
"grad_norm": 1.1796875,
"learning_rate": 0.0004943601129100125,
"loss": 5.4985,
"mean_token_accuracy": 0.16342997252941133,
"num_tokens": 17411333.0,
"step": 9430
},
{
"entropy": 5.735333776473999,
"epoch": 0.792690611216131,
"grad_norm": 1.3046875,
"learning_rate": 0.0004943534480101801,
"loss": 5.5707,
"mean_token_accuracy": 0.16346004083752633,
"num_tokens": 17421162.0,
"step": 9435
},
{
"entropy": 5.659286451339722,
"epoch": 0.793110691031296,
"grad_norm": 1.328125,
"learning_rate": 0.0004943467792246142,
"loss": 5.5134,
"mean_token_accuracy": 0.16254661232233047,
"num_tokens": 17430119.0,
"step": 9440
},
{
"entropy": 5.65619683265686,
"epoch": 0.7935307708464608,
"grad_norm": 1.34375,
"learning_rate": 0.0004943401065534332,
"loss": 5.5137,
"mean_token_accuracy": 0.15908659845590592,
"num_tokens": 17439617.0,
"step": 9445
},
{
"entropy": 5.591221809387207,
"epoch": 0.7939508506616257,
"grad_norm": 1.1796875,
"learning_rate": 0.0004943334299967551,
"loss": 5.6362,
"mean_token_accuracy": 0.1572137139737606,
"num_tokens": 17448720.0,
"step": 9450
},
{
"entropy": 5.676210021972656,
"epoch": 0.7943709304767906,
"grad_norm": 1.203125,
"learning_rate": 0.0004943267495546982,
"loss": 5.5135,
"mean_token_accuracy": 0.16351162791252136,
"num_tokens": 17457458.0,
"step": 9455
},
{
"entropy": 5.732522106170654,
"epoch": 0.7947910102919554,
"grad_norm": 1.2421875,
"learning_rate": 0.0004943200652273809,
"loss": 5.5481,
"mean_token_accuracy": 0.16332785785198212,
"num_tokens": 17467095.0,
"step": 9460
},
{
"entropy": 5.6177106380462645,
"epoch": 0.7952110901071203,
"grad_norm": 1.34375,
"learning_rate": 0.0004943133770149216,
"loss": 5.5644,
"mean_token_accuracy": 0.15639227479696274,
"num_tokens": 17476247.0,
"step": 9465
},
{
"entropy": 5.722067785263062,
"epoch": 0.7956311699222852,
"grad_norm": 1.1875,
"learning_rate": 0.0004943066849174386,
"loss": 5.5801,
"mean_token_accuracy": 0.16233462244272232,
"num_tokens": 17486352.0,
"step": 9470
},
{
"entropy": 5.737708044052124,
"epoch": 0.7960512497374501,
"grad_norm": 1.2265625,
"learning_rate": 0.0004942999889350508,
"loss": 5.5497,
"mean_token_accuracy": 0.15952325612306595,
"num_tokens": 17495633.0,
"step": 9475
},
{
"entropy": 5.705710554122925,
"epoch": 0.796471329552615,
"grad_norm": 1.3046875,
"learning_rate": 0.0004942932890678765,
"loss": 5.5931,
"mean_token_accuracy": 0.1573889285326004,
"num_tokens": 17504325.0,
"step": 9480
},
{
"entropy": 5.69957857131958,
"epoch": 0.7968914093677799,
"grad_norm": 1.375,
"learning_rate": 0.0004942865853160346,
"loss": 5.5992,
"mean_token_accuracy": 0.1549372524023056,
"num_tokens": 17513265.0,
"step": 9485
},
{
"entropy": 5.726451587677002,
"epoch": 0.7973114891829448,
"grad_norm": 1.234375,
"learning_rate": 0.0004942798776796436,
"loss": 5.6099,
"mean_token_accuracy": 0.15643575862050058,
"num_tokens": 17522939.0,
"step": 9490
},
{
"entropy": 5.747159051895141,
"epoch": 0.7977315689981096,
"grad_norm": 1.3984375,
"learning_rate": 0.0004942731661588226,
"loss": 5.6201,
"mean_token_accuracy": 0.15074056014418602,
"num_tokens": 17532250.0,
"step": 9495
},
{
"entropy": 5.744595193862915,
"epoch": 0.7981516488132745,
"grad_norm": 1.4921875,
"learning_rate": 0.0004942664507536904,
"loss": 5.6216,
"mean_token_accuracy": 0.16153676807880402,
"num_tokens": 17541368.0,
"step": 9500
},
{
"entropy": 5.665510129928589,
"epoch": 0.7985717286284394,
"grad_norm": 1.6484375,
"learning_rate": 0.0004942597314643659,
"loss": 5.5775,
"mean_token_accuracy": 0.15800571888685228,
"num_tokens": 17550871.0,
"step": 9505
},
{
"entropy": 5.659941339492798,
"epoch": 0.7989918084436043,
"grad_norm": 1.25,
"learning_rate": 0.0004942530082909681,
"loss": 5.4959,
"mean_token_accuracy": 0.1667015627026558,
"num_tokens": 17559683.0,
"step": 9510
},
{
"entropy": 5.669128704071045,
"epoch": 0.7994118882587692,
"grad_norm": 1.5078125,
"learning_rate": 0.0004942462812336163,
"loss": 5.5025,
"mean_token_accuracy": 0.16116804033517837,
"num_tokens": 17568877.0,
"step": 9515
},
{
"entropy": 5.7886709690094,
"epoch": 0.7998319680739341,
"grad_norm": 1.25,
"learning_rate": 0.0004942395502924293,
"loss": 5.6565,
"mean_token_accuracy": 0.15164065062999726,
"num_tokens": 17578202.0,
"step": 9520
},
{
"entropy": 5.6807924747467045,
"epoch": 0.800252047889099,
"grad_norm": 1.34375,
"learning_rate": 0.0004942328154675268,
"loss": 5.4977,
"mean_token_accuracy": 0.1627306804060936,
"num_tokens": 17587342.0,
"step": 9525
},
{
"entropy": 5.630272340774536,
"epoch": 0.8006721277042638,
"grad_norm": 1.2890625,
"learning_rate": 0.0004942260767590277,
"loss": 5.3614,
"mean_token_accuracy": 0.16487462520599366,
"num_tokens": 17595671.0,
"step": 9530
},
{
"entropy": 5.690436267852784,
"epoch": 0.8010922075194287,
"grad_norm": 1.2890625,
"learning_rate": 0.0004942193341670516,
"loss": 5.6735,
"mean_token_accuracy": 0.15220091938972474,
"num_tokens": 17605649.0,
"step": 9535
},
{
"entropy": 5.691447019577026,
"epoch": 0.8015122873345936,
"grad_norm": 1.25,
"learning_rate": 0.0004942125876917178,
"loss": 5.5799,
"mean_token_accuracy": 0.15388443917036057,
"num_tokens": 17615286.0,
"step": 9540
},
{
"entropy": 5.613958406448364,
"epoch": 0.8019323671497585,
"grad_norm": 1.2109375,
"learning_rate": 0.000494205837333146,
"loss": 5.552,
"mean_token_accuracy": 0.1583222895860672,
"num_tokens": 17624583.0,
"step": 9545
},
{
"entropy": 5.717194938659668,
"epoch": 0.8023524469649234,
"grad_norm": 1.6875,
"learning_rate": 0.0004941990830914557,
"loss": 5.5296,
"mean_token_accuracy": 0.16425732970237733,
"num_tokens": 17633894.0,
"step": 9550
},
{
"entropy": 5.757351922988891,
"epoch": 0.8027725267800883,
"grad_norm": 1.2265625,
"learning_rate": 0.0004941923249667663,
"loss": 5.6131,
"mean_token_accuracy": 0.15476072132587432,
"num_tokens": 17643172.0,
"step": 9555
},
{
"entropy": 5.701081371307373,
"epoch": 0.803192606595253,
"grad_norm": 1.2734375,
"learning_rate": 0.0004941855629591979,
"loss": 5.5169,
"mean_token_accuracy": 0.15671369135379792,
"num_tokens": 17651901.0,
"step": 9560
},
{
"entropy": 5.622681045532227,
"epoch": 0.8036126864104179,
"grad_norm": 1.1640625,
"learning_rate": 0.0004941787970688701,
"loss": 5.5049,
"mean_token_accuracy": 0.16184851676225662,
"num_tokens": 17660806.0,
"step": 9565
},
{
"entropy": 5.761856603622436,
"epoch": 0.8040327662255828,
"grad_norm": 1.3046875,
"learning_rate": 0.0004941720272959027,
"loss": 5.5699,
"mean_token_accuracy": 0.16645232439041138,
"num_tokens": 17669157.0,
"step": 9570
},
{
"entropy": 5.619096279144287,
"epoch": 0.8044528460407477,
"grad_norm": 1.2265625,
"learning_rate": 0.0004941652536404157,
"loss": 5.4943,
"mean_token_accuracy": 0.15924308001995086,
"num_tokens": 17678664.0,
"step": 9575
},
{
"entropy": 5.673613357543945,
"epoch": 0.8048729258559126,
"grad_norm": 1.3359375,
"learning_rate": 0.0004941584761025291,
"loss": 5.5281,
"mean_token_accuracy": 0.1605657756328583,
"num_tokens": 17688252.0,
"step": 9580
},
{
"entropy": 5.641302871704101,
"epoch": 0.8052930056710775,
"grad_norm": 1.3515625,
"learning_rate": 0.000494151694682363,
"loss": 5.5365,
"mean_token_accuracy": 0.16418962329626083,
"num_tokens": 17696473.0,
"step": 9585
},
{
"entropy": 5.629877805709839,
"epoch": 0.8057130854862424,
"grad_norm": 1.2578125,
"learning_rate": 0.0004941449093800374,
"loss": 5.5673,
"mean_token_accuracy": 0.15728162452578545,
"num_tokens": 17706177.0,
"step": 9590
},
{
"entropy": 5.620923471450806,
"epoch": 0.8061331653014072,
"grad_norm": 1.34375,
"learning_rate": 0.0004941381201956726,
"loss": 5.4071,
"mean_token_accuracy": 0.1691529244184494,
"num_tokens": 17715355.0,
"step": 9595
},
{
"entropy": 5.623278331756592,
"epoch": 0.8065532451165721,
"grad_norm": 1.296875,
"learning_rate": 0.0004941313271293889,
"loss": 5.5008,
"mean_token_accuracy": 0.16670178472995759,
"num_tokens": 17724345.0,
"step": 9600
},
{
"entropy": 5.654106616973877,
"epoch": 0.806973324931737,
"grad_norm": 1.3046875,
"learning_rate": 0.0004941245301813065,
"loss": 5.4473,
"mean_token_accuracy": 0.17143428921699524,
"num_tokens": 17732805.0,
"step": 9605
},
{
"entropy": 5.620670795440674,
"epoch": 0.8073934047469019,
"grad_norm": 1.3046875,
"learning_rate": 0.0004941177293515459,
"loss": 5.4774,
"mean_token_accuracy": 0.16311821192502976,
"num_tokens": 17741963.0,
"step": 9610
},
{
"entropy": 5.5626294136047365,
"epoch": 0.8078134845620668,
"grad_norm": 1.3359375,
"learning_rate": 0.0004941109246402275,
"loss": 5.5126,
"mean_token_accuracy": 0.15953077971935273,
"num_tokens": 17751858.0,
"step": 9615
},
{
"entropy": 5.737004566192627,
"epoch": 0.8082335643772317,
"grad_norm": 1.265625,
"learning_rate": 0.0004941041160474721,
"loss": 5.6139,
"mean_token_accuracy": 0.15390506833791734,
"num_tokens": 17761152.0,
"step": 9620
},
{
"entropy": 5.729790878295899,
"epoch": 0.8086536441923966,
"grad_norm": 1.4140625,
"learning_rate": 0.0004940973035733999,
"loss": 5.5742,
"mean_token_accuracy": 0.16010648310184478,
"num_tokens": 17770493.0,
"step": 9625
},
{
"entropy": 5.743511009216308,
"epoch": 0.8090737240075614,
"grad_norm": 1.265625,
"learning_rate": 0.0004940904872181318,
"loss": 5.5539,
"mean_token_accuracy": 0.15472937971353531,
"num_tokens": 17779871.0,
"step": 9630
},
{
"entropy": 5.71267032623291,
"epoch": 0.8094938038227263,
"grad_norm": 1.296875,
"learning_rate": 0.0004940836669817887,
"loss": 5.5842,
"mean_token_accuracy": 0.15565359741449356,
"num_tokens": 17788606.0,
"step": 9635
},
{
"entropy": 5.613599967956543,
"epoch": 0.8099138836378912,
"grad_norm": 1.2734375,
"learning_rate": 0.0004940768428644911,
"loss": 5.5147,
"mean_token_accuracy": 0.16102469265460967,
"num_tokens": 17797458.0,
"step": 9640
},
{
"entropy": 5.596334409713745,
"epoch": 0.8103339634530561,
"grad_norm": 1.234375,
"learning_rate": 0.0004940700148663601,
"loss": 5.4564,
"mean_token_accuracy": 0.1620950683951378,
"num_tokens": 17806902.0,
"step": 9645
},
{
"entropy": 5.73327784538269,
"epoch": 0.810754043268221,
"grad_norm": 1.296875,
"learning_rate": 0.0004940631829875165,
"loss": 5.591,
"mean_token_accuracy": 0.16165916323661805,
"num_tokens": 17816374.0,
"step": 9650
},
{
"entropy": 5.712001848220825,
"epoch": 0.8111741230833859,
"grad_norm": 1.390625,
"learning_rate": 0.0004940563472280815,
"loss": 5.5728,
"mean_token_accuracy": 0.1657658874988556,
"num_tokens": 17825267.0,
"step": 9655
},
{
"entropy": 5.610749959945679,
"epoch": 0.8115942028985508,
"grad_norm": 1.2734375,
"learning_rate": 0.0004940495075881761,
"loss": 5.4791,
"mean_token_accuracy": 0.16635317206382752,
"num_tokens": 17834027.0,
"step": 9660
},
{
"entropy": 5.574903249740601,
"epoch": 0.8120142827137156,
"grad_norm": 1.34375,
"learning_rate": 0.0004940426640679214,
"loss": 5.4839,
"mean_token_accuracy": 0.1606460615992546,
"num_tokens": 17843587.0,
"step": 9665
},
{
"entropy": 5.689661979675293,
"epoch": 0.8124343625288805,
"grad_norm": 1.3359375,
"learning_rate": 0.0004940358166674388,
"loss": 5.5224,
"mean_token_accuracy": 0.16242198795080184,
"num_tokens": 17852284.0,
"step": 9670
},
{
"entropy": 5.723747253417969,
"epoch": 0.8128544423440454,
"grad_norm": 1.34375,
"learning_rate": 0.0004940289653868494,
"loss": 5.5335,
"mean_token_accuracy": 0.16003253161907197,
"num_tokens": 17860896.0,
"step": 9675
},
{
"entropy": 5.5253918170928955,
"epoch": 0.8132745221592103,
"grad_norm": 1.2578125,
"learning_rate": 0.0004940221102262747,
"loss": 5.5117,
"mean_token_accuracy": 0.15657734125852585,
"num_tokens": 17870796.0,
"step": 9680
},
{
"entropy": 5.698360538482666,
"epoch": 0.8136946019743752,
"grad_norm": 1.265625,
"learning_rate": 0.0004940152511858361,
"loss": 5.588,
"mean_token_accuracy": 0.15942368805408477,
"num_tokens": 17880016.0,
"step": 9685
},
{
"entropy": 5.813475561141968,
"epoch": 0.81411468178954,
"grad_norm": 1.3125,
"learning_rate": 0.0004940083882656551,
"loss": 5.6518,
"mean_token_accuracy": 0.15555018186569214,
"num_tokens": 17889348.0,
"step": 9690
},
{
"entropy": 5.644897079467773,
"epoch": 0.814534761604705,
"grad_norm": 1.265625,
"learning_rate": 0.0004940015214658532,
"loss": 5.5043,
"mean_token_accuracy": 0.1621764436364174,
"num_tokens": 17898392.0,
"step": 9695
},
{
"entropy": 5.6429681301116945,
"epoch": 0.8149548414198697,
"grad_norm": 1.4921875,
"learning_rate": 0.0004939946507865522,
"loss": 5.5817,
"mean_token_accuracy": 0.16317397505044937,
"num_tokens": 17907141.0,
"step": 9700
},
{
"entropy": 5.671615123748779,
"epoch": 0.8153749212350346,
"grad_norm": 1.21875,
"learning_rate": 0.0004939877762278737,
"loss": 5.4645,
"mean_token_accuracy": 0.16062404215335846,
"num_tokens": 17915792.0,
"step": 9705
},
{
"entropy": 5.751763725280762,
"epoch": 0.8157950010501995,
"grad_norm": 1.7265625,
"learning_rate": 0.0004939808977899396,
"loss": 5.6207,
"mean_token_accuracy": 0.1522047832608223,
"num_tokens": 17925603.0,
"step": 9710
},
{
"entropy": 5.697290706634521,
"epoch": 0.8162150808653644,
"grad_norm": 1.296875,
"learning_rate": 0.0004939740154728716,
"loss": 5.5698,
"mean_token_accuracy": 0.16246933341026307,
"num_tokens": 17934436.0,
"step": 9715
},
{
"entropy": 5.676386976242066,
"epoch": 0.8166351606805293,
"grad_norm": 1.3984375,
"learning_rate": 0.0004939671292767915,
"loss": 5.5134,
"mean_token_accuracy": 0.17046479880809784,
"num_tokens": 17942969.0,
"step": 9720
},
{
"entropy": 5.714960718154908,
"epoch": 0.8170552404956942,
"grad_norm": 1.3046875,
"learning_rate": 0.0004939602392018216,
"loss": 5.5926,
"mean_token_accuracy": 0.16091118156909942,
"num_tokens": 17952053.0,
"step": 9725
},
{
"entropy": 5.675097370147705,
"epoch": 0.817475320310859,
"grad_norm": 1.34375,
"learning_rate": 0.0004939533452480839,
"loss": 5.5554,
"mean_token_accuracy": 0.1619985356926918,
"num_tokens": 17960707.0,
"step": 9730
},
{
"entropy": 5.7456968307495115,
"epoch": 0.8178954001260239,
"grad_norm": 1.390625,
"learning_rate": 0.0004939464474157003,
"loss": 5.6414,
"mean_token_accuracy": 0.1500391572713852,
"num_tokens": 17971035.0,
"step": 9735
},
{
"entropy": 5.731802082061767,
"epoch": 0.8183154799411888,
"grad_norm": 1.3828125,
"learning_rate": 0.0004939395457047932,
"loss": 5.5165,
"mean_token_accuracy": 0.15629953145980835,
"num_tokens": 17980656.0,
"step": 9740
},
{
"entropy": 5.7165204048156735,
"epoch": 0.8187355597563537,
"grad_norm": 1.2109375,
"learning_rate": 0.0004939326401154847,
"loss": 5.5595,
"mean_token_accuracy": 0.15109186619520187,
"num_tokens": 17990977.0,
"step": 9745
},
{
"entropy": 5.62971863746643,
"epoch": 0.8191556395715186,
"grad_norm": 1.390625,
"learning_rate": 0.0004939257306478973,
"loss": 5.5567,
"mean_token_accuracy": 0.16121531277894974,
"num_tokens": 18000186.0,
"step": 9750
},
{
"entropy": 5.70962438583374,
"epoch": 0.8195757193866835,
"grad_norm": 1.2265625,
"learning_rate": 0.0004939188173021532,
"loss": 5.5371,
"mean_token_accuracy": 0.1581372946500778,
"num_tokens": 18010269.0,
"step": 9755
},
{
"entropy": 5.664460325241089,
"epoch": 0.8199957992018484,
"grad_norm": 1.3515625,
"learning_rate": 0.0004939119000783751,
"loss": 5.472,
"mean_token_accuracy": 0.1687139466404915,
"num_tokens": 18018461.0,
"step": 9760
},
{
"entropy": 5.625735569000244,
"epoch": 0.8204158790170132,
"grad_norm": 1.3125,
"learning_rate": 0.0004939049789766855,
"loss": 5.4871,
"mean_token_accuracy": 0.1622154951095581,
"num_tokens": 18027173.0,
"step": 9765
},
{
"entropy": 5.623517990112305,
"epoch": 0.8208359588321781,
"grad_norm": 1.2265625,
"learning_rate": 0.0004938980539972068,
"loss": 5.5813,
"mean_token_accuracy": 0.15948344767093658,
"num_tokens": 18036791.0,
"step": 9770
},
{
"entropy": 5.608444881439209,
"epoch": 0.821256038647343,
"grad_norm": 1.390625,
"learning_rate": 0.0004938911251400617,
"loss": 5.5246,
"mean_token_accuracy": 0.16499666422605513,
"num_tokens": 18046908.0,
"step": 9775
},
{
"entropy": 5.666135549545288,
"epoch": 0.8216761184625079,
"grad_norm": 1.46875,
"learning_rate": 0.0004938841924053731,
"loss": 5.4648,
"mean_token_accuracy": 0.16549027860164642,
"num_tokens": 18055825.0,
"step": 9780
},
{
"entropy": 5.765940713882446,
"epoch": 0.8220961982776728,
"grad_norm": 1.5390625,
"learning_rate": 0.0004938772557932637,
"loss": 5.5851,
"mean_token_accuracy": 0.1571800611913204,
"num_tokens": 18065334.0,
"step": 9785
},
{
"entropy": 5.630688571929932,
"epoch": 0.8225162780928377,
"grad_norm": 1.21875,
"learning_rate": 0.0004938703153038565,
"loss": 5.4912,
"mean_token_accuracy": 0.16764561533927919,
"num_tokens": 18073999.0,
"step": 9790
},
{
"entropy": 5.563945531845093,
"epoch": 0.8229363579080026,
"grad_norm": 1.2734375,
"learning_rate": 0.0004938633709372744,
"loss": 5.5292,
"mean_token_accuracy": 0.15778465792536736,
"num_tokens": 18083665.0,
"step": 9795
},
{
"entropy": 5.66337776184082,
"epoch": 0.8233564377231674,
"grad_norm": 1.3046875,
"learning_rate": 0.0004938564226936403,
"loss": 5.5062,
"mean_token_accuracy": 0.16438312083482742,
"num_tokens": 18092501.0,
"step": 9800
},
{
"entropy": 5.64669451713562,
"epoch": 0.8237765175383323,
"grad_norm": 1.5625,
"learning_rate": 0.0004938494705730773,
"loss": 5.5221,
"mean_token_accuracy": 0.15949836522340774,
"num_tokens": 18101320.0,
"step": 9805
},
{
"entropy": 5.626850128173828,
"epoch": 0.8241965973534972,
"grad_norm": 1.3203125,
"learning_rate": 0.0004938425145757087,
"loss": 5.5371,
"mean_token_accuracy": 0.15716064274311065,
"num_tokens": 18110190.0,
"step": 9810
},
{
"entropy": 5.680644369125366,
"epoch": 0.824616677168662,
"grad_norm": 1.359375,
"learning_rate": 0.0004938355547016577,
"loss": 5.5127,
"mean_token_accuracy": 0.16249138861894608,
"num_tokens": 18119301.0,
"step": 9815
},
{
"entropy": 5.7193972110748295,
"epoch": 0.825036756983827,
"grad_norm": 1.25,
"learning_rate": 0.0004938285909510474,
"loss": 5.5758,
"mean_token_accuracy": 0.1582377091050148,
"num_tokens": 18128959.0,
"step": 9820
},
{
"entropy": 5.6662201404571535,
"epoch": 0.8254568367989918,
"grad_norm": 1.296875,
"learning_rate": 0.0004938216233240014,
"loss": 5.5502,
"mean_token_accuracy": 0.1582813560962677,
"num_tokens": 18138156.0,
"step": 9825
},
{
"entropy": 5.727919578552246,
"epoch": 0.8258769166141567,
"grad_norm": 1.359375,
"learning_rate": 0.000493814651820643,
"loss": 5.585,
"mean_token_accuracy": 0.15460814982652665,
"num_tokens": 18147244.0,
"step": 9830
},
{
"entropy": 5.665496778488159,
"epoch": 0.8262969964293215,
"grad_norm": 1.2109375,
"learning_rate": 0.0004938076764410956,
"loss": 5.5885,
"mean_token_accuracy": 0.15983546376228333,
"num_tokens": 18156040.0,
"step": 9835
},
{
"entropy": 5.732932662963867,
"epoch": 0.8267170762444864,
"grad_norm": 1.625,
"learning_rate": 0.000493800697185483,
"loss": 5.517,
"mean_token_accuracy": 0.1574488326907158,
"num_tokens": 18165210.0,
"step": 9840
},
{
"entropy": 5.734502363204956,
"epoch": 0.8271371560596513,
"grad_norm": 1.2578125,
"learning_rate": 0.0004937937140539288,
"loss": 5.58,
"mean_token_accuracy": 0.15782398730516434,
"num_tokens": 18174841.0,
"step": 9845
},
{
"entropy": 5.62280797958374,
"epoch": 0.8275572358748162,
"grad_norm": 2.78125,
"learning_rate": 0.0004937867270465564,
"loss": 5.4522,
"mean_token_accuracy": 0.16294290423393248,
"num_tokens": 18184112.0,
"step": 9850
},
{
"entropy": 5.742010974884034,
"epoch": 0.8279773156899811,
"grad_norm": 1.28125,
"learning_rate": 0.0004937797361634899,
"loss": 5.6596,
"mean_token_accuracy": 0.15312351733446122,
"num_tokens": 18193564.0,
"step": 9855
},
{
"entropy": 5.61360445022583,
"epoch": 0.828397395505146,
"grad_norm": 1.3515625,
"learning_rate": 0.000493772741404853,
"loss": 5.3811,
"mean_token_accuracy": 0.1657651409506798,
"num_tokens": 18202836.0,
"step": 9860
},
{
"entropy": 5.662469673156738,
"epoch": 0.8288174753203108,
"grad_norm": 1.3828125,
"learning_rate": 0.0004937657427707698,
"loss": 5.504,
"mean_token_accuracy": 0.17115799337625504,
"num_tokens": 18212098.0,
"step": 9865
},
{
"entropy": 5.703630208969116,
"epoch": 0.8292375551354757,
"grad_norm": 1.28125,
"learning_rate": 0.0004937587402613639,
"loss": 5.5471,
"mean_token_accuracy": 0.16153400242328644,
"num_tokens": 18221541.0,
"step": 9870
},
{
"entropy": 5.612545967102051,
"epoch": 0.8296576349506406,
"grad_norm": 1.4609375,
"learning_rate": 0.0004937517338767597,
"loss": 5.536,
"mean_token_accuracy": 0.15903571546077727,
"num_tokens": 18231015.0,
"step": 9875
},
{
"entropy": 5.756252431869507,
"epoch": 0.8300777147658055,
"grad_norm": 1.375,
"learning_rate": 0.0004937447236170811,
"loss": 5.5685,
"mean_token_accuracy": 0.15943876206874846,
"num_tokens": 18239729.0,
"step": 9880
},
{
"entropy": 5.7537188053131105,
"epoch": 0.8304977945809704,
"grad_norm": 1.3046875,
"learning_rate": 0.0004937377094824523,
"loss": 5.6072,
"mean_token_accuracy": 0.15652247965335847,
"num_tokens": 18249773.0,
"step": 9885
},
{
"entropy": 5.684403038024902,
"epoch": 0.8309178743961353,
"grad_norm": 1.375,
"learning_rate": 0.0004937306914729977,
"loss": 5.5693,
"mean_token_accuracy": 0.158055904507637,
"num_tokens": 18259179.0,
"step": 9890
},
{
"entropy": 5.591420412063599,
"epoch": 0.8313379542113002,
"grad_norm": 1.2421875,
"learning_rate": 0.0004937236695888416,
"loss": 5.4445,
"mean_token_accuracy": 0.17210163027048112,
"num_tokens": 18268164.0,
"step": 9895
},
{
"entropy": 5.726157283782959,
"epoch": 0.831758034026465,
"grad_norm": 1.328125,
"learning_rate": 0.0004937166438301082,
"loss": 5.6133,
"mean_token_accuracy": 0.16078757047653197,
"num_tokens": 18276259.0,
"step": 9900
},
{
"entropy": 5.666176176071167,
"epoch": 0.8321781138416299,
"grad_norm": 1.3046875,
"learning_rate": 0.0004937096141969221,
"loss": 5.5998,
"mean_token_accuracy": 0.1596865251660347,
"num_tokens": 18285729.0,
"step": 9905
},
{
"entropy": 5.763876485824585,
"epoch": 0.8325981936567948,
"grad_norm": 1.2578125,
"learning_rate": 0.0004937025806894077,
"loss": 5.7629,
"mean_token_accuracy": 0.1466881103813648,
"num_tokens": 18295873.0,
"step": 9910
},
{
"entropy": 5.774165105819702,
"epoch": 0.8330182734719597,
"grad_norm": 1.2421875,
"learning_rate": 0.0004936955433076899,
"loss": 5.572,
"mean_token_accuracy": 0.16771850138902664,
"num_tokens": 18305135.0,
"step": 9915
},
{
"entropy": 5.704055881500244,
"epoch": 0.8334383532871246,
"grad_norm": 1.234375,
"learning_rate": 0.000493688502051893,
"loss": 5.6165,
"mean_token_accuracy": 0.15885019153356553,
"num_tokens": 18314251.0,
"step": 9920
},
{
"entropy": 5.626564168930054,
"epoch": 0.8338584331022895,
"grad_norm": 1.46875,
"learning_rate": 0.0004936814569221421,
"loss": 5.4544,
"mean_token_accuracy": 0.17203571647405624,
"num_tokens": 18322863.0,
"step": 9925
},
{
"entropy": 5.6520676612854,
"epoch": 0.8342785129174544,
"grad_norm": 1.2734375,
"learning_rate": 0.0004936744079185616,
"loss": 5.4672,
"mean_token_accuracy": 0.16363800168037415,
"num_tokens": 18332129.0,
"step": 9930
},
{
"entropy": 5.751771402359009,
"epoch": 0.8346985927326191,
"grad_norm": 1.25,
"learning_rate": 0.0004936673550412767,
"loss": 5.5544,
"mean_token_accuracy": 0.1615679681301117,
"num_tokens": 18341457.0,
"step": 9935
},
{
"entropy": 5.67233624458313,
"epoch": 0.835118672547784,
"grad_norm": 1.2734375,
"learning_rate": 0.000493660298290412,
"loss": 5.5372,
"mean_token_accuracy": 0.15321808978915213,
"num_tokens": 18351397.0,
"step": 9940
},
{
"entropy": 5.660567092895508,
"epoch": 0.8355387523629489,
"grad_norm": 1.2265625,
"learning_rate": 0.0004936532376660929,
"loss": 5.4801,
"mean_token_accuracy": 0.16411369144916535,
"num_tokens": 18360005.0,
"step": 9945
},
{
"entropy": 5.729122304916382,
"epoch": 0.8359588321781138,
"grad_norm": 1.296875,
"learning_rate": 0.0004936461731684442,
"loss": 5.5685,
"mean_token_accuracy": 0.1632169410586357,
"num_tokens": 18369707.0,
"step": 9950
},
{
"entropy": 5.814248561859131,
"epoch": 0.8363789119932787,
"grad_norm": 1.375,
"learning_rate": 0.0004936391047975912,
"loss": 5.6991,
"mean_token_accuracy": 0.15527767241001128,
"num_tokens": 18379514.0,
"step": 9955
},
{
"entropy": 5.689925670623779,
"epoch": 0.8367989918084436,
"grad_norm": 1.15625,
"learning_rate": 0.0004936320325536589,
"loss": 5.4191,
"mean_token_accuracy": 0.16286465376615525,
"num_tokens": 18388854.0,
"step": 9960
},
{
"entropy": 5.72346682548523,
"epoch": 0.8372190716236085,
"grad_norm": 1.2890625,
"learning_rate": 0.0004936249564367729,
"loss": 5.5954,
"mean_token_accuracy": 0.15776865184307098,
"num_tokens": 18397806.0,
"step": 9965
},
{
"entropy": 5.619910860061646,
"epoch": 0.8376391514387733,
"grad_norm": 1.4140625,
"learning_rate": 0.0004936178764470583,
"loss": 5.4444,
"mean_token_accuracy": 0.16496019810438156,
"num_tokens": 18406645.0,
"step": 9970
},
{
"entropy": 5.587065267562866,
"epoch": 0.8380592312539382,
"grad_norm": 1.3046875,
"learning_rate": 0.0004936107925846405,
"loss": 5.4584,
"mean_token_accuracy": 0.16309986114501954,
"num_tokens": 18415730.0,
"step": 9975
},
{
"entropy": 5.6642388820648195,
"epoch": 0.8384793110691031,
"grad_norm": 1.4375,
"learning_rate": 0.0004936037048496452,
"loss": 5.5638,
"mean_token_accuracy": 0.16353996396064757,
"num_tokens": 18424638.0,
"step": 9980
},
{
"entropy": 5.693403434753418,
"epoch": 0.838899390884268,
"grad_norm": 1.2578125,
"learning_rate": 0.0004935966132421977,
"loss": 5.6035,
"mean_token_accuracy": 0.15765634179115295,
"num_tokens": 18434090.0,
"step": 9985
},
{
"entropy": 5.589950704574585,
"epoch": 0.8393194706994329,
"grad_norm": 1.5546875,
"learning_rate": 0.0004935895177624239,
"loss": 5.4491,
"mean_token_accuracy": 0.16379072219133378,
"num_tokens": 18442965.0,
"step": 9990
},
{
"entropy": 5.756886720657349,
"epoch": 0.8397395505145978,
"grad_norm": 1.3671875,
"learning_rate": 0.0004935824184104493,
"loss": 5.4814,
"mean_token_accuracy": 0.1661657601594925,
"num_tokens": 18451553.0,
"step": 9995
},
{
"entropy": 5.6218055248260494,
"epoch": 0.8401596303297627,
"grad_norm": 1.375,
"learning_rate": 0.0004935753151863997,
"loss": 5.5372,
"mean_token_accuracy": 0.15614725649356842,
"num_tokens": 18461325.0,
"step": 10000
},
{
"entropy": 5.705133724212646,
"epoch": 0.8405797101449275,
"grad_norm": 1.4375,
"learning_rate": 0.0004935682080904009,
"loss": 5.5278,
"mean_token_accuracy": 0.16865952163934708,
"num_tokens": 18469977.0,
"step": 10005
},
{
"entropy": 5.704347705841064,
"epoch": 0.8409997899600924,
"grad_norm": 1.3125,
"learning_rate": 0.0004935610971225789,
"loss": 5.489,
"mean_token_accuracy": 0.16329597383737565,
"num_tokens": 18479534.0,
"step": 10010
},
{
"entropy": 5.642335319519043,
"epoch": 0.8414198697752573,
"grad_norm": 1.578125,
"learning_rate": 0.0004935539822830597,
"loss": 5.6098,
"mean_token_accuracy": 0.15342649221420288,
"num_tokens": 18488800.0,
"step": 10015
},
{
"entropy": 5.700902891159058,
"epoch": 0.8418399495904222,
"grad_norm": 1.375,
"learning_rate": 0.000493546863571969,
"loss": 5.5783,
"mean_token_accuracy": 0.16182019114494323,
"num_tokens": 18498083.0,
"step": 10020
},
{
"entropy": 5.679955768585205,
"epoch": 0.8422600294055871,
"grad_norm": 1.296875,
"learning_rate": 0.0004935397409894333,
"loss": 5.5258,
"mean_token_accuracy": 0.15353272706270218,
"num_tokens": 18508265.0,
"step": 10025
},
{
"entropy": 5.6730622291564945,
"epoch": 0.842680109220752,
"grad_norm": 1.2734375,
"learning_rate": 0.0004935326145355787,
"loss": 5.5537,
"mean_token_accuracy": 0.1579386070370674,
"num_tokens": 18517283.0,
"step": 10030
},
{
"entropy": 5.7045755863189695,
"epoch": 0.8431001890359168,
"grad_norm": 1.2734375,
"learning_rate": 0.0004935254842105311,
"loss": 5.5668,
"mean_token_accuracy": 0.16146936714649202,
"num_tokens": 18526482.0,
"step": 10035
},
{
"entropy": 5.603598499298096,
"epoch": 0.8435202688510817,
"grad_norm": 1.4140625,
"learning_rate": 0.0004935183500144173,
"loss": 5.3965,
"mean_token_accuracy": 0.1751741036772728,
"num_tokens": 18536150.0,
"step": 10040
},
{
"entropy": 5.746985244750976,
"epoch": 0.8439403486662466,
"grad_norm": 1.3671875,
"learning_rate": 0.0004935112119473634,
"loss": 5.6208,
"mean_token_accuracy": 0.15574218332767487,
"num_tokens": 18545168.0,
"step": 10045
},
{
"entropy": 5.681631946563721,
"epoch": 0.8443604284814115,
"grad_norm": 1.34375,
"learning_rate": 0.0004935040700094959,
"loss": 5.5482,
"mean_token_accuracy": 0.16096032857894899,
"num_tokens": 18553363.0,
"step": 10050
},
{
"entropy": 5.609125709533691,
"epoch": 0.8447805082965764,
"grad_norm": 1.34375,
"learning_rate": 0.0004934969242009412,
"loss": 5.4781,
"mean_token_accuracy": 0.16653624624013902,
"num_tokens": 18562546.0,
"step": 10055
},
{
"entropy": 5.635621118545532,
"epoch": 0.8452005881117413,
"grad_norm": 1.3203125,
"learning_rate": 0.0004934897745218262,
"loss": 5.5598,
"mean_token_accuracy": 0.1550781711935997,
"num_tokens": 18572149.0,
"step": 10060
},
{
"entropy": 5.681614828109741,
"epoch": 0.8456206679269062,
"grad_norm": 1.34375,
"learning_rate": 0.0004934826209722772,
"loss": 5.42,
"mean_token_accuracy": 0.16919473558664322,
"num_tokens": 18580842.0,
"step": 10065
},
{
"entropy": 5.661826229095459,
"epoch": 0.8460407477420709,
"grad_norm": 1.390625,
"learning_rate": 0.0004934754635524211,
"loss": 5.5272,
"mean_token_accuracy": 0.15960330069065093,
"num_tokens": 18589765.0,
"step": 10070
},
{
"entropy": 5.664134693145752,
"epoch": 0.8464608275572358,
"grad_norm": 1.25,
"learning_rate": 0.0004934683022623847,
"loss": 5.5449,
"mean_token_accuracy": 0.15602806955575943,
"num_tokens": 18599532.0,
"step": 10075
},
{
"entropy": 5.598707628250122,
"epoch": 0.8468809073724007,
"grad_norm": 1.40625,
"learning_rate": 0.0004934611371022947,
"loss": 5.4394,
"mean_token_accuracy": 0.16422633677721024,
"num_tokens": 18608438.0,
"step": 10080
},
{
"entropy": 5.69066367149353,
"epoch": 0.8473009871875656,
"grad_norm": 1.4453125,
"learning_rate": 0.0004934539680722783,
"loss": 5.596,
"mean_token_accuracy": 0.15671277940273284,
"num_tokens": 18617313.0,
"step": 10085
},
{
"entropy": 5.6565409183502195,
"epoch": 0.8477210670027305,
"grad_norm": 2.171875,
"learning_rate": 0.0004934467951724622,
"loss": 5.4335,
"mean_token_accuracy": 0.1600039303302765,
"num_tokens": 18625880.0,
"step": 10090
},
{
"entropy": 5.683764934539795,
"epoch": 0.8481411468178954,
"grad_norm": 1.2890625,
"learning_rate": 0.0004934396184029737,
"loss": 5.5012,
"mean_token_accuracy": 0.1638655111193657,
"num_tokens": 18635727.0,
"step": 10095
},
{
"entropy": 5.644234943389892,
"epoch": 0.8485612266330603,
"grad_norm": 1.296875,
"learning_rate": 0.0004934324377639398,
"loss": 5.5692,
"mean_token_accuracy": 0.15685787945985794,
"num_tokens": 18645619.0,
"step": 10100
},
{
"entropy": 5.63275990486145,
"epoch": 0.8489813064482251,
"grad_norm": 1.4609375,
"learning_rate": 0.0004934252532554878,
"loss": 5.4684,
"mean_token_accuracy": 0.1624760612845421,
"num_tokens": 18654901.0,
"step": 10105
},
{
"entropy": 5.754931497573852,
"epoch": 0.84940138626339,
"grad_norm": 1.6796875,
"learning_rate": 0.0004934180648777449,
"loss": 5.7184,
"mean_token_accuracy": 0.15616895407438278,
"num_tokens": 18664523.0,
"step": 10110
},
{
"entropy": 5.690887594223023,
"epoch": 0.8498214660785549,
"grad_norm": 1.328125,
"learning_rate": 0.0004934108726308384,
"loss": 5.5559,
"mean_token_accuracy": 0.154204061627388,
"num_tokens": 18673685.0,
"step": 10115
},
{
"entropy": 5.676758527755737,
"epoch": 0.8502415458937198,
"grad_norm": 1.28125,
"learning_rate": 0.0004934036765148958,
"loss": 5.515,
"mean_token_accuracy": 0.15912752598524094,
"num_tokens": 18682889.0,
"step": 10120
},
{
"entropy": 5.700036525726318,
"epoch": 0.8506616257088847,
"grad_norm": 1.671875,
"learning_rate": 0.0004933964765300446,
"loss": 5.5619,
"mean_token_accuracy": 0.15744878649711608,
"num_tokens": 18692978.0,
"step": 10125
},
{
"entropy": 5.651540040969849,
"epoch": 0.8510817055240496,
"grad_norm": 1.25,
"learning_rate": 0.000493389272676412,
"loss": 5.485,
"mean_token_accuracy": 0.16274063736200334,
"num_tokens": 18701846.0,
"step": 10130
},
{
"entropy": 5.673055028915405,
"epoch": 0.8515017853392145,
"grad_norm": 1.3359375,
"learning_rate": 0.0004933820649541262,
"loss": 5.5244,
"mean_token_accuracy": 0.1706227630376816,
"num_tokens": 18711492.0,
"step": 10135
},
{
"entropy": 5.60514988899231,
"epoch": 0.8519218651543793,
"grad_norm": 1.4609375,
"learning_rate": 0.0004933748533633145,
"loss": 5.4485,
"mean_token_accuracy": 0.16744111329317093,
"num_tokens": 18720407.0,
"step": 10140
},
{
"entropy": 5.625371885299683,
"epoch": 0.8523419449695442,
"grad_norm": 1.2421875,
"learning_rate": 0.0004933676379041045,
"loss": 5.5183,
"mean_token_accuracy": 0.16600939780473709,
"num_tokens": 18729968.0,
"step": 10145
},
{
"entropy": 5.72891936302185,
"epoch": 0.8527620247847091,
"grad_norm": 1.3125,
"learning_rate": 0.0004933604185766245,
"loss": 5.6092,
"mean_token_accuracy": 0.151386359333992,
"num_tokens": 18739525.0,
"step": 10150
},
{
"entropy": 5.730268430709839,
"epoch": 0.853182104599874,
"grad_norm": 1.390625,
"learning_rate": 0.0004933531953810019,
"loss": 5.5,
"mean_token_accuracy": 0.16777922809123993,
"num_tokens": 18749087.0,
"step": 10155
},
{
"entropy": 5.7163464546203615,
"epoch": 0.8536021844150389,
"grad_norm": 1.296875,
"learning_rate": 0.0004933459683173652,
"loss": 5.5329,
"mean_token_accuracy": 0.16302687674760818,
"num_tokens": 18758174.0,
"step": 10160
},
{
"entropy": 5.709117221832275,
"epoch": 0.8540222642302038,
"grad_norm": 1.6171875,
"learning_rate": 0.0004933387373858418,
"loss": 5.5632,
"mean_token_accuracy": 0.16024067923426627,
"num_tokens": 18767679.0,
"step": 10165
},
{
"entropy": 5.670746326446533,
"epoch": 0.8544423440453687,
"grad_norm": 1.296875,
"learning_rate": 0.0004933315025865602,
"loss": 5.4889,
"mean_token_accuracy": 0.1621965780854225,
"num_tokens": 18776749.0,
"step": 10170
},
{
"entropy": 5.770751142501831,
"epoch": 0.8548624238605335,
"grad_norm": 1.390625,
"learning_rate": 0.0004933242639196485,
"loss": 5.6762,
"mean_token_accuracy": 0.1487576313316822,
"num_tokens": 18786313.0,
"step": 10175
},
{
"entropy": 5.775333738327026,
"epoch": 0.8552825036756984,
"grad_norm": 1.3046875,
"learning_rate": 0.0004933170213852348,
"loss": 5.55,
"mean_token_accuracy": 0.15689792037010192,
"num_tokens": 18795340.0,
"step": 10180
},
{
"entropy": 5.641742849349976,
"epoch": 0.8557025834908633,
"grad_norm": 1.328125,
"learning_rate": 0.0004933097749834476,
"loss": 5.4873,
"mean_token_accuracy": 0.1572565406560898,
"num_tokens": 18804114.0,
"step": 10185
},
{
"entropy": 5.663312005996704,
"epoch": 0.8561226633060282,
"grad_norm": 1.2421875,
"learning_rate": 0.000493302524714415,
"loss": 5.4998,
"mean_token_accuracy": 0.15901609212160112,
"num_tokens": 18813797.0,
"step": 10190
},
{
"entropy": 5.7054831981658936,
"epoch": 0.856542743121193,
"grad_norm": 1.2578125,
"learning_rate": 0.0004932952705782657,
"loss": 5.5388,
"mean_token_accuracy": 0.15331775918602944,
"num_tokens": 18822410.0,
"step": 10195
},
{
"entropy": 5.615892791748047,
"epoch": 0.856962822936358,
"grad_norm": 1.2421875,
"learning_rate": 0.000493288012575128,
"loss": 5.4726,
"mean_token_accuracy": 0.16814546436071395,
"num_tokens": 18832091.0,
"step": 10200
},
{
"entropy": 5.661955499649048,
"epoch": 0.8573829027515227,
"grad_norm": 1.1875,
"learning_rate": 0.0004932807507051307,
"loss": 5.5082,
"mean_token_accuracy": 0.1563362330198288,
"num_tokens": 18841298.0,
"step": 10205
},
{
"entropy": 5.606261587142944,
"epoch": 0.8578029825666876,
"grad_norm": 1.3828125,
"learning_rate": 0.0004932734849684022,
"loss": 5.4826,
"mean_token_accuracy": 0.1604735642671585,
"num_tokens": 18849683.0,
"step": 10210
},
{
"entropy": 5.6841357231140135,
"epoch": 0.8582230623818525,
"grad_norm": 1.390625,
"learning_rate": 0.0004932662153650712,
"loss": 5.4177,
"mean_token_accuracy": 0.16244781017303467,
"num_tokens": 18858832.0,
"step": 10215
},
{
"entropy": 5.587029647827149,
"epoch": 0.8586431421970174,
"grad_norm": 1.328125,
"learning_rate": 0.0004932589418952668,
"loss": 5.4653,
"mean_token_accuracy": 0.1638034462928772,
"num_tokens": 18867652.0,
"step": 10220
},
{
"entropy": 5.739847660064697,
"epoch": 0.8590632220121823,
"grad_norm": 1.2109375,
"learning_rate": 0.0004932516645591175,
"loss": 5.556,
"mean_token_accuracy": 0.16377656310796737,
"num_tokens": 18877282.0,
"step": 10225
},
{
"entropy": 5.71386079788208,
"epoch": 0.8594833018273472,
"grad_norm": 1.2265625,
"learning_rate": 0.0004932443833567524,
"loss": 5.6454,
"mean_token_accuracy": 0.15882862806320192,
"num_tokens": 18886565.0,
"step": 10230
},
{
"entropy": 5.699979639053344,
"epoch": 0.8599033816425121,
"grad_norm": 1.2734375,
"learning_rate": 0.0004932370982883003,
"loss": 5.592,
"mean_token_accuracy": 0.15794370770454408,
"num_tokens": 18896440.0,
"step": 10235
},
{
"entropy": 5.741853284835815,
"epoch": 0.8603234614576769,
"grad_norm": 1.2421875,
"learning_rate": 0.0004932298093538905,
"loss": 5.6174,
"mean_token_accuracy": 0.15673970580101013,
"num_tokens": 18906246.0,
"step": 10240
},
{
"entropy": 5.644249439239502,
"epoch": 0.8607435412728418,
"grad_norm": 1.171875,
"learning_rate": 0.000493222516553652,
"loss": 5.5138,
"mean_token_accuracy": 0.15427245274186135,
"num_tokens": 18915108.0,
"step": 10245
},
{
"entropy": 5.70051212310791,
"epoch": 0.8611636210880067,
"grad_norm": 1.34375,
"learning_rate": 0.0004932152198877139,
"loss": 5.5063,
"mean_token_accuracy": 0.16063451319932937,
"num_tokens": 18923664.0,
"step": 10250
},
{
"entropy": 5.675086784362793,
"epoch": 0.8615837009031716,
"grad_norm": 1.2890625,
"learning_rate": 0.0004932079193562057,
"loss": 5.6133,
"mean_token_accuracy": 0.15742011070251466,
"num_tokens": 18933496.0,
"step": 10255
},
{
"entropy": 5.652376413345337,
"epoch": 0.8620037807183365,
"grad_norm": 1.3203125,
"learning_rate": 0.0004932006149592564,
"loss": 5.4804,
"mean_token_accuracy": 0.16196195781230927,
"num_tokens": 18942222.0,
"step": 10260
},
{
"entropy": 5.731272411346436,
"epoch": 0.8624238605335014,
"grad_norm": 1.4609375,
"learning_rate": 0.0004931933066969957,
"loss": 5.4918,
"mean_token_accuracy": 0.1656545579433441,
"num_tokens": 18952057.0,
"step": 10265
},
{
"entropy": 5.618059682846069,
"epoch": 0.8628439403486663,
"grad_norm": 1.2734375,
"learning_rate": 0.0004931859945695528,
"loss": 5.5597,
"mean_token_accuracy": 0.15741539001464844,
"num_tokens": 18961664.0,
"step": 10270
},
{
"entropy": 5.5474916934967045,
"epoch": 0.8632640201638311,
"grad_norm": 1.328125,
"learning_rate": 0.0004931786785770575,
"loss": 5.3423,
"mean_token_accuracy": 0.17619887590408326,
"num_tokens": 18969900.0,
"step": 10275
},
{
"entropy": 5.721754789352417,
"epoch": 0.863684099978996,
"grad_norm": 1.453125,
"learning_rate": 0.0004931713587196392,
"loss": 5.6361,
"mean_token_accuracy": 0.15274747163057328,
"num_tokens": 18979286.0,
"step": 10280
},
{
"entropy": 5.7611936092376705,
"epoch": 0.8641041797941609,
"grad_norm": 1.1875,
"learning_rate": 0.0004931640349974275,
"loss": 5.5237,
"mean_token_accuracy": 0.15643700286746026,
"num_tokens": 18987553.0,
"step": 10285
},
{
"entropy": 5.683358573913575,
"epoch": 0.8645242596093258,
"grad_norm": 1.4609375,
"learning_rate": 0.0004931567074105524,
"loss": 5.6063,
"mean_token_accuracy": 0.16055196523666382,
"num_tokens": 18996354.0,
"step": 10290
},
{
"entropy": 5.577027750015259,
"epoch": 0.8649443394244907,
"grad_norm": 1.3203125,
"learning_rate": 0.0004931493759591435,
"loss": 5.482,
"mean_token_accuracy": 0.1618887558579445,
"num_tokens": 19005150.0,
"step": 10295
},
{
"entropy": 5.68660159111023,
"epoch": 0.8653644192396556,
"grad_norm": 1.4453125,
"learning_rate": 0.0004931420406433308,
"loss": 5.5006,
"mean_token_accuracy": 0.1582360938191414,
"num_tokens": 19014572.0,
"step": 10300
},
{
"entropy": 5.647102022171021,
"epoch": 0.8657844990548205,
"grad_norm": 1.328125,
"learning_rate": 0.000493134701463244,
"loss": 5.3617,
"mean_token_accuracy": 0.16853554248809816,
"num_tokens": 19023462.0,
"step": 10305
},
{
"entropy": 5.543448066711425,
"epoch": 0.8662045788699853,
"grad_norm": 1.3515625,
"learning_rate": 0.0004931273584190135,
"loss": 5.4538,
"mean_token_accuracy": 0.16720159947872162,
"num_tokens": 19032460.0,
"step": 10310
},
{
"entropy": 5.588135385513306,
"epoch": 0.8666246586851502,
"grad_norm": 1.25,
"learning_rate": 0.0004931200115107691,
"loss": 5.4846,
"mean_token_accuracy": 0.16485581398010254,
"num_tokens": 19041734.0,
"step": 10315
},
{
"entropy": 5.6352677822113035,
"epoch": 0.867044738500315,
"grad_norm": 1.2578125,
"learning_rate": 0.000493112660738641,
"loss": 5.477,
"mean_token_accuracy": 0.1592483252286911,
"num_tokens": 19050867.0,
"step": 10320
},
{
"entropy": 5.646694040298462,
"epoch": 0.86746481831548,
"grad_norm": 1.3125,
"learning_rate": 0.0004931053061027594,
"loss": 5.4616,
"mean_token_accuracy": 0.16102246344089508,
"num_tokens": 19060518.0,
"step": 10325
},
{
"entropy": 5.634038686752319,
"epoch": 0.8678848981306448,
"grad_norm": 1.21875,
"learning_rate": 0.0004930979476032546,
"loss": 5.4614,
"mean_token_accuracy": 0.164410237967968,
"num_tokens": 19069588.0,
"step": 10330
},
{
"entropy": 5.701401090621948,
"epoch": 0.8683049779458097,
"grad_norm": 1.4765625,
"learning_rate": 0.000493090585240257,
"loss": 5.5073,
"mean_token_accuracy": 0.1488155022263527,
"num_tokens": 19079060.0,
"step": 10335
},
{
"entropy": 5.603839588165283,
"epoch": 0.8687250577609746,
"grad_norm": 1.28125,
"learning_rate": 0.0004930832190138969,
"loss": 5.4475,
"mean_token_accuracy": 0.15975930094718932,
"num_tokens": 19087721.0,
"step": 10340
},
{
"entropy": 5.676361989974976,
"epoch": 0.8691451375761394,
"grad_norm": 1.28125,
"learning_rate": 0.000493075848924305,
"loss": 5.5022,
"mean_token_accuracy": 0.15760902911424637,
"num_tokens": 19096800.0,
"step": 10345
},
{
"entropy": 5.675476932525635,
"epoch": 0.8695652173913043,
"grad_norm": 1.2265625,
"learning_rate": 0.0004930684749716117,
"loss": 5.5605,
"mean_token_accuracy": 0.1568011909723282,
"num_tokens": 19106774.0,
"step": 10350
},
{
"entropy": 5.688272190093994,
"epoch": 0.8699852972064692,
"grad_norm": 1.3671875,
"learning_rate": 0.0004930610971559476,
"loss": 5.4826,
"mean_token_accuracy": 0.16367388367652894,
"num_tokens": 19116413.0,
"step": 10355
},
{
"entropy": 5.701381874084473,
"epoch": 0.8704053770216341,
"grad_norm": 1.265625,
"learning_rate": 0.0004930537154774436,
"loss": 5.5157,
"mean_token_accuracy": 0.159402497112751,
"num_tokens": 19125363.0,
"step": 10360
},
{
"entropy": 5.671617269515991,
"epoch": 0.870825456836799,
"grad_norm": 1.2421875,
"learning_rate": 0.0004930463299362302,
"loss": 5.5974,
"mean_token_accuracy": 0.15096435695886612,
"num_tokens": 19135461.0,
"step": 10365
},
{
"entropy": 5.698200702667236,
"epoch": 0.8712455366519639,
"grad_norm": 1.390625,
"learning_rate": 0.0004930389405324383,
"loss": 5.4737,
"mean_token_accuracy": 0.16709906607866287,
"num_tokens": 19144085.0,
"step": 10370
},
{
"entropy": 5.6751556396484375,
"epoch": 0.8716656164671287,
"grad_norm": 1.375,
"learning_rate": 0.0004930315472661987,
"loss": 5.4761,
"mean_token_accuracy": 0.16794967502355576,
"num_tokens": 19153291.0,
"step": 10375
},
{
"entropy": 5.637331962585449,
"epoch": 0.8720856962822936,
"grad_norm": 1.265625,
"learning_rate": 0.0004930241501376428,
"loss": 5.4874,
"mean_token_accuracy": 0.16048841327428817,
"num_tokens": 19163514.0,
"step": 10380
},
{
"entropy": 5.541778802871704,
"epoch": 0.8725057760974585,
"grad_norm": 1.453125,
"learning_rate": 0.0004930167491469013,
"loss": 5.3944,
"mean_token_accuracy": 0.16507934480905534,
"num_tokens": 19172103.0,
"step": 10385
},
{
"entropy": 5.616076421737671,
"epoch": 0.8729258559126234,
"grad_norm": 1.4296875,
"learning_rate": 0.0004930093442941053,
"loss": 5.4627,
"mean_token_accuracy": 0.16135224401950837,
"num_tokens": 19180893.0,
"step": 10390
},
{
"entropy": 5.616441106796264,
"epoch": 0.8733459357277883,
"grad_norm": 1.3046875,
"learning_rate": 0.0004930019355793858,
"loss": 5.3856,
"mean_token_accuracy": 0.16465070396661757,
"num_tokens": 19190495.0,
"step": 10395
},
{
"entropy": 5.613911056518555,
"epoch": 0.8737660155429532,
"grad_norm": 1.2734375,
"learning_rate": 0.0004929945230028746,
"loss": 5.4796,
"mean_token_accuracy": 0.16515476107597352,
"num_tokens": 19198988.0,
"step": 10400
},
{
"entropy": 5.563373661041259,
"epoch": 0.8741860953581181,
"grad_norm": 1.25,
"learning_rate": 0.0004929871065647024,
"loss": 5.3916,
"mean_token_accuracy": 0.1638674646615982,
"num_tokens": 19208014.0,
"step": 10405
},
{
"entropy": 5.694887781143189,
"epoch": 0.8746061751732829,
"grad_norm": 1.3046875,
"learning_rate": 0.0004929796862650011,
"loss": 5.5893,
"mean_token_accuracy": 0.1591682493686676,
"num_tokens": 19218220.0,
"step": 10410
},
{
"entropy": 5.64337854385376,
"epoch": 0.8750262549884478,
"grad_norm": 1.34375,
"learning_rate": 0.0004929722621039018,
"loss": 5.4937,
"mean_token_accuracy": 0.16073654294013978,
"num_tokens": 19227176.0,
"step": 10415
},
{
"entropy": 5.585389709472656,
"epoch": 0.8754463348036127,
"grad_norm": 1.453125,
"learning_rate": 0.0004929648340815362,
"loss": 5.5006,
"mean_token_accuracy": 0.15821082442998885,
"num_tokens": 19236085.0,
"step": 10420
},
{
"entropy": 5.637318229675293,
"epoch": 0.8758664146187776,
"grad_norm": 1.125,
"learning_rate": 0.0004929574021980355,
"loss": 5.5567,
"mean_token_accuracy": 0.1564241960644722,
"num_tokens": 19246671.0,
"step": 10425
},
{
"entropy": 5.614260578155518,
"epoch": 0.8762864944339425,
"grad_norm": 1.2734375,
"learning_rate": 0.0004929499664535319,
"loss": 5.4661,
"mean_token_accuracy": 0.15873886793851852,
"num_tokens": 19256321.0,
"step": 10430
},
{
"entropy": 5.670866823196411,
"epoch": 0.8767065742491074,
"grad_norm": 1.5234375,
"learning_rate": 0.0004929425268481569,
"loss": 5.4349,
"mean_token_accuracy": 0.16390531361103058,
"num_tokens": 19265518.0,
"step": 10435
},
{
"entropy": 5.6090500831604,
"epoch": 0.8771266540642723,
"grad_norm": 1.296875,
"learning_rate": 0.0004929350833820422,
"loss": 5.4446,
"mean_token_accuracy": 0.1625059276819229,
"num_tokens": 19274120.0,
"step": 10440
},
{
"entropy": 5.675984525680542,
"epoch": 0.877546733879437,
"grad_norm": 1.3828125,
"learning_rate": 0.0004929276360553197,
"loss": 5.5116,
"mean_token_accuracy": 0.16215913146734237,
"num_tokens": 19284377.0,
"step": 10445
},
{
"entropy": 5.569582605361939,
"epoch": 0.8779668136946019,
"grad_norm": 1.2578125,
"learning_rate": 0.0004929201848681213,
"loss": 5.3817,
"mean_token_accuracy": 0.1601586326956749,
"num_tokens": 19293326.0,
"step": 10450
},
{
"entropy": 5.556849384307862,
"epoch": 0.8783868935097668,
"grad_norm": 1.3515625,
"learning_rate": 0.0004929127298205792,
"loss": 5.4279,
"mean_token_accuracy": 0.17152911871671678,
"num_tokens": 19302086.0,
"step": 10455
},
{
"entropy": 5.7007475852966305,
"epoch": 0.8788069733249317,
"grad_norm": 1.3515625,
"learning_rate": 0.0004929052709128251,
"loss": 5.4637,
"mean_token_accuracy": 0.1675509050488472,
"num_tokens": 19310124.0,
"step": 10460
},
{
"entropy": 5.588010692596436,
"epoch": 0.8792270531400966,
"grad_norm": 1.3828125,
"learning_rate": 0.0004928978081449914,
"loss": 5.4754,
"mean_token_accuracy": 0.16137903779745102,
"num_tokens": 19321269.0,
"step": 10465
},
{
"entropy": 5.652129220962524,
"epoch": 0.8796471329552615,
"grad_norm": 1.5234375,
"learning_rate": 0.0004928903415172103,
"loss": 5.5049,
"mean_token_accuracy": 0.163689923286438,
"num_tokens": 19330390.0,
"step": 10470
},
{
"entropy": 5.647299528121948,
"epoch": 0.8800672127704264,
"grad_norm": 1.2734375,
"learning_rate": 0.000492882871029614,
"loss": 5.4939,
"mean_token_accuracy": 0.1618855006992817,
"num_tokens": 19339457.0,
"step": 10475
},
{
"entropy": 5.659735584259034,
"epoch": 0.8804872925855912,
"grad_norm": 1.296875,
"learning_rate": 0.0004928753966823348,
"loss": 5.5482,
"mean_token_accuracy": 0.15688573122024535,
"num_tokens": 19348710.0,
"step": 10480
},
{
"entropy": 5.697210121154785,
"epoch": 0.8809073724007561,
"grad_norm": 1.46875,
"learning_rate": 0.0004928679184755051,
"loss": 5.5815,
"mean_token_accuracy": 0.16629291623830794,
"num_tokens": 19357215.0,
"step": 10485
},
{
"entropy": 5.661016321182251,
"epoch": 0.881327452215921,
"grad_norm": 1.3984375,
"learning_rate": 0.0004928604364092574,
"loss": 5.5224,
"mean_token_accuracy": 0.16313585191965102,
"num_tokens": 19366043.0,
"step": 10490
},
{
"entropy": 5.675441455841065,
"epoch": 0.8817475320310859,
"grad_norm": 1.28125,
"learning_rate": 0.0004928529504837243,
"loss": 5.5815,
"mean_token_accuracy": 0.15857910960912705,
"num_tokens": 19375468.0,
"step": 10495
},
{
"entropy": 5.799028491973877,
"epoch": 0.8821676118462508,
"grad_norm": 1.2734375,
"learning_rate": 0.0004928454606990383,
"loss": 5.4609,
"mean_token_accuracy": 0.1582179293036461,
"num_tokens": 19384467.0,
"step": 10500
},
{
"entropy": 5.642581987380981,
"epoch": 0.8825876916614157,
"grad_norm": 1.296875,
"learning_rate": 0.0004928379670553322,
"loss": 5.5019,
"mean_token_accuracy": 0.1664658784866333,
"num_tokens": 19393618.0,
"step": 10505
},
{
"entropy": 5.677177000045776,
"epoch": 0.8830077714765806,
"grad_norm": 1.390625,
"learning_rate": 0.0004928304695527387,
"loss": 5.565,
"mean_token_accuracy": 0.160389643907547,
"num_tokens": 19402921.0,
"step": 10510
},
{
"entropy": 5.731782722473144,
"epoch": 0.8834278512917454,
"grad_norm": 1.359375,
"learning_rate": 0.0004928229681913905,
"loss": 5.5457,
"mean_token_accuracy": 0.16324697136878968,
"num_tokens": 19412048.0,
"step": 10515
},
{
"entropy": 5.733268880844117,
"epoch": 0.8838479311069103,
"grad_norm": 1.3203125,
"learning_rate": 0.0004928154629714207,
"loss": 5.5293,
"mean_token_accuracy": 0.16145433783531188,
"num_tokens": 19420993.0,
"step": 10520
},
{
"entropy": 5.6000950813293455,
"epoch": 0.8842680109220752,
"grad_norm": 1.25,
"learning_rate": 0.000492807953892962,
"loss": 5.5009,
"mean_token_accuracy": 0.15950000733137132,
"num_tokens": 19430145.0,
"step": 10525
},
{
"entropy": 5.610482597351075,
"epoch": 0.8846880907372401,
"grad_norm": 1.2890625,
"learning_rate": 0.0004928004409561476,
"loss": 5.4,
"mean_token_accuracy": 0.16498225480318068,
"num_tokens": 19438918.0,
"step": 10530
},
{
"entropy": 5.56955246925354,
"epoch": 0.885108170552405,
"grad_norm": 1.265625,
"learning_rate": 0.0004927929241611106,
"loss": 5.4581,
"mean_token_accuracy": 0.16881955415010452,
"num_tokens": 19448490.0,
"step": 10535
},
{
"entropy": 5.623895931243896,
"epoch": 0.8855282503675699,
"grad_norm": 1.2890625,
"learning_rate": 0.000492785403507984,
"loss": 5.5227,
"mean_token_accuracy": 0.1571098670363426,
"num_tokens": 19457098.0,
"step": 10540
},
{
"entropy": 5.634610605239868,
"epoch": 0.8859483301827347,
"grad_norm": 1.5078125,
"learning_rate": 0.0004927778789969012,
"loss": 5.5103,
"mean_token_accuracy": 0.1569845736026764,
"num_tokens": 19466419.0,
"step": 10545
},
{
"entropy": 5.628296756744385,
"epoch": 0.8863684099978996,
"grad_norm": 1.3515625,
"learning_rate": 0.0004927703506279955,
"loss": 5.5628,
"mean_token_accuracy": 0.15323089063167572,
"num_tokens": 19475882.0,
"step": 10550
},
{
"entropy": 5.80163049697876,
"epoch": 0.8867884898130645,
"grad_norm": 1.2421875,
"learning_rate": 0.0004927628184014,
"loss": 5.5953,
"mean_token_accuracy": 0.159140382707119,
"num_tokens": 19485917.0,
"step": 10555
},
{
"entropy": 5.71609354019165,
"epoch": 0.8872085696282294,
"grad_norm": 1.25,
"learning_rate": 0.0004927552823172483,
"loss": 5.5337,
"mean_token_accuracy": 0.15823422372341156,
"num_tokens": 19494984.0,
"step": 10560
},
{
"entropy": 5.688808917999268,
"epoch": 0.8876286494433943,
"grad_norm": 1.3125,
"learning_rate": 0.000492747742375674,
"loss": 5.4744,
"mean_token_accuracy": 0.16675125807523727,
"num_tokens": 19504087.0,
"step": 10565
},
{
"entropy": 5.661811017990113,
"epoch": 0.8880487292585592,
"grad_norm": 1.296875,
"learning_rate": 0.0004927401985768106,
"loss": 5.4992,
"mean_token_accuracy": 0.1689751848578453,
"num_tokens": 19512880.0,
"step": 10570
},
{
"entropy": 5.592242431640625,
"epoch": 0.888468809073724,
"grad_norm": 1.3515625,
"learning_rate": 0.0004927326509207915,
"loss": 5.4881,
"mean_token_accuracy": 0.16463636606931686,
"num_tokens": 19521723.0,
"step": 10575
},
{
"entropy": 5.759501934051514,
"epoch": 0.8888888888888888,
"grad_norm": 1.53125,
"learning_rate": 0.0004927250994077508,
"loss": 5.5706,
"mean_token_accuracy": 0.16017245650291442,
"num_tokens": 19531352.0,
"step": 10580
},
{
"entropy": 5.75401029586792,
"epoch": 0.8893089687040537,
"grad_norm": 1.4296875,
"learning_rate": 0.000492717544037822,
"loss": 5.6714,
"mean_token_accuracy": 0.16674605160951614,
"num_tokens": 19540943.0,
"step": 10585
},
{
"entropy": 5.614908599853516,
"epoch": 0.8897290485192186,
"grad_norm": 1.375,
"learning_rate": 0.000492709984811139,
"loss": 5.4374,
"mean_token_accuracy": 0.16738452166318893,
"num_tokens": 19550527.0,
"step": 10590
},
{
"entropy": 5.6367579936981205,
"epoch": 0.8901491283343835,
"grad_norm": 1.328125,
"learning_rate": 0.0004927024217278358,
"loss": 5.4179,
"mean_token_accuracy": 0.17424334436655045,
"num_tokens": 19559746.0,
"step": 10595
},
{
"entropy": 5.73973593711853,
"epoch": 0.8905692081495484,
"grad_norm": 1.265625,
"learning_rate": 0.0004926948547880462,
"loss": 5.6008,
"mean_token_accuracy": 0.15189019665122033,
"num_tokens": 19569286.0,
"step": 10600
},
{
"entropy": 5.646059894561768,
"epoch": 0.8909892879647133,
"grad_norm": 1.6015625,
"learning_rate": 0.0004926872839919044,
"loss": 5.4738,
"mean_token_accuracy": 0.1616295322775841,
"num_tokens": 19578245.0,
"step": 10605
},
{
"entropy": 5.602351570129395,
"epoch": 0.8914093677798782,
"grad_norm": 1.3203125,
"learning_rate": 0.0004926797093395446,
"loss": 5.4431,
"mean_token_accuracy": 0.16678168177604674,
"num_tokens": 19587244.0,
"step": 10610
},
{
"entropy": 5.681618118286133,
"epoch": 0.891829447595043,
"grad_norm": 1.2265625,
"learning_rate": 0.0004926721308311006,
"loss": 5.5176,
"mean_token_accuracy": 0.16693350076675414,
"num_tokens": 19596932.0,
"step": 10615
},
{
"entropy": 5.768650531768799,
"epoch": 0.8922495274102079,
"grad_norm": 1.34375,
"learning_rate": 0.0004926645484667069,
"loss": 5.6125,
"mean_token_accuracy": 0.15693477988243104,
"num_tokens": 19606256.0,
"step": 10620
},
{
"entropy": 5.757310009002685,
"epoch": 0.8926696072253728,
"grad_norm": 1.296875,
"learning_rate": 0.0004926569622464979,
"loss": 5.6243,
"mean_token_accuracy": 0.16205553114414215,
"num_tokens": 19615726.0,
"step": 10625
},
{
"entropy": 5.676463174819946,
"epoch": 0.8930896870405377,
"grad_norm": 1.3828125,
"learning_rate": 0.0004926493721706079,
"loss": 5.4859,
"mean_token_accuracy": 0.16073908805847167,
"num_tokens": 19624037.0,
"step": 10630
},
{
"entropy": 5.684830141067505,
"epoch": 0.8935097668557026,
"grad_norm": 1.625,
"learning_rate": 0.0004926417782391713,
"loss": 5.4641,
"mean_token_accuracy": 0.16800618469715117,
"num_tokens": 19632882.0,
"step": 10635
},
{
"entropy": 5.714396047592163,
"epoch": 0.8939298466708675,
"grad_norm": 1.3984375,
"learning_rate": 0.0004926341804523227,
"loss": 5.5903,
"mean_token_accuracy": 0.1597953498363495,
"num_tokens": 19642686.0,
"step": 10640
},
{
"entropy": 5.656919050216675,
"epoch": 0.8943499264860324,
"grad_norm": 1.4765625,
"learning_rate": 0.0004926265788101966,
"loss": 5.5042,
"mean_token_accuracy": 0.16232740730047227,
"num_tokens": 19651380.0,
"step": 10645
},
{
"entropy": 5.629188060760498,
"epoch": 0.8947700063011972,
"grad_norm": 1.296875,
"learning_rate": 0.0004926189733129278,
"loss": 5.4335,
"mean_token_accuracy": 0.16030899584293365,
"num_tokens": 19660136.0,
"step": 10650
},
{
"entropy": 5.602889823913574,
"epoch": 0.8951900861163621,
"grad_norm": 1.375,
"learning_rate": 0.0004926113639606509,
"loss": 5.4632,
"mean_token_accuracy": 0.17339837849140166,
"num_tokens": 19669146.0,
"step": 10655
},
{
"entropy": 5.749970960617065,
"epoch": 0.895610165931527,
"grad_norm": 1.3046875,
"learning_rate": 0.0004926037507535008,
"loss": 5.5953,
"mean_token_accuracy": 0.1565776377916336,
"num_tokens": 19678627.0,
"step": 10660
},
{
"entropy": 5.68851056098938,
"epoch": 0.8960302457466919,
"grad_norm": 1.46875,
"learning_rate": 0.0004925961336916122,
"loss": 5.5335,
"mean_token_accuracy": 0.16348532289266587,
"num_tokens": 19688033.0,
"step": 10665
},
{
"entropy": 5.685422372817993,
"epoch": 0.8964503255618568,
"grad_norm": 1.3203125,
"learning_rate": 0.0004925885127751202,
"loss": 5.5441,
"mean_token_accuracy": 0.1624837875366211,
"num_tokens": 19696523.0,
"step": 10670
},
{
"entropy": 5.724574613571167,
"epoch": 0.8968704053770217,
"grad_norm": 1.28125,
"learning_rate": 0.0004925808880041596,
"loss": 5.4653,
"mean_token_accuracy": 0.1627603441476822,
"num_tokens": 19706339.0,
"step": 10675
},
{
"entropy": 5.6943115234375,
"epoch": 0.8972904851921865,
"grad_norm": 1.53125,
"learning_rate": 0.0004925732593788658,
"loss": 5.4997,
"mean_token_accuracy": 0.15886156111955643,
"num_tokens": 19714779.0,
"step": 10680
},
{
"entropy": 5.638199138641357,
"epoch": 0.8977105650073514,
"grad_norm": 1.4296875,
"learning_rate": 0.0004925656268993737,
"loss": 5.5523,
"mean_token_accuracy": 0.16095577031373978,
"num_tokens": 19723727.0,
"step": 10685
},
{
"entropy": 5.653303480148315,
"epoch": 0.8981306448225163,
"grad_norm": 1.265625,
"learning_rate": 0.0004925579905658185,
"loss": 5.5389,
"mean_token_accuracy": 0.16392706334590912,
"num_tokens": 19732783.0,
"step": 10690
},
{
"entropy": 5.677282476425171,
"epoch": 0.8985507246376812,
"grad_norm": 1.2578125,
"learning_rate": 0.0004925503503783355,
"loss": 5.4975,
"mean_token_accuracy": 0.15634912848472596,
"num_tokens": 19741268.0,
"step": 10695
},
{
"entropy": 5.6636802673339846,
"epoch": 0.898970804452846,
"grad_norm": 1.34375,
"learning_rate": 0.0004925427063370601,
"loss": 5.4285,
"mean_token_accuracy": 0.16585467010736465,
"num_tokens": 19751490.0,
"step": 10700
},
{
"entropy": 5.662887954711914,
"epoch": 0.899390884268011,
"grad_norm": 1.46875,
"learning_rate": 0.0004925350584421278,
"loss": 5.4835,
"mean_token_accuracy": 0.16281237900257112,
"num_tokens": 19760487.0,
"step": 10705
},
{
"entropy": 5.680135250091553,
"epoch": 0.8998109640831758,
"grad_norm": 1.3046875,
"learning_rate": 0.0004925274066936738,
"loss": 5.4377,
"mean_token_accuracy": 0.16982365995645524,
"num_tokens": 19768984.0,
"step": 10710
},
{
"entropy": 5.600447034835815,
"epoch": 0.9002310438983406,
"grad_norm": 1.25,
"learning_rate": 0.0004925197510918339,
"loss": 5.4358,
"mean_token_accuracy": 0.1686476871371269,
"num_tokens": 19778335.0,
"step": 10715
},
{
"entropy": 5.654651784896851,
"epoch": 0.9006511237135055,
"grad_norm": 1.3671875,
"learning_rate": 0.0004925120916367435,
"loss": 5.5742,
"mean_token_accuracy": 0.1537408173084259,
"num_tokens": 19789082.0,
"step": 10720
},
{
"entropy": 5.627911615371704,
"epoch": 0.9010712035286704,
"grad_norm": 1.3125,
"learning_rate": 0.0004925044283285384,
"loss": 5.3205,
"mean_token_accuracy": 0.17863296270370482,
"num_tokens": 19797902.0,
"step": 10725
},
{
"entropy": 5.527833461761475,
"epoch": 0.9014912833438353,
"grad_norm": 1.375,
"learning_rate": 0.0004924967611673544,
"loss": 5.4749,
"mean_token_accuracy": 0.16872860938310624,
"num_tokens": 19806481.0,
"step": 10730
},
{
"entropy": 5.541446256637573,
"epoch": 0.9019113631590002,
"grad_norm": 1.328125,
"learning_rate": 0.0004924890901533273,
"loss": 5.3689,
"mean_token_accuracy": 0.17880219370126724,
"num_tokens": 19815226.0,
"step": 10735
},
{
"entropy": 5.774983835220337,
"epoch": 0.9023314429741651,
"grad_norm": 1.40625,
"learning_rate": 0.0004924814152865929,
"loss": 5.5806,
"mean_token_accuracy": 0.15592661499977112,
"num_tokens": 19824577.0,
"step": 10740
},
{
"entropy": 5.661573457717895,
"epoch": 0.90275152278933,
"grad_norm": 1.3828125,
"learning_rate": 0.0004924737365672873,
"loss": 5.5073,
"mean_token_accuracy": 0.16041582226753234,
"num_tokens": 19832936.0,
"step": 10745
},
{
"entropy": 5.730201482772827,
"epoch": 0.9031716026044948,
"grad_norm": 1.40625,
"learning_rate": 0.0004924660539955463,
"loss": 5.6387,
"mean_token_accuracy": 0.1655229866504669,
"num_tokens": 19841946.0,
"step": 10750
},
{
"entropy": 5.709415626525879,
"epoch": 0.9035916824196597,
"grad_norm": 1.2421875,
"learning_rate": 0.0004924583675715063,
"loss": 5.5192,
"mean_token_accuracy": 0.16042786836624146,
"num_tokens": 19851469.0,
"step": 10755
},
{
"entropy": 5.729109477996826,
"epoch": 0.9040117622348246,
"grad_norm": 1.3984375,
"learning_rate": 0.0004924506772953031,
"loss": 5.5905,
"mean_token_accuracy": 0.15771916061639785,
"num_tokens": 19860731.0,
"step": 10760
},
{
"entropy": 5.630281686782837,
"epoch": 0.9044318420499895,
"grad_norm": 1.671875,
"learning_rate": 0.0004924429831670733,
"loss": 5.5977,
"mean_token_accuracy": 0.15505731701850892,
"num_tokens": 19869717.0,
"step": 10765
},
{
"entropy": 5.670056867599487,
"epoch": 0.9048519218651544,
"grad_norm": 1.3984375,
"learning_rate": 0.000492435285186953,
"loss": 5.5315,
"mean_token_accuracy": 0.16062338650226593,
"num_tokens": 19879229.0,
"step": 10770
},
{
"entropy": 5.7497491359710695,
"epoch": 0.9052720016803193,
"grad_norm": 1.390625,
"learning_rate": 0.0004924275833550785,
"loss": 5.5242,
"mean_token_accuracy": 0.1601421967148781,
"num_tokens": 19888260.0,
"step": 10775
},
{
"entropy": 5.748339319229126,
"epoch": 0.9056920814954842,
"grad_norm": 1.4453125,
"learning_rate": 0.0004924198776715865,
"loss": 5.5553,
"mean_token_accuracy": 0.1663961872458458,
"num_tokens": 19897070.0,
"step": 10780
},
{
"entropy": 5.717373561859131,
"epoch": 0.906112161310649,
"grad_norm": 1.296875,
"learning_rate": 0.0004924121681366132,
"loss": 5.516,
"mean_token_accuracy": 0.1547970689833164,
"num_tokens": 19907170.0,
"step": 10785
},
{
"entropy": 5.6512758255004885,
"epoch": 0.9065322411258139,
"grad_norm": 1.4375,
"learning_rate": 0.0004924044547502951,
"loss": 5.477,
"mean_token_accuracy": 0.15937758833169938,
"num_tokens": 19917220.0,
"step": 10790
},
{
"entropy": 5.606697940826416,
"epoch": 0.9069523209409788,
"grad_norm": 1.265625,
"learning_rate": 0.0004923967375127692,
"loss": 5.5453,
"mean_token_accuracy": 0.16532299369573594,
"num_tokens": 19926724.0,
"step": 10795
},
{
"entropy": 5.764872741699219,
"epoch": 0.9073724007561437,
"grad_norm": 1.3515625,
"learning_rate": 0.000492389016424172,
"loss": 5.6433,
"mean_token_accuracy": 0.15729621946811675,
"num_tokens": 19936429.0,
"step": 10800
},
{
"entropy": 5.627816200256348,
"epoch": 0.9077924805713086,
"grad_norm": 1.3125,
"learning_rate": 0.0004923812914846404,
"loss": 5.4148,
"mean_token_accuracy": 0.17244187742471695,
"num_tokens": 19945096.0,
"step": 10805
},
{
"entropy": 5.617258310317993,
"epoch": 0.9082125603864735,
"grad_norm": 1.4375,
"learning_rate": 0.0004923735626943111,
"loss": 5.5084,
"mean_token_accuracy": 0.1648621678352356,
"num_tokens": 19953560.0,
"step": 10810
},
{
"entropy": 5.658634662628174,
"epoch": 0.9086326402016384,
"grad_norm": 1.3359375,
"learning_rate": 0.0004923658300533211,
"loss": 5.4752,
"mean_token_accuracy": 0.16002987921237946,
"num_tokens": 19962669.0,
"step": 10815
},
{
"entropy": 5.701725196838379,
"epoch": 0.9090527200168032,
"grad_norm": 1.421875,
"learning_rate": 0.0004923580935618073,
"loss": 5.5337,
"mean_token_accuracy": 0.16278491616249086,
"num_tokens": 19971990.0,
"step": 10820
},
{
"entropy": 5.643719911575317,
"epoch": 0.909472799831968,
"grad_norm": 1.3359375,
"learning_rate": 0.0004923503532199069,
"loss": 5.5083,
"mean_token_accuracy": 0.16564737558364867,
"num_tokens": 19981850.0,
"step": 10825
},
{
"entropy": 5.67440276145935,
"epoch": 0.909892879647133,
"grad_norm": 1.296875,
"learning_rate": 0.0004923426090277567,
"loss": 5.5631,
"mean_token_accuracy": 0.15856756269931793,
"num_tokens": 19991574.0,
"step": 10830
},
{
"entropy": 5.6700526714324955,
"epoch": 0.9103129594622978,
"grad_norm": 1.40625,
"learning_rate": 0.0004923348609854943,
"loss": 5.5267,
"mean_token_accuracy": 0.16576842218637466,
"num_tokens": 20001392.0,
"step": 10835
},
{
"entropy": 5.65757007598877,
"epoch": 0.9107330392774627,
"grad_norm": 1.34375,
"learning_rate": 0.0004923271090932566,
"loss": 5.5644,
"mean_token_accuracy": 0.15540991723537445,
"num_tokens": 20011277.0,
"step": 10840
},
{
"entropy": 5.6370244979858395,
"epoch": 0.9111531190926276,
"grad_norm": 1.3671875,
"learning_rate": 0.0004923193533511812,
"loss": 5.4852,
"mean_token_accuracy": 0.16075857132673263,
"num_tokens": 20021171.0,
"step": 10845
},
{
"entropy": 5.814991569519043,
"epoch": 0.9115731989077924,
"grad_norm": 1.4453125,
"learning_rate": 0.0004923115937594053,
"loss": 5.5416,
"mean_token_accuracy": 0.163661889731884,
"num_tokens": 20030189.0,
"step": 10850
},
{
"entropy": 5.692598056793213,
"epoch": 0.9119932787229573,
"grad_norm": 1.453125,
"learning_rate": 0.0004923038303180664,
"loss": 5.5171,
"mean_token_accuracy": 0.16363249719142914,
"num_tokens": 20038287.0,
"step": 10855
},
{
"entropy": 5.63488359451294,
"epoch": 0.9124133585381222,
"grad_norm": 1.3359375,
"learning_rate": 0.000492296063027302,
"loss": 5.5518,
"mean_token_accuracy": 0.15580225586891175,
"num_tokens": 20047653.0,
"step": 10860
},
{
"entropy": 5.688053321838379,
"epoch": 0.9128334383532871,
"grad_norm": 1.4453125,
"learning_rate": 0.0004922882918872498,
"loss": 5.5501,
"mean_token_accuracy": 0.15726881623268127,
"num_tokens": 20057415.0,
"step": 10865
},
{
"entropy": 5.78084363937378,
"epoch": 0.913253518168452,
"grad_norm": 1.46875,
"learning_rate": 0.0004922805168980475,
"loss": 5.5616,
"mean_token_accuracy": 0.16048648655414582,
"num_tokens": 20065996.0,
"step": 10870
},
{
"entropy": 5.706553649902344,
"epoch": 0.9136735979836169,
"grad_norm": 1.28125,
"learning_rate": 0.0004922727380598326,
"loss": 5.4904,
"mean_token_accuracy": 0.15912986695766448,
"num_tokens": 20075376.0,
"step": 10875
},
{
"entropy": 5.614659214019776,
"epoch": 0.9140936777987818,
"grad_norm": 1.3984375,
"learning_rate": 0.000492264955372743,
"loss": 5.5293,
"mean_token_accuracy": 0.15426042973995208,
"num_tokens": 20084950.0,
"step": 10880
},
{
"entropy": 5.729830312728882,
"epoch": 0.9145137576139466,
"grad_norm": 1.65625,
"learning_rate": 0.0004922571688369165,
"loss": 5.5176,
"mean_token_accuracy": 0.16088814586400985,
"num_tokens": 20094011.0,
"step": 10885
},
{
"entropy": 5.693281841278076,
"epoch": 0.9149338374291115,
"grad_norm": 1.375,
"learning_rate": 0.0004922493784524914,
"loss": 5.489,
"mean_token_accuracy": 0.16421934738755226,
"num_tokens": 20103037.0,
"step": 10890
},
{
"entropy": 5.709012699127197,
"epoch": 0.9153539172442764,
"grad_norm": 1.4921875,
"learning_rate": 0.0004922415842196052,
"loss": 5.6312,
"mean_token_accuracy": 0.15036728978157043,
"num_tokens": 20112727.0,
"step": 10895
},
{
"entropy": 5.627096223831177,
"epoch": 0.9157739970594413,
"grad_norm": 1.6796875,
"learning_rate": 0.0004922337861383963,
"loss": 5.4382,
"mean_token_accuracy": 0.16880789548158645,
"num_tokens": 20122341.0,
"step": 10900
},
{
"entropy": 5.691968965530395,
"epoch": 0.9161940768746062,
"grad_norm": 1.421875,
"learning_rate": 0.0004922259842090027,
"loss": 5.4329,
"mean_token_accuracy": 0.16247511506080628,
"num_tokens": 20131354.0,
"step": 10905
},
{
"entropy": 5.6311639785766605,
"epoch": 0.9166141566897711,
"grad_norm": 1.46875,
"learning_rate": 0.0004922181784315627,
"loss": 5.4611,
"mean_token_accuracy": 0.16493925154209138,
"num_tokens": 20140440.0,
"step": 10910
},
{
"entropy": 5.651127910614013,
"epoch": 0.917034236504936,
"grad_norm": 1.34375,
"learning_rate": 0.0004922103688062145,
"loss": 5.4833,
"mean_token_accuracy": 0.15852523893117904,
"num_tokens": 20149331.0,
"step": 10915
},
{
"entropy": 5.658089113235474,
"epoch": 0.9174543163201008,
"grad_norm": 1.3203125,
"learning_rate": 0.0004922025553330964,
"loss": 5.4451,
"mean_token_accuracy": 0.17089224606752396,
"num_tokens": 20158566.0,
"step": 10920
},
{
"entropy": 5.678445959091187,
"epoch": 0.9178743961352657,
"grad_norm": 1.34375,
"learning_rate": 0.000492194738012347,
"loss": 5.544,
"mean_token_accuracy": 0.16259633898735046,
"num_tokens": 20168339.0,
"step": 10925
},
{
"entropy": 5.716028356552124,
"epoch": 0.9182944759504306,
"grad_norm": 1.3984375,
"learning_rate": 0.0004921869168441045,
"loss": 5.5536,
"mean_token_accuracy": 0.15383095294237137,
"num_tokens": 20177967.0,
"step": 10930
},
{
"entropy": 5.637088251113892,
"epoch": 0.9187145557655955,
"grad_norm": 1.4375,
"learning_rate": 0.0004921790918285077,
"loss": 5.5568,
"mean_token_accuracy": 0.16159048825502395,
"num_tokens": 20187279.0,
"step": 10935
},
{
"entropy": 5.707053279876709,
"epoch": 0.9191346355807604,
"grad_norm": 1.578125,
"learning_rate": 0.0004921712629656951,
"loss": 5.6541,
"mean_token_accuracy": 0.17022398263216018,
"num_tokens": 20195324.0,
"step": 10940
},
{
"entropy": 5.740572452545166,
"epoch": 0.9195547153959253,
"grad_norm": 1.625,
"learning_rate": 0.0004921634302558054,
"loss": 5.5677,
"mean_token_accuracy": 0.16136983186006545,
"num_tokens": 20204985.0,
"step": 10945
},
{
"entropy": 5.679078197479248,
"epoch": 0.9199747952110902,
"grad_norm": 1.6875,
"learning_rate": 0.0004921555936989773,
"loss": 5.5835,
"mean_token_accuracy": 0.15574949830770493,
"num_tokens": 20214553.0,
"step": 10950
},
{
"entropy": 5.698373556137085,
"epoch": 0.9203948750262549,
"grad_norm": 1.3359375,
"learning_rate": 0.0004921477532953497,
"loss": 5.4858,
"mean_token_accuracy": 0.16730043292045593,
"num_tokens": 20224118.0,
"step": 10955
},
{
"entropy": 5.6933988571167,
"epoch": 0.9208149548414198,
"grad_norm": 1.28125,
"learning_rate": 0.0004921399090450616,
"loss": 5.4604,
"mean_token_accuracy": 0.15393470227718353,
"num_tokens": 20233719.0,
"step": 10960
},
{
"entropy": 5.638811302185059,
"epoch": 0.9212350346565847,
"grad_norm": 1.640625,
"learning_rate": 0.0004921320609482517,
"loss": 5.5464,
"mean_token_accuracy": 0.1613891303539276,
"num_tokens": 20242311.0,
"step": 10965
},
{
"entropy": 5.718568897247314,
"epoch": 0.9216551144717496,
"grad_norm": 1.5625,
"learning_rate": 0.0004921242090050591,
"loss": 5.5802,
"mean_token_accuracy": 0.15839444398880004,
"num_tokens": 20252998.0,
"step": 10970
},
{
"entropy": 5.725355386734009,
"epoch": 0.9220751942869145,
"grad_norm": 1.3125,
"learning_rate": 0.000492116353215623,
"loss": 5.63,
"mean_token_accuracy": 0.16329726874828338,
"num_tokens": 20262456.0,
"step": 10975
},
{
"entropy": 5.627430534362793,
"epoch": 0.9224952741020794,
"grad_norm": 1.328125,
"learning_rate": 0.0004921084935800825,
"loss": 5.3904,
"mean_token_accuracy": 0.17261915653944016,
"num_tokens": 20271516.0,
"step": 10980
},
{
"entropy": 5.653037452697754,
"epoch": 0.9229153539172443,
"grad_norm": 1.359375,
"learning_rate": 0.0004921006300985768,
"loss": 5.4391,
"mean_token_accuracy": 0.16310697942972183,
"num_tokens": 20280373.0,
"step": 10985
},
{
"entropy": 5.643791198730469,
"epoch": 0.9233354337324091,
"grad_norm": 1.3671875,
"learning_rate": 0.0004920927627712453,
"loss": 5.4371,
"mean_token_accuracy": 0.1669539228081703,
"num_tokens": 20289426.0,
"step": 10990
},
{
"entropy": 5.682681846618652,
"epoch": 0.923755513547574,
"grad_norm": 1.9296875,
"learning_rate": 0.0004920848915982273,
"loss": 5.5878,
"mean_token_accuracy": 0.1578148901462555,
"num_tokens": 20298045.0,
"step": 10995
},
{
"entropy": 5.6499217510223385,
"epoch": 0.9241755933627389,
"grad_norm": 1.3671875,
"learning_rate": 0.0004920770165796622,
"loss": 5.4716,
"mean_token_accuracy": 0.1633152633905411,
"num_tokens": 20307352.0,
"step": 11000
},
{
"entropy": 5.664845609664917,
"epoch": 0.9245956731779038,
"grad_norm": 1.6015625,
"learning_rate": 0.0004920691377156895,
"loss": 5.4945,
"mean_token_accuracy": 0.1626512423157692,
"num_tokens": 20316448.0,
"step": 11005
},
{
"entropy": 5.777086496353149,
"epoch": 0.9250157529930687,
"grad_norm": 1.6171875,
"learning_rate": 0.0004920612550064488,
"loss": 5.641,
"mean_token_accuracy": 0.1607293888926506,
"num_tokens": 20326440.0,
"step": 11010
},
{
"entropy": 5.655669260025024,
"epoch": 0.9254358328082336,
"grad_norm": 1.3671875,
"learning_rate": 0.0004920533684520797,
"loss": 5.431,
"mean_token_accuracy": 0.16383090168237685,
"num_tokens": 20335447.0,
"step": 11015
},
{
"entropy": 5.635196256637573,
"epoch": 0.9258559126233984,
"grad_norm": 1.6171875,
"learning_rate": 0.000492045478052722,
"loss": 5.5698,
"mean_token_accuracy": 0.1583259880542755,
"num_tokens": 20344523.0,
"step": 11020
},
{
"entropy": 5.65436954498291,
"epoch": 0.9262759924385633,
"grad_norm": 1.4375,
"learning_rate": 0.0004920375838085154,
"loss": 5.5243,
"mean_token_accuracy": 0.16398487687110902,
"num_tokens": 20354267.0,
"step": 11025
},
{
"entropy": 5.7046201705932615,
"epoch": 0.9266960722537282,
"grad_norm": 1.796875,
"learning_rate": 0.0004920296857195998,
"loss": 5.6023,
"mean_token_accuracy": 0.15840856581926346,
"num_tokens": 20364137.0,
"step": 11030
},
{
"entropy": 5.6905293464660645,
"epoch": 0.9271161520688931,
"grad_norm": 1.671875,
"learning_rate": 0.000492021783786115,
"loss": 5.4931,
"mean_token_accuracy": 0.16749118119478226,
"num_tokens": 20372583.0,
"step": 11035
},
{
"entropy": 5.655548763275147,
"epoch": 0.927536231884058,
"grad_norm": 1.546875,
"learning_rate": 0.0004920138780082011,
"loss": 5.4444,
"mean_token_accuracy": 0.1603078469634056,
"num_tokens": 20382050.0,
"step": 11040
},
{
"entropy": 5.616688871383667,
"epoch": 0.9279563116992229,
"grad_norm": 1.515625,
"learning_rate": 0.0004920059683859981,
"loss": 5.4141,
"mean_token_accuracy": 0.16826660186052322,
"num_tokens": 20391425.0,
"step": 11045
},
{
"entropy": 5.713768148422242,
"epoch": 0.9283763915143878,
"grad_norm": 1.53125,
"learning_rate": 0.0004919980549196461,
"loss": 5.568,
"mean_token_accuracy": 0.159714911878109,
"num_tokens": 20400559.0,
"step": 11050
},
{
"entropy": 5.690584897994995,
"epoch": 0.9287964713295526,
"grad_norm": 1.3515625,
"learning_rate": 0.0004919901376092853,
"loss": 5.5072,
"mean_token_accuracy": 0.16440173387527465,
"num_tokens": 20408985.0,
"step": 11055
},
{
"entropy": 5.639202308654785,
"epoch": 0.9292165511447175,
"grad_norm": 1.5,
"learning_rate": 0.0004919822164550559,
"loss": 5.5997,
"mean_token_accuracy": 0.14894191175699234,
"num_tokens": 20417855.0,
"step": 11060
},
{
"entropy": 5.681364822387695,
"epoch": 0.9296366309598824,
"grad_norm": 1.3203125,
"learning_rate": 0.0004919742914570983,
"loss": 5.5513,
"mean_token_accuracy": 0.1600523427128792,
"num_tokens": 20426191.0,
"step": 11065
},
{
"entropy": 5.688843154907227,
"epoch": 0.9300567107750473,
"grad_norm": 1.3828125,
"learning_rate": 0.000491966362615553,
"loss": 5.5361,
"mean_token_accuracy": 0.15442707315087317,
"num_tokens": 20435592.0,
"step": 11070
},
{
"entropy": 5.733451700210571,
"epoch": 0.9304767905902122,
"grad_norm": 1.3984375,
"learning_rate": 0.00049195842993056,
"loss": 5.5476,
"mean_token_accuracy": 0.16001696437597274,
"num_tokens": 20445504.0,
"step": 11075
},
{
"entropy": 5.7029729843139645,
"epoch": 0.930896870405377,
"grad_norm": 1.4453125,
"learning_rate": 0.0004919504934022604,
"loss": 5.499,
"mean_token_accuracy": 0.15569454431533813,
"num_tokens": 20455153.0,
"step": 11080
},
{
"entropy": 5.661473369598388,
"epoch": 0.931316950220542,
"grad_norm": 1.2890625,
"learning_rate": 0.0004919425530307943,
"loss": 5.4993,
"mean_token_accuracy": 0.15951350182294846,
"num_tokens": 20465101.0,
"step": 11085
},
{
"entropy": 5.632450342178345,
"epoch": 0.9317370300357067,
"grad_norm": 1.375,
"learning_rate": 0.0004919346088163028,
"loss": 5.5358,
"mean_token_accuracy": 0.1622280955314636,
"num_tokens": 20474700.0,
"step": 11090
},
{
"entropy": 5.722516965866089,
"epoch": 0.9321571098508716,
"grad_norm": 1.609375,
"learning_rate": 0.0004919266607589263,
"loss": 5.5613,
"mean_token_accuracy": 0.15339310318231583,
"num_tokens": 20483945.0,
"step": 11095
},
{
"entropy": 5.700594758987426,
"epoch": 0.9325771896660365,
"grad_norm": 1.421875,
"learning_rate": 0.0004919187088588057,
"loss": 5.5474,
"mean_token_accuracy": 0.1624033495783806,
"num_tokens": 20493307.0,
"step": 11100
},
{
"entropy": 5.659552097320557,
"epoch": 0.9329972694812014,
"grad_norm": 1.53125,
"learning_rate": 0.0004919107531160819,
"loss": 5.4732,
"mean_token_accuracy": 0.16901983320713043,
"num_tokens": 20501889.0,
"step": 11105
},
{
"entropy": 5.644443607330322,
"epoch": 0.9334173492963663,
"grad_norm": 1.5390625,
"learning_rate": 0.0004919027935308957,
"loss": 5.5147,
"mean_token_accuracy": 0.1635582149028778,
"num_tokens": 20510577.0,
"step": 11110
},
{
"entropy": 5.582364463806153,
"epoch": 0.9338374291115312,
"grad_norm": 1.3671875,
"learning_rate": 0.0004918948301033884,
"loss": 5.4685,
"mean_token_accuracy": 0.16453532576560975,
"num_tokens": 20520025.0,
"step": 11115
},
{
"entropy": 5.729832363128662,
"epoch": 0.9342575089266961,
"grad_norm": 1.71875,
"learning_rate": 0.0004918868628337007,
"loss": 5.5364,
"mean_token_accuracy": 0.16004782021045685,
"num_tokens": 20528989.0,
"step": 11120
},
{
"entropy": 5.722401762008667,
"epoch": 0.9346775887418609,
"grad_norm": 1.3515625,
"learning_rate": 0.0004918788917219739,
"loss": 5.4639,
"mean_token_accuracy": 0.16330705732107162,
"num_tokens": 20538328.0,
"step": 11125
},
{
"entropy": 5.684643316268921,
"epoch": 0.9350976685570258,
"grad_norm": 1.5625,
"learning_rate": 0.0004918709167683493,
"loss": 5.618,
"mean_token_accuracy": 0.15527107566595078,
"num_tokens": 20548069.0,
"step": 11130
},
{
"entropy": 5.62087116241455,
"epoch": 0.9355177483721907,
"grad_norm": 1.359375,
"learning_rate": 0.0004918629379729681,
"loss": 5.3577,
"mean_token_accuracy": 0.1718309447169304,
"num_tokens": 20557128.0,
"step": 11135
},
{
"entropy": 5.642658615112305,
"epoch": 0.9359378281873556,
"grad_norm": 1.515625,
"learning_rate": 0.0004918549553359715,
"loss": 5.4888,
"mean_token_accuracy": 0.1616463139653206,
"num_tokens": 20566352.0,
"step": 11140
},
{
"entropy": 5.673039436340332,
"epoch": 0.9363579080025205,
"grad_norm": 1.328125,
"learning_rate": 0.0004918469688575012,
"loss": 5.5125,
"mean_token_accuracy": 0.1632925733923912,
"num_tokens": 20575814.0,
"step": 11145
},
{
"entropy": 5.649928092956543,
"epoch": 0.9367779878176854,
"grad_norm": 1.3671875,
"learning_rate": 0.0004918389785376983,
"loss": 5.399,
"mean_token_accuracy": 0.16439888179302214,
"num_tokens": 20584715.0,
"step": 11150
},
{
"entropy": 5.607899522781372,
"epoch": 0.9371980676328503,
"grad_norm": 1.3984375,
"learning_rate": 0.0004918309843767047,
"loss": 5.4657,
"mean_token_accuracy": 0.1584487035870552,
"num_tokens": 20594630.0,
"step": 11155
},
{
"entropy": 5.618726491928101,
"epoch": 0.9376181474480151,
"grad_norm": 1.4140625,
"learning_rate": 0.0004918229863746618,
"loss": 5.4674,
"mean_token_accuracy": 0.1587045595049858,
"num_tokens": 20603653.0,
"step": 11160
},
{
"entropy": 5.6923305034637455,
"epoch": 0.93803822726318,
"grad_norm": 1.5625,
"learning_rate": 0.0004918149845317114,
"loss": 5.5296,
"mean_token_accuracy": 0.16108497381210327,
"num_tokens": 20612188.0,
"step": 11165
},
{
"entropy": 5.654456329345703,
"epoch": 0.9384583070783449,
"grad_norm": 1.3671875,
"learning_rate": 0.0004918069788479952,
"loss": 5.4462,
"mean_token_accuracy": 0.1687158852815628,
"num_tokens": 20620933.0,
"step": 11170
},
{
"entropy": 5.652120113372803,
"epoch": 0.9388783868935098,
"grad_norm": 1.4296875,
"learning_rate": 0.0004917989693236549,
"loss": 5.4875,
"mean_token_accuracy": 0.1668206810951233,
"num_tokens": 20629919.0,
"step": 11175
},
{
"entropy": 5.682731342315674,
"epoch": 0.9392984667086747,
"grad_norm": 1.4453125,
"learning_rate": 0.0004917909559588326,
"loss": 5.465,
"mean_token_accuracy": 0.1615450456738472,
"num_tokens": 20638475.0,
"step": 11180
},
{
"entropy": 5.754220056533813,
"epoch": 0.9397185465238396,
"grad_norm": 1.515625,
"learning_rate": 0.00049178293875367,
"loss": 5.5823,
"mean_token_accuracy": 0.1557904876768589,
"num_tokens": 20648105.0,
"step": 11185
},
{
"entropy": 5.608329772949219,
"epoch": 0.9401386263390044,
"grad_norm": 1.5625,
"learning_rate": 0.0004917749177083094,
"loss": 5.4916,
"mean_token_accuracy": 0.15995590686798095,
"num_tokens": 20657527.0,
"step": 11190
},
{
"entropy": 5.659386062622071,
"epoch": 0.9405587061541693,
"grad_norm": 1.4609375,
"learning_rate": 0.0004917668928228927,
"loss": 5.4957,
"mean_token_accuracy": 0.16557826548814775,
"num_tokens": 20666375.0,
"step": 11195
},
{
"entropy": 5.658402729034424,
"epoch": 0.9409787859693342,
"grad_norm": 1.3828125,
"learning_rate": 0.0004917588640975622,
"loss": 5.4458,
"mean_token_accuracy": 0.16658560037612916,
"num_tokens": 20675350.0,
"step": 11200
},
{
"entropy": 5.563868188858033,
"epoch": 0.941398865784499,
"grad_norm": 1.3046875,
"learning_rate": 0.00049175083153246,
"loss": 5.3752,
"mean_token_accuracy": 0.16457706093788146,
"num_tokens": 20684072.0,
"step": 11205
},
{
"entropy": 5.598710680007935,
"epoch": 0.941818945599664,
"grad_norm": 1.4609375,
"learning_rate": 0.0004917427951277284,
"loss": 5.4561,
"mean_token_accuracy": 0.16836450397968292,
"num_tokens": 20692989.0,
"step": 11210
},
{
"entropy": 5.691392278671264,
"epoch": 0.9422390254148288,
"grad_norm": 1.515625,
"learning_rate": 0.0004917347548835097,
"loss": 5.4403,
"mean_token_accuracy": 0.15893812775611876,
"num_tokens": 20701269.0,
"step": 11215
},
{
"entropy": 5.68915286064148,
"epoch": 0.9426591052299937,
"grad_norm": 1.4140625,
"learning_rate": 0.0004917267107999466,
"loss": 5.5515,
"mean_token_accuracy": 0.16261414885520936,
"num_tokens": 20709739.0,
"step": 11220
},
{
"entropy": 5.657668352127075,
"epoch": 0.9430791850451585,
"grad_norm": 2.296875,
"learning_rate": 0.0004917186628771812,
"loss": 5.4848,
"mean_token_accuracy": 0.1634127229452133,
"num_tokens": 20718950.0,
"step": 11225
},
{
"entropy": 5.669168424606323,
"epoch": 0.9434992648603234,
"grad_norm": 1.390625,
"learning_rate": 0.0004917106111153565,
"loss": 5.4763,
"mean_token_accuracy": 0.16090827137231828,
"num_tokens": 20729469.0,
"step": 11230
},
{
"entropy": 5.717650508880615,
"epoch": 0.9439193446754883,
"grad_norm": 1.5859375,
"learning_rate": 0.0004917025555146148,
"loss": 5.4905,
"mean_token_accuracy": 0.17064472585916518,
"num_tokens": 20738231.0,
"step": 11235
},
{
"entropy": 5.70476598739624,
"epoch": 0.9443394244906532,
"grad_norm": 1.4453125,
"learning_rate": 0.000491694496075099,
"loss": 5.6789,
"mean_token_accuracy": 0.14857278168201446,
"num_tokens": 20748578.0,
"step": 11240
},
{
"entropy": 5.7716655254364015,
"epoch": 0.9447595043058181,
"grad_norm": 1.4296875,
"learning_rate": 0.0004916864327969517,
"loss": 5.6139,
"mean_token_accuracy": 0.1479826033115387,
"num_tokens": 20759284.0,
"step": 11245
},
{
"entropy": 5.737630414962768,
"epoch": 0.945179584120983,
"grad_norm": 1.4765625,
"learning_rate": 0.0004916783656803158,
"loss": 5.5626,
"mean_token_accuracy": 0.16123252511024475,
"num_tokens": 20768186.0,
"step": 11250
},
{
"entropy": 5.648173189163208,
"epoch": 0.9455996639361479,
"grad_norm": 1.4375,
"learning_rate": 0.0004916702947253342,
"loss": 5.4262,
"mean_token_accuracy": 0.16916553676128387,
"num_tokens": 20776711.0,
"step": 11255
},
{
"entropy": 5.6307295799255375,
"epoch": 0.9460197437513127,
"grad_norm": 1.515625,
"learning_rate": 0.0004916622199321501,
"loss": 5.5014,
"mean_token_accuracy": 0.15936348885297774,
"num_tokens": 20785154.0,
"step": 11260
},
{
"entropy": 5.677838134765625,
"epoch": 0.9464398235664776,
"grad_norm": 1.53125,
"learning_rate": 0.0004916541413009062,
"loss": 5.4282,
"mean_token_accuracy": 0.1709165707230568,
"num_tokens": 20794114.0,
"step": 11265
},
{
"entropy": 5.734316635131836,
"epoch": 0.9468599033816425,
"grad_norm": 1.671875,
"learning_rate": 0.0004916460588317458,
"loss": 5.5242,
"mean_token_accuracy": 0.16012431532144547,
"num_tokens": 20803892.0,
"step": 11270
},
{
"entropy": 5.586827802658081,
"epoch": 0.9472799831968074,
"grad_norm": 1.4140625,
"learning_rate": 0.0004916379725248118,
"loss": 5.4265,
"mean_token_accuracy": 0.1667679503560066,
"num_tokens": 20812892.0,
"step": 11275
},
{
"entropy": 5.660504627227783,
"epoch": 0.9477000630119723,
"grad_norm": 1.359375,
"learning_rate": 0.0004916298823802479,
"loss": 5.4878,
"mean_token_accuracy": 0.15982511639595032,
"num_tokens": 20821934.0,
"step": 11280
},
{
"entropy": 5.59260663986206,
"epoch": 0.9481201428271372,
"grad_norm": 1.3359375,
"learning_rate": 0.0004916217883981971,
"loss": 5.4068,
"mean_token_accuracy": 0.16577256172895433,
"num_tokens": 20830100.0,
"step": 11285
},
{
"entropy": 5.655998659133911,
"epoch": 0.9485402226423021,
"grad_norm": 1.2890625,
"learning_rate": 0.0004916136905788029,
"loss": 5.486,
"mean_token_accuracy": 0.16212498694658278,
"num_tokens": 20839890.0,
"step": 11290
},
{
"entropy": 5.721244287490845,
"epoch": 0.9489603024574669,
"grad_norm": 1.4765625,
"learning_rate": 0.0004916055889222087,
"loss": 5.5913,
"mean_token_accuracy": 0.1496299162507057,
"num_tokens": 20848670.0,
"step": 11295
},
{
"entropy": 5.6661537170410154,
"epoch": 0.9493803822726318,
"grad_norm": 1.296875,
"learning_rate": 0.000491597483428558,
"loss": 5.4543,
"mean_token_accuracy": 0.16638226807117462,
"num_tokens": 20857291.0,
"step": 11300
},
{
"entropy": 5.579902648925781,
"epoch": 0.9498004620877967,
"grad_norm": 1.4765625,
"learning_rate": 0.0004915893740979944,
"loss": 5.4302,
"mean_token_accuracy": 0.16439789533615112,
"num_tokens": 20865341.0,
"step": 11305
},
{
"entropy": 5.676503372192383,
"epoch": 0.9502205419029616,
"grad_norm": 1.4609375,
"learning_rate": 0.0004915812609306617,
"loss": 5.5474,
"mean_token_accuracy": 0.16357194930315017,
"num_tokens": 20875194.0,
"step": 11310
},
{
"entropy": 5.66827392578125,
"epoch": 0.9506406217181265,
"grad_norm": 1.5,
"learning_rate": 0.0004915731439267034,
"loss": 5.4553,
"mean_token_accuracy": 0.16290259212255478,
"num_tokens": 20884831.0,
"step": 11315
},
{
"entropy": 5.562486600875855,
"epoch": 0.9510607015332914,
"grad_norm": 1.2578125,
"learning_rate": 0.0004915650230862634,
"loss": 5.3338,
"mean_token_accuracy": 0.1782681941986084,
"num_tokens": 20893790.0,
"step": 11320
},
{
"entropy": 5.596565675735474,
"epoch": 0.9514807813484563,
"grad_norm": 1.734375,
"learning_rate": 0.0004915568984094854,
"loss": 5.4803,
"mean_token_accuracy": 0.16112319082021714,
"num_tokens": 20902175.0,
"step": 11325
},
{
"entropy": 5.726467943191528,
"epoch": 0.951900861163621,
"grad_norm": 1.515625,
"learning_rate": 0.0004915487698965136,
"loss": 5.6184,
"mean_token_accuracy": 0.15359724164009095,
"num_tokens": 20911484.0,
"step": 11330
},
{
"entropy": 5.781221151351929,
"epoch": 0.952320940978786,
"grad_norm": 1.4375,
"learning_rate": 0.0004915406375474917,
"loss": 5.5453,
"mean_token_accuracy": 0.15792252421379088,
"num_tokens": 20920916.0,
"step": 11335
},
{
"entropy": 5.741652107238769,
"epoch": 0.9527410207939508,
"grad_norm": 1.3203125,
"learning_rate": 0.000491532501362564,
"loss": 5.5664,
"mean_token_accuracy": 0.16220796555280687,
"num_tokens": 20930219.0,
"step": 11340
},
{
"entropy": 5.57544355392456,
"epoch": 0.9531611006091157,
"grad_norm": 1.4765625,
"learning_rate": 0.0004915243613418745,
"loss": 5.3864,
"mean_token_accuracy": 0.17249523252248763,
"num_tokens": 20939591.0,
"step": 11345
},
{
"entropy": 5.6936798095703125,
"epoch": 0.9535811804242806,
"grad_norm": 1.3984375,
"learning_rate": 0.0004915162174855675,
"loss": 5.5717,
"mean_token_accuracy": 0.15805065482854844,
"num_tokens": 20950035.0,
"step": 11350
},
{
"entropy": 5.6745775699615475,
"epoch": 0.9540012602394455,
"grad_norm": 1.4765625,
"learning_rate": 0.0004915080697937872,
"loss": 5.4665,
"mean_token_accuracy": 0.16316088140010834,
"num_tokens": 20959168.0,
"step": 11355
},
{
"entropy": 5.566767168045044,
"epoch": 0.9544213400546103,
"grad_norm": 1.3671875,
"learning_rate": 0.0004914999182666779,
"loss": 5.415,
"mean_token_accuracy": 0.16996480226516725,
"num_tokens": 20967887.0,
"step": 11360
},
{
"entropy": 5.693580484390258,
"epoch": 0.9548414198697752,
"grad_norm": 1.578125,
"learning_rate": 0.0004914917629043839,
"loss": 5.5077,
"mean_token_accuracy": 0.15940580666065216,
"num_tokens": 20977558.0,
"step": 11365
},
{
"entropy": 5.588667678833008,
"epoch": 0.9552614996849401,
"grad_norm": 1.5078125,
"learning_rate": 0.00049148360370705,
"loss": 5.4812,
"mean_token_accuracy": 0.16379174292087556,
"num_tokens": 20986118.0,
"step": 11370
},
{
"entropy": 5.614265727996826,
"epoch": 0.955681579500105,
"grad_norm": 1.4921875,
"learning_rate": 0.0004914754406748204,
"loss": 5.3913,
"mean_token_accuracy": 0.1682723805308342,
"num_tokens": 20994623.0,
"step": 11375
},
{
"entropy": 5.693148231506347,
"epoch": 0.9561016593152699,
"grad_norm": 1.3125,
"learning_rate": 0.00049146727380784,
"loss": 5.5802,
"mean_token_accuracy": 0.16016335636377335,
"num_tokens": 21004193.0,
"step": 11380
},
{
"entropy": 5.597726583480835,
"epoch": 0.9565217391304348,
"grad_norm": 1.421875,
"learning_rate": 0.0004914591031062531,
"loss": 5.4044,
"mean_token_accuracy": 0.17047665268182755,
"num_tokens": 21013125.0,
"step": 11385
},
{
"entropy": 5.515113019943238,
"epoch": 0.9569418189455997,
"grad_norm": 1.3828125,
"learning_rate": 0.0004914509285702048,
"loss": 5.3387,
"mean_token_accuracy": 0.1710033819079399,
"num_tokens": 21021402.0,
"step": 11390
},
{
"entropy": 5.614928150177002,
"epoch": 0.9573618987607645,
"grad_norm": 1.3984375,
"learning_rate": 0.0004914427501998397,
"loss": 5.422,
"mean_token_accuracy": 0.16588329821825026,
"num_tokens": 21029639.0,
"step": 11395
},
{
"entropy": 5.619404268264771,
"epoch": 0.9577819785759294,
"grad_norm": 1.5,
"learning_rate": 0.0004914345679953027,
"loss": 5.4523,
"mean_token_accuracy": 0.16462402492761613,
"num_tokens": 21037525.0,
"step": 11400
},
{
"entropy": 5.662878179550171,
"epoch": 0.9582020583910943,
"grad_norm": 1.484375,
"learning_rate": 0.0004914263819567388,
"loss": 5.5605,
"mean_token_accuracy": 0.15533360093832016,
"num_tokens": 21047702.0,
"step": 11405
},
{
"entropy": 5.694554328918457,
"epoch": 0.9586221382062592,
"grad_norm": 1.40625,
"learning_rate": 0.000491418192084293,
"loss": 5.4548,
"mean_token_accuracy": 0.16688745468854904,
"num_tokens": 21056379.0,
"step": 11410
},
{
"entropy": 5.633595609664917,
"epoch": 0.9590422180214241,
"grad_norm": 1.40625,
"learning_rate": 0.0004914099983781104,
"loss": 5.4566,
"mean_token_accuracy": 0.16330905705690385,
"num_tokens": 21065283.0,
"step": 11415
},
{
"entropy": 5.640380907058716,
"epoch": 0.959462297836589,
"grad_norm": 1.5703125,
"learning_rate": 0.000491401800838336,
"loss": 5.5745,
"mean_token_accuracy": 0.15918146967887878,
"num_tokens": 21074938.0,
"step": 11420
},
{
"entropy": 5.626530456542969,
"epoch": 0.9598823776517539,
"grad_norm": 1.3359375,
"learning_rate": 0.0004913935994651153,
"loss": 5.4281,
"mean_token_accuracy": 0.17172765135765075,
"num_tokens": 21084729.0,
"step": 11425
},
{
"entropy": 5.564239358901977,
"epoch": 0.9603024574669187,
"grad_norm": 1.46875,
"learning_rate": 0.0004913853942585932,
"loss": 5.3283,
"mean_token_accuracy": 0.17437093555927277,
"num_tokens": 21093456.0,
"step": 11430
},
{
"entropy": 5.600566244125366,
"epoch": 0.9607225372820836,
"grad_norm": 1.3515625,
"learning_rate": 0.0004913771852189155,
"loss": 5.4625,
"mean_token_accuracy": 0.15994736552238464,
"num_tokens": 21102980.0,
"step": 11435
},
{
"entropy": 5.754529666900635,
"epoch": 0.9611426170972485,
"grad_norm": 1.3359375,
"learning_rate": 0.0004913689723462271,
"loss": 5.6763,
"mean_token_accuracy": 0.17234763503074646,
"num_tokens": 21112777.0,
"step": 11440
},
{
"entropy": 5.676597976684571,
"epoch": 0.9615626969124134,
"grad_norm": 1.6171875,
"learning_rate": 0.000491360755640674,
"loss": 5.5803,
"mean_token_accuracy": 0.15645822137594223,
"num_tokens": 21122139.0,
"step": 11445
},
{
"entropy": 5.639119720458984,
"epoch": 0.9619827767275783,
"grad_norm": 1.453125,
"learning_rate": 0.0004913525351024014,
"loss": 5.453,
"mean_token_accuracy": 0.1607119247317314,
"num_tokens": 21131425.0,
"step": 11450
},
{
"entropy": 5.6201681137084964,
"epoch": 0.9624028565427432,
"grad_norm": 1.25,
"learning_rate": 0.0004913443107315552,
"loss": 5.4341,
"mean_token_accuracy": 0.15983420610427856,
"num_tokens": 21140784.0,
"step": 11455
},
{
"entropy": 5.6580445766448975,
"epoch": 0.962822936357908,
"grad_norm": 1.4453125,
"learning_rate": 0.0004913360825282807,
"loss": 5.4335,
"mean_token_accuracy": 0.1675620973110199,
"num_tokens": 21150408.0,
"step": 11460
},
{
"entropy": 5.589797449111939,
"epoch": 0.9632430161730728,
"grad_norm": 1.328125,
"learning_rate": 0.000491327850492724,
"loss": 5.5304,
"mean_token_accuracy": 0.16759325116872786,
"num_tokens": 21158915.0,
"step": 11465
},
{
"entropy": 5.505474710464478,
"epoch": 0.9636630959882377,
"grad_norm": 1.4375,
"learning_rate": 0.0004913196146250309,
"loss": 5.3419,
"mean_token_accuracy": 0.1716780662536621,
"num_tokens": 21167336.0,
"step": 11470
},
{
"entropy": 5.689284896850586,
"epoch": 0.9640831758034026,
"grad_norm": 1.4140625,
"learning_rate": 0.0004913113749253472,
"loss": 5.6449,
"mean_token_accuracy": 0.16215803027153014,
"num_tokens": 21177499.0,
"step": 11475
},
{
"entropy": 5.74305419921875,
"epoch": 0.9645032556185675,
"grad_norm": 1.6484375,
"learning_rate": 0.0004913031313938188,
"loss": 5.5509,
"mean_token_accuracy": 0.1595839351415634,
"num_tokens": 21186961.0,
"step": 11480
},
{
"entropy": 5.683131408691406,
"epoch": 0.9649233354337324,
"grad_norm": 1.1953125,
"learning_rate": 0.0004912948840305919,
"loss": 5.4379,
"mean_token_accuracy": 0.17446769773960114,
"num_tokens": 21196364.0,
"step": 11485
},
{
"entropy": 5.647530221939087,
"epoch": 0.9653434152488973,
"grad_norm": 1.359375,
"learning_rate": 0.0004912866328358125,
"loss": 5.5191,
"mean_token_accuracy": 0.15950247049331664,
"num_tokens": 21206376.0,
"step": 11490
},
{
"entropy": 5.638066530227661,
"epoch": 0.9657634950640621,
"grad_norm": 1.375,
"learning_rate": 0.0004912783778096266,
"loss": 5.4934,
"mean_token_accuracy": 0.1693543791770935,
"num_tokens": 21215889.0,
"step": 11495
},
{
"entropy": 5.700996589660645,
"epoch": 0.966183574879227,
"grad_norm": 1.3046875,
"learning_rate": 0.0004912701189521808,
"loss": 5.5062,
"mean_token_accuracy": 0.16749416589736937,
"num_tokens": 21224959.0,
"step": 11500
},
{
"entropy": 5.757201671600342,
"epoch": 0.9666036546943919,
"grad_norm": 1.4765625,
"learning_rate": 0.0004912618562636211,
"loss": 5.6296,
"mean_token_accuracy": 0.15160779058933258,
"num_tokens": 21234495.0,
"step": 11505
},
{
"entropy": 5.642830944061279,
"epoch": 0.9670237345095568,
"grad_norm": 1.2578125,
"learning_rate": 0.000491253589744094,
"loss": 5.47,
"mean_token_accuracy": 0.16335225403308867,
"num_tokens": 21244555.0,
"step": 11510
},
{
"entropy": 5.703874015808106,
"epoch": 0.9674438143247217,
"grad_norm": 1.8671875,
"learning_rate": 0.0004912453193937459,
"loss": 5.6037,
"mean_token_accuracy": 0.1612747997045517,
"num_tokens": 21254199.0,
"step": 11515
},
{
"entropy": 5.69853458404541,
"epoch": 0.9678638941398866,
"grad_norm": 1.953125,
"learning_rate": 0.0004912370452127234,
"loss": 5.5001,
"mean_token_accuracy": 0.15972875952720642,
"num_tokens": 21262723.0,
"step": 11520
},
{
"entropy": 5.683942985534668,
"epoch": 0.9682839739550515,
"grad_norm": 1.234375,
"learning_rate": 0.0004912287672011728,
"loss": 5.426,
"mean_token_accuracy": 0.1622313767671585,
"num_tokens": 21271283.0,
"step": 11525
},
{
"entropy": 5.589861679077148,
"epoch": 0.9687040537702163,
"grad_norm": 1.4140625,
"learning_rate": 0.0004912204853592411,
"loss": 5.4774,
"mean_token_accuracy": 0.17342782616615296,
"num_tokens": 21279542.0,
"step": 11530
},
{
"entropy": 5.613545656204224,
"epoch": 0.9691241335853812,
"grad_norm": 1.3984375,
"learning_rate": 0.0004912121996870748,
"loss": 5.4448,
"mean_token_accuracy": 0.16918166279792785,
"num_tokens": 21288678.0,
"step": 11535
},
{
"entropy": 5.773236703872681,
"epoch": 0.9695442134005461,
"grad_norm": 1.28125,
"learning_rate": 0.0004912039101848207,
"loss": 5.5955,
"mean_token_accuracy": 0.16231737807393073,
"num_tokens": 21298982.0,
"step": 11540
},
{
"entropy": 5.701774406433105,
"epoch": 0.969964293215711,
"grad_norm": 1.7109375,
"learning_rate": 0.0004911956168526257,
"loss": 5.5289,
"mean_token_accuracy": 0.16124602109193803,
"num_tokens": 21307663.0,
"step": 11545
},
{
"entropy": 5.712226152420044,
"epoch": 0.9703843730308759,
"grad_norm": 1.4453125,
"learning_rate": 0.0004911873196906366,
"loss": 5.5326,
"mean_token_accuracy": 0.15940239503979683,
"num_tokens": 21318004.0,
"step": 11550
},
{
"entropy": 5.570864295959472,
"epoch": 0.9708044528460408,
"grad_norm": 1.5859375,
"learning_rate": 0.0004911790186990005,
"loss": 5.3405,
"mean_token_accuracy": 0.1776915341615677,
"num_tokens": 21327373.0,
"step": 11555
},
{
"entropy": 5.576349306106567,
"epoch": 0.9712245326612057,
"grad_norm": 1.5,
"learning_rate": 0.0004911707138778643,
"loss": 5.4399,
"mean_token_accuracy": 0.1666841670870781,
"num_tokens": 21335654.0,
"step": 11560
},
{
"entropy": 5.672850465774536,
"epoch": 0.9716446124763705,
"grad_norm": 1.4453125,
"learning_rate": 0.0004911624052273754,
"loss": 5.5129,
"mean_token_accuracy": 0.1632431373000145,
"num_tokens": 21344464.0,
"step": 11565
},
{
"entropy": 5.76579852104187,
"epoch": 0.9720646922915354,
"grad_norm": 1.4140625,
"learning_rate": 0.0004911540927476807,
"loss": 5.6111,
"mean_token_accuracy": 0.15846215337514877,
"num_tokens": 21354121.0,
"step": 11570
},
{
"entropy": 5.708047771453858,
"epoch": 0.9724847721067003,
"grad_norm": 1.4453125,
"learning_rate": 0.0004911457764389275,
"loss": 5.5433,
"mean_token_accuracy": 0.1658056989312172,
"num_tokens": 21363395.0,
"step": 11575
},
{
"entropy": 5.672170209884643,
"epoch": 0.9729048519218652,
"grad_norm": 2.203125,
"learning_rate": 0.0004911374563012633,
"loss": 5.4944,
"mean_token_accuracy": 0.16023199558258056,
"num_tokens": 21372126.0,
"step": 11580
},
{
"entropy": 5.726519393920898,
"epoch": 0.97332493173703,
"grad_norm": 1.609375,
"learning_rate": 0.0004911291323348352,
"loss": 5.5963,
"mean_token_accuracy": 0.1553143873810768,
"num_tokens": 21380554.0,
"step": 11585
},
{
"entropy": 5.6460357189178465,
"epoch": 0.973745011552195,
"grad_norm": 1.4296875,
"learning_rate": 0.0004911208045397909,
"loss": 5.4832,
"mean_token_accuracy": 0.16235848218202592,
"num_tokens": 21389317.0,
"step": 11590
},
{
"entropy": 5.753270435333252,
"epoch": 0.9741650913673598,
"grad_norm": 1.546875,
"learning_rate": 0.0004911124729162778,
"loss": 5.593,
"mean_token_accuracy": 0.15693895667791366,
"num_tokens": 21398926.0,
"step": 11595
},
{
"entropy": 5.707015132904052,
"epoch": 0.9745851711825246,
"grad_norm": 1.328125,
"learning_rate": 0.0004911041374644435,
"loss": 5.3961,
"mean_token_accuracy": 0.1671022891998291,
"num_tokens": 21406962.0,
"step": 11600
},
{
"entropy": 5.666591787338257,
"epoch": 0.9750052509976895,
"grad_norm": 1.484375,
"learning_rate": 0.0004910957981844357,
"loss": 5.4868,
"mean_token_accuracy": 0.164098384976387,
"num_tokens": 21415868.0,
"step": 11605
},
{
"entropy": 5.754735374450684,
"epoch": 0.9754253308128544,
"grad_norm": 1.484375,
"learning_rate": 0.0004910874550764022,
"loss": 5.6053,
"mean_token_accuracy": 0.1618281587958336,
"num_tokens": 21424544.0,
"step": 11610
},
{
"entropy": 5.600016689300537,
"epoch": 0.9758454106280193,
"grad_norm": 1.515625,
"learning_rate": 0.0004910791081404907,
"loss": 5.4663,
"mean_token_accuracy": 0.17111330032348632,
"num_tokens": 21433589.0,
"step": 11615
},
{
"entropy": 5.637977123260498,
"epoch": 0.9762654904431842,
"grad_norm": 1.5,
"learning_rate": 0.0004910707573768489,
"loss": 5.5349,
"mean_token_accuracy": 0.1595746397972107,
"num_tokens": 21442084.0,
"step": 11620
},
{
"entropy": 5.64194393157959,
"epoch": 0.9766855702583491,
"grad_norm": 1.640625,
"learning_rate": 0.0004910624027856251,
"loss": 5.447,
"mean_token_accuracy": 0.16450028717517853,
"num_tokens": 21450962.0,
"step": 11625
},
{
"entropy": 5.686526966094971,
"epoch": 0.977105650073514,
"grad_norm": 1.53125,
"learning_rate": 0.0004910540443669669,
"loss": 5.5391,
"mean_token_accuracy": 0.15900047048926352,
"num_tokens": 21461322.0,
"step": 11630
},
{
"entropy": 5.672098588943482,
"epoch": 0.9775257298886788,
"grad_norm": 1.40625,
"learning_rate": 0.0004910456821210227,
"loss": 5.5263,
"mean_token_accuracy": 0.16485550701618196,
"num_tokens": 21470800.0,
"step": 11635
},
{
"entropy": 5.630803632736206,
"epoch": 0.9779458097038437,
"grad_norm": 1.3125,
"learning_rate": 0.0004910373160479404,
"loss": 5.3681,
"mean_token_accuracy": 0.171659155189991,
"num_tokens": 21479707.0,
"step": 11640
},
{
"entropy": 5.655557298660279,
"epoch": 0.9783658895190086,
"grad_norm": 1.6171875,
"learning_rate": 0.0004910289461478683,
"loss": 5.5608,
"mean_token_accuracy": 0.15193586573004722,
"num_tokens": 21489469.0,
"step": 11645
},
{
"entropy": 5.692305946350098,
"epoch": 0.9787859693341735,
"grad_norm": 1.265625,
"learning_rate": 0.0004910205724209547,
"loss": 5.5266,
"mean_token_accuracy": 0.15883257240056992,
"num_tokens": 21499226.0,
"step": 11650
},
{
"entropy": 5.579811143875122,
"epoch": 0.9792060491493384,
"grad_norm": 1.3828125,
"learning_rate": 0.0004910121948673478,
"loss": 5.391,
"mean_token_accuracy": 0.16931790709495545,
"num_tokens": 21508129.0,
"step": 11655
},
{
"entropy": 5.625033140182495,
"epoch": 0.9796261289645033,
"grad_norm": 1.765625,
"learning_rate": 0.0004910038134871962,
"loss": 5.4415,
"mean_token_accuracy": 0.16273742616176606,
"num_tokens": 21516293.0,
"step": 11660
},
{
"entropy": 5.692939329147339,
"epoch": 0.9800462087796681,
"grad_norm": 1.40625,
"learning_rate": 0.0004909954282806482,
"loss": 5.5821,
"mean_token_accuracy": 0.1591893032193184,
"num_tokens": 21525393.0,
"step": 11665
},
{
"entropy": 5.590066194534302,
"epoch": 0.980466288594833,
"grad_norm": 1.5078125,
"learning_rate": 0.0004909870392478524,
"loss": 5.4386,
"mean_token_accuracy": 0.1672815203666687,
"num_tokens": 21534585.0,
"step": 11670
},
{
"entropy": 5.601227903366089,
"epoch": 0.9808863684099979,
"grad_norm": 1.390625,
"learning_rate": 0.0004909786463889575,
"loss": 5.3922,
"mean_token_accuracy": 0.16998750865459442,
"num_tokens": 21542947.0,
"step": 11675
},
{
"entropy": 5.647561883926391,
"epoch": 0.9813064482251628,
"grad_norm": 1.5390625,
"learning_rate": 0.0004909702497041121,
"loss": 5.4924,
"mean_token_accuracy": 0.16468349248170852,
"num_tokens": 21552168.0,
"step": 11680
},
{
"entropy": 5.675682210922242,
"epoch": 0.9817265280403277,
"grad_norm": 1.3046875,
"learning_rate": 0.0004909618491934648,
"loss": 5.5096,
"mean_token_accuracy": 0.1663383349776268,
"num_tokens": 21562131.0,
"step": 11685
},
{
"entropy": 5.604850959777832,
"epoch": 0.9821466078554926,
"grad_norm": 1.3515625,
"learning_rate": 0.0004909534448571647,
"loss": 5.4505,
"mean_token_accuracy": 0.17098020613193513,
"num_tokens": 21571363.0,
"step": 11690
},
{
"entropy": 5.635071516036987,
"epoch": 0.9825666876706575,
"grad_norm": 1.3046875,
"learning_rate": 0.0004909450366953604,
"loss": 5.4146,
"mean_token_accuracy": 0.16778166890144347,
"num_tokens": 21580754.0,
"step": 11695
},
{
"entropy": 5.620409727096558,
"epoch": 0.9829867674858223,
"grad_norm": 1.421875,
"learning_rate": 0.000490936624708201,
"loss": 5.5205,
"mean_token_accuracy": 0.16398802250623704,
"num_tokens": 21590053.0,
"step": 11700
},
{
"entropy": 5.581881427764893,
"epoch": 0.9834068473009872,
"grad_norm": 1.4296875,
"learning_rate": 0.0004909282088958356,
"loss": 5.4897,
"mean_token_accuracy": 0.16222208589315415,
"num_tokens": 21598681.0,
"step": 11705
},
{
"entropy": 5.7037766456604,
"epoch": 0.983826927116152,
"grad_norm": 1.3515625,
"learning_rate": 0.000490919789258413,
"loss": 5.5024,
"mean_token_accuracy": 0.16776590049266815,
"num_tokens": 21607465.0,
"step": 11710
},
{
"entropy": 5.673490762710571,
"epoch": 0.984247006931317,
"grad_norm": 1.3046875,
"learning_rate": 0.0004909113657960826,
"loss": 5.5709,
"mean_token_accuracy": 0.15339512825012208,
"num_tokens": 21617480.0,
"step": 11715
},
{
"entropy": 5.655919981002808,
"epoch": 0.9846670867464818,
"grad_norm": 1.4921875,
"learning_rate": 0.0004909029385089935,
"loss": 5.5013,
"mean_token_accuracy": 0.16508855521678925,
"num_tokens": 21626434.0,
"step": 11720
},
{
"entropy": 5.6823704719543455,
"epoch": 0.9850871665616467,
"grad_norm": 1.4765625,
"learning_rate": 0.000490894507397295,
"loss": 5.4905,
"mean_token_accuracy": 0.166464164853096,
"num_tokens": 21635627.0,
"step": 11725
},
{
"entropy": 5.6854860305786135,
"epoch": 0.9855072463768116,
"grad_norm": 1.3984375,
"learning_rate": 0.0004908860724611365,
"loss": 5.4769,
"mean_token_accuracy": 0.1653437554836273,
"num_tokens": 21644789.0,
"step": 11730
},
{
"entropy": 5.610777854919434,
"epoch": 0.9859273261919764,
"grad_norm": 1.546875,
"learning_rate": 0.0004908776337006675,
"loss": 5.4821,
"mean_token_accuracy": 0.16152163594961166,
"num_tokens": 21653696.0,
"step": 11735
},
{
"entropy": 5.6638861179351805,
"epoch": 0.9863474060071413,
"grad_norm": 1.4609375,
"learning_rate": 0.0004908691911160373,
"loss": 5.4699,
"mean_token_accuracy": 0.15693951398134232,
"num_tokens": 21664420.0,
"step": 11740
},
{
"entropy": 5.65993366241455,
"epoch": 0.9867674858223062,
"grad_norm": 1.453125,
"learning_rate": 0.0004908607447073954,
"loss": 5.4718,
"mean_token_accuracy": 0.16778032034635543,
"num_tokens": 21673716.0,
"step": 11745
},
{
"entropy": 5.634006547927856,
"epoch": 0.9871875656374711,
"grad_norm": 1.359375,
"learning_rate": 0.0004908522944748917,
"loss": 5.4514,
"mean_token_accuracy": 0.17267897576093674,
"num_tokens": 21682860.0,
"step": 11750
},
{
"entropy": 5.509404897689819,
"epoch": 0.987607645452636,
"grad_norm": 1.53125,
"learning_rate": 0.0004908438404186758,
"loss": 5.4731,
"mean_token_accuracy": 0.16950045078992843,
"num_tokens": 21691915.0,
"step": 11755
},
{
"entropy": 5.672783470153808,
"epoch": 0.9880277252678009,
"grad_norm": 1.34375,
"learning_rate": 0.0004908353825388973,
"loss": 5.5825,
"mean_token_accuracy": 0.15650345236063004,
"num_tokens": 21701666.0,
"step": 11760
},
{
"entropy": 5.7960083961486815,
"epoch": 0.9884478050829658,
"grad_norm": 1.4375,
"learning_rate": 0.0004908269208357062,
"loss": 5.5217,
"mean_token_accuracy": 0.16534726023674012,
"num_tokens": 21709267.0,
"step": 11765
},
{
"entropy": 5.622351837158203,
"epoch": 0.9888678848981306,
"grad_norm": 1.3515625,
"learning_rate": 0.0004908184553092523,
"loss": 5.3953,
"mean_token_accuracy": 0.16779804825782776,
"num_tokens": 21718117.0,
"step": 11770
},
{
"entropy": 5.647730779647827,
"epoch": 0.9892879647132955,
"grad_norm": 1.4140625,
"learning_rate": 0.0004908099859596856,
"loss": 5.543,
"mean_token_accuracy": 0.16623370349407196,
"num_tokens": 21727952.0,
"step": 11775
},
{
"entropy": 5.66669807434082,
"epoch": 0.9897080445284604,
"grad_norm": 1.390625,
"learning_rate": 0.0004908015127871561,
"loss": 5.411,
"mean_token_accuracy": 0.16361401975154877,
"num_tokens": 21737878.0,
"step": 11780
},
{
"entropy": 5.567669343948364,
"epoch": 0.9901281243436253,
"grad_norm": 1.484375,
"learning_rate": 0.000490793035791814,
"loss": 5.3689,
"mean_token_accuracy": 0.169732029736042,
"num_tokens": 21747391.0,
"step": 11785
},
{
"entropy": 5.570152378082275,
"epoch": 0.9905482041587902,
"grad_norm": 1.4453125,
"learning_rate": 0.0004907845549738093,
"loss": 5.3976,
"mean_token_accuracy": 0.16566923558712005,
"num_tokens": 21756791.0,
"step": 11790
},
{
"entropy": 5.51328330039978,
"epoch": 0.9909682839739551,
"grad_norm": 1.4453125,
"learning_rate": 0.0004907760703332923,
"loss": 5.4445,
"mean_token_accuracy": 0.16655617505311965,
"num_tokens": 21766020.0,
"step": 11795
},
{
"entropy": 5.70251579284668,
"epoch": 0.99138836378912,
"grad_norm": 1.4375,
"learning_rate": 0.0004907675818704134,
"loss": 5.5443,
"mean_token_accuracy": 0.16133727729320527,
"num_tokens": 21775895.0,
"step": 11800
},
{
"entropy": 5.703081703186035,
"epoch": 0.9918084436042848,
"grad_norm": 1.625,
"learning_rate": 0.0004907590895853228,
"loss": 5.4606,
"mean_token_accuracy": 0.16604946404695511,
"num_tokens": 21784543.0,
"step": 11805
},
{
"entropy": 5.630713033676147,
"epoch": 0.9922285234194497,
"grad_norm": 1.46875,
"learning_rate": 0.0004907505934781712,
"loss": 5.5144,
"mean_token_accuracy": 0.16048821806907654,
"num_tokens": 21793938.0,
"step": 11810
},
{
"entropy": 5.621685886383057,
"epoch": 0.9926486032346146,
"grad_norm": 1.3671875,
"learning_rate": 0.0004907420935491087,
"loss": 5.487,
"mean_token_accuracy": 0.16330500245094298,
"num_tokens": 21803641.0,
"step": 11815
},
{
"entropy": 5.652135419845581,
"epoch": 0.9930686830497795,
"grad_norm": 1.6484375,
"learning_rate": 0.0004907335897982862,
"loss": 5.4115,
"mean_token_accuracy": 0.17158966660499572,
"num_tokens": 21812542.0,
"step": 11820
},
{
"entropy": 5.599951648712159,
"epoch": 0.9934887628649444,
"grad_norm": 1.453125,
"learning_rate": 0.0004907250822258543,
"loss": 5.4967,
"mean_token_accuracy": 0.16001633405685425,
"num_tokens": 21821847.0,
"step": 11825
},
{
"entropy": 5.774102830886841,
"epoch": 0.9939088426801093,
"grad_norm": 1.390625,
"learning_rate": 0.0004907165708319637,
"loss": 5.5476,
"mean_token_accuracy": 0.16471952199935913,
"num_tokens": 21830799.0,
"step": 11830
},
{
"entropy": 5.616507863998413,
"epoch": 0.994328922495274,
"grad_norm": 1.53125,
"learning_rate": 0.0004907080556167651,
"loss": 5.464,
"mean_token_accuracy": 0.16898656040430068,
"num_tokens": 21840202.0,
"step": 11835
},
{
"entropy": 5.66267147064209,
"epoch": 0.994749002310439,
"grad_norm": 1.53125,
"learning_rate": 0.0004906995365804093,
"loss": 5.5721,
"mean_token_accuracy": 0.15978300273418428,
"num_tokens": 21849701.0,
"step": 11840
},
{
"entropy": 5.700849723815918,
"epoch": 0.9951690821256038,
"grad_norm": 1.375,
"learning_rate": 0.0004906910137230472,
"loss": 5.4583,
"mean_token_accuracy": 0.1672790139913559,
"num_tokens": 21859191.0,
"step": 11845
},
{
"entropy": 5.632527494430542,
"epoch": 0.9955891619407687,
"grad_norm": 1.8046875,
"learning_rate": 0.00049068248704483,
"loss": 5.4493,
"mean_token_accuracy": 0.16411733329296113,
"num_tokens": 21867944.0,
"step": 11850
},
{
"entropy": 5.5771478652954105,
"epoch": 0.9960092417559336,
"grad_norm": 1.625,
"learning_rate": 0.0004906739565459085,
"loss": 5.4698,
"mean_token_accuracy": 0.16230135709047316,
"num_tokens": 21876368.0,
"step": 11855
},
{
"entropy": 5.753003358840942,
"epoch": 0.9964293215710985,
"grad_norm": 1.5625,
"learning_rate": 0.000490665422226434,
"loss": 5.5633,
"mean_token_accuracy": 0.15508055686950684,
"num_tokens": 21885634.0,
"step": 11860
},
{
"entropy": 5.568900871276855,
"epoch": 0.9968494013862634,
"grad_norm": 1.6484375,
"learning_rate": 0.0004906568840865576,
"loss": 5.3646,
"mean_token_accuracy": 0.17266131490468978,
"num_tokens": 21894315.0,
"step": 11865
},
{
"entropy": 5.576476907730102,
"epoch": 0.9972694812014282,
"grad_norm": 1.5,
"learning_rate": 0.0004906483421264305,
"loss": 5.4972,
"mean_token_accuracy": 0.16627979129552842,
"num_tokens": 21903342.0,
"step": 11870
},
{
"entropy": 5.677194356918335,
"epoch": 0.9976895610165931,
"grad_norm": 1.4140625,
"learning_rate": 0.000490639796346204,
"loss": 5.5986,
"mean_token_accuracy": 0.15755953639745712,
"num_tokens": 21914158.0,
"step": 11875
},
{
"entropy": 5.770134115219117,
"epoch": 0.998109640831758,
"grad_norm": 1.421875,
"learning_rate": 0.0004906312467460297,
"loss": 5.488,
"mean_token_accuracy": 0.1663819894194603,
"num_tokens": 21922639.0,
"step": 11880
},
{
"entropy": 5.622851228713989,
"epoch": 0.9985297206469229,
"grad_norm": 1.5,
"learning_rate": 0.0004906226933260588,
"loss": 5.4633,
"mean_token_accuracy": 0.16364375054836272,
"num_tokens": 21931385.0,
"step": 11885
},
{
"entropy": 5.684553384780884,
"epoch": 0.9989498004620878,
"grad_norm": 1.3984375,
"learning_rate": 0.0004906141360864429,
"loss": 5.4898,
"mean_token_accuracy": 0.16104650050401687,
"num_tokens": 21940788.0,
"step": 11890
},
{
"entropy": 5.725577688217163,
"epoch": 0.9993698802772527,
"grad_norm": 1.640625,
"learning_rate": 0.0004906055750273336,
"loss": 5.4977,
"mean_token_accuracy": 0.16556380838155746,
"num_tokens": 21950309.0,
"step": 11895
},
{
"entropy": 5.6455409049987795,
"epoch": 0.9997899600924176,
"grad_norm": 1.6875,
"learning_rate": 0.0004905970101488826,
"loss": 5.5074,
"mean_token_accuracy": 0.16334970146417618,
"num_tokens": 21959141.0,
"step": 11900
},
{
"entropy": 5.710307068294949,
"epoch": 1.000168031926066,
"grad_norm": 1.2578125,
"learning_rate": 0.0004905884414512416,
"loss": 5.5582,
"mean_token_accuracy": 0.1661406440867318,
"num_tokens": 21966665.0,
"step": 11905
},
{
"entropy": 5.709056758880616,
"epoch": 1.0005881117412307,
"grad_norm": 1.4375,
"learning_rate": 0.0004905798689345623,
"loss": 5.4849,
"mean_token_accuracy": 0.16853023320436478,
"num_tokens": 21976728.0,
"step": 11910
},
{
"entropy": 5.63147554397583,
"epoch": 1.0010081915563958,
"grad_norm": 1.515625,
"learning_rate": 0.0004905712925989968,
"loss": 5.3332,
"mean_token_accuracy": 0.1637764275074005,
"num_tokens": 21985915.0,
"step": 11915
},
{
"entropy": 5.627887344360351,
"epoch": 1.0014282713715605,
"grad_norm": 1.3828125,
"learning_rate": 0.0004905627124446967,
"loss": 5.3974,
"mean_token_accuracy": 0.16635343581438064,
"num_tokens": 21995826.0,
"step": 11920
},
{
"entropy": 5.627153444290161,
"epoch": 1.0018483511867255,
"grad_norm": 1.4609375,
"learning_rate": 0.0004905541284718142,
"loss": 5.3478,
"mean_token_accuracy": 0.170246821641922,
"num_tokens": 22005299.0,
"step": 11925
},
{
"entropy": 5.641215467453003,
"epoch": 1.0022684310018903,
"grad_norm": 1.546875,
"learning_rate": 0.0004905455406805011,
"loss": 5.3837,
"mean_token_accuracy": 0.16680040806531907,
"num_tokens": 22014499.0,
"step": 11930
},
{
"entropy": 5.672396039962768,
"epoch": 1.0026885108170553,
"grad_norm": 1.484375,
"learning_rate": 0.00049053694907091,
"loss": 5.5323,
"mean_token_accuracy": 0.15733788013458253,
"num_tokens": 22024531.0,
"step": 11935
},
{
"entropy": 5.646308374404907,
"epoch": 1.0031085906322201,
"grad_norm": 2.21875,
"learning_rate": 0.0004905283536431928,
"loss": 5.426,
"mean_token_accuracy": 0.16464308202266692,
"num_tokens": 22034036.0,
"step": 11940
},
{
"entropy": 5.607198762893677,
"epoch": 1.003528670447385,
"grad_norm": 1.4296875,
"learning_rate": 0.0004905197543975017,
"loss": 5.3512,
"mean_token_accuracy": 0.1657976523041725,
"num_tokens": 22042910.0,
"step": 11945
},
{
"entropy": 5.660063123703003,
"epoch": 1.00394875026255,
"grad_norm": 1.34375,
"learning_rate": 0.0004905111513339892,
"loss": 5.461,
"mean_token_accuracy": 0.16730546355247497,
"num_tokens": 22052242.0,
"step": 11950
},
{
"entropy": 5.648103475570679,
"epoch": 1.0043688300777147,
"grad_norm": 1.6953125,
"learning_rate": 0.0004905025444528076,
"loss": 5.4245,
"mean_token_accuracy": 0.16425618678331375,
"num_tokens": 22061467.0,
"step": 11955
},
{
"entropy": 5.526777505874634,
"epoch": 1.0047889098928797,
"grad_norm": 1.421875,
"learning_rate": 0.0004904939337541093,
"loss": 5.2711,
"mean_token_accuracy": 0.17251382023096085,
"num_tokens": 22070300.0,
"step": 11960
},
{
"entropy": 5.673946237564087,
"epoch": 1.0052089897080445,
"grad_norm": 1.5859375,
"learning_rate": 0.0004904853192380472,
"loss": 5.4353,
"mean_token_accuracy": 0.1664825990796089,
"num_tokens": 22078960.0,
"step": 11965
},
{
"entropy": 5.659275007247925,
"epoch": 1.0056290695232095,
"grad_norm": 1.3515625,
"learning_rate": 0.0004904767009047733,
"loss": 5.3807,
"mean_token_accuracy": 0.16314680129289627,
"num_tokens": 22088135.0,
"step": 11970
},
{
"entropy": 5.664426994323731,
"epoch": 1.0060491493383743,
"grad_norm": 1.640625,
"learning_rate": 0.0004904680787544408,
"loss": 5.4914,
"mean_token_accuracy": 0.16022274345159532,
"num_tokens": 22098004.0,
"step": 11975
},
{
"entropy": 5.702074432373047,
"epoch": 1.006469229153539,
"grad_norm": 1.4140625,
"learning_rate": 0.0004904594527872022,
"loss": 5.4766,
"mean_token_accuracy": 0.15825158208608628,
"num_tokens": 22107680.0,
"step": 11980
},
{
"entropy": 5.68216609954834,
"epoch": 1.006889308968704,
"grad_norm": 1.375,
"learning_rate": 0.0004904508230032103,
"loss": 5.4545,
"mean_token_accuracy": 0.1670009523630142,
"num_tokens": 22118004.0,
"step": 11985
},
{
"entropy": 5.61159930229187,
"epoch": 1.0073093887838689,
"grad_norm": 1.4765625,
"learning_rate": 0.000490442189402618,
"loss": 5.406,
"mean_token_accuracy": 0.1737432822585106,
"num_tokens": 22127825.0,
"step": 11990
},
{
"entropy": 5.6002617359161375,
"epoch": 1.007729468599034,
"grad_norm": 1.3828125,
"learning_rate": 0.0004904335519855783,
"loss": 5.3418,
"mean_token_accuracy": 0.1677705705165863,
"num_tokens": 22136448.0,
"step": 11995
},
{
"entropy": 5.57069959640503,
"epoch": 1.0081495484141987,
"grad_norm": 1.609375,
"learning_rate": 0.0004904249107522442,
"loss": 5.4416,
"mean_token_accuracy": 0.16570404022932053,
"num_tokens": 22146415.0,
"step": 12000
},
{
"epoch": 1.0081495484141987,
"eval_entropy": 5.448210272077472,
"eval_loss": 5.511696815490723,
"eval_mean_token_accuracy": 0.17007384661069086,
"eval_num_tokens": 22146415.0,
"eval_runtime": 27.4734,
"eval_samples_per_second": 1360.078,
"eval_steps_per_second": 170.019,
"step": 12000
},
{
"entropy": 5.7423820972442625,
"epoch": 1.0085696282293637,
"grad_norm": 1.625,
"learning_rate": 0.0004904162657027685,
"loss": 5.5822,
"mean_token_accuracy": 0.16078717708587648,
"num_tokens": 22156327.0,
"step": 12005
},
{
"entropy": 5.66963791847229,
"epoch": 1.0089897080445285,
"grad_norm": 1.5078125,
"learning_rate": 0.0004904076168373049,
"loss": 5.3764,
"mean_token_accuracy": 0.17133131325244905,
"num_tokens": 22165677.0,
"step": 12010
},
{
"entropy": 5.660669994354248,
"epoch": 1.0094097878596933,
"grad_norm": 1.3515625,
"learning_rate": 0.0004903989641560061,
"loss": 5.5023,
"mean_token_accuracy": 0.1685717523097992,
"num_tokens": 22175232.0,
"step": 12015
},
{
"entropy": 5.688869380950928,
"epoch": 1.0098298676748583,
"grad_norm": 1.4609375,
"learning_rate": 0.0004903903076590256,
"loss": 5.3645,
"mean_token_accuracy": 0.16198563128709792,
"num_tokens": 22184026.0,
"step": 12020
},
{
"entropy": 5.54636435508728,
"epoch": 1.010249947490023,
"grad_norm": 1.5546875,
"learning_rate": 0.0004903816473465167,
"loss": 5.2858,
"mean_token_accuracy": 0.18181220293045045,
"num_tokens": 22192020.0,
"step": 12025
},
{
"entropy": 5.477762174606323,
"epoch": 1.010670027305188,
"grad_norm": 1.640625,
"learning_rate": 0.0004903729832186328,
"loss": 5.2837,
"mean_token_accuracy": 0.17269555926322938,
"num_tokens": 22200060.0,
"step": 12030
},
{
"entropy": 5.626640844345093,
"epoch": 1.0110901071203529,
"grad_norm": 1.421875,
"learning_rate": 0.0004903643152755274,
"loss": 5.3175,
"mean_token_accuracy": 0.1659637376666069,
"num_tokens": 22208625.0,
"step": 12035
},
{
"entropy": 5.63057050704956,
"epoch": 1.0115101869355176,
"grad_norm": 1.578125,
"learning_rate": 0.0004903556435173541,
"loss": 5.3185,
"mean_token_accuracy": 0.16828427612781524,
"num_tokens": 22217781.0,
"step": 12040
},
{
"entropy": 5.634527635574341,
"epoch": 1.0119302667506826,
"grad_norm": 1.546875,
"learning_rate": 0.0004903469679442665,
"loss": 5.4268,
"mean_token_accuracy": 0.16385108083486558,
"num_tokens": 22226432.0,
"step": 12045
},
{
"entropy": 5.567414665222168,
"epoch": 1.0123503465658474,
"grad_norm": 1.53125,
"learning_rate": 0.0004903382885564181,
"loss": 5.4451,
"mean_token_accuracy": 0.16746296286582946,
"num_tokens": 22234811.0,
"step": 12050
},
{
"entropy": 5.530305528640747,
"epoch": 1.0127704263810124,
"grad_norm": 1.5,
"learning_rate": 0.000490329605353963,
"loss": 5.3303,
"mean_token_accuracy": 0.1724723160266876,
"num_tokens": 22242808.0,
"step": 12055
},
{
"entropy": 5.664348220825195,
"epoch": 1.0131905061961772,
"grad_norm": 1.7109375,
"learning_rate": 0.0004903209183370547,
"loss": 5.383,
"mean_token_accuracy": 0.16988759338855744,
"num_tokens": 22251371.0,
"step": 12060
},
{
"entropy": 5.7766921043396,
"epoch": 1.0136105860113422,
"grad_norm": 1.5546875,
"learning_rate": 0.0004903122275058472,
"loss": 5.4667,
"mean_token_accuracy": 0.16579170525074005,
"num_tokens": 22260868.0,
"step": 12065
},
{
"entropy": 5.582829332351684,
"epoch": 1.014030665826507,
"grad_norm": 1.421875,
"learning_rate": 0.0004903035328604944,
"loss": 5.3622,
"mean_token_accuracy": 0.16627193689346315,
"num_tokens": 22270554.0,
"step": 12070
},
{
"entropy": 5.596042013168335,
"epoch": 1.0144507456416718,
"grad_norm": 1.75,
"learning_rate": 0.0004902948344011506,
"loss": 5.3626,
"mean_token_accuracy": 0.16696448624134064,
"num_tokens": 22279170.0,
"step": 12075
},
{
"entropy": 5.681842565536499,
"epoch": 1.0148708254568368,
"grad_norm": 1.4765625,
"learning_rate": 0.0004902861321279694,
"loss": 5.4919,
"mean_token_accuracy": 0.16172150075435637,
"num_tokens": 22288788.0,
"step": 12080
},
{
"entropy": 5.57488694190979,
"epoch": 1.0152909052720016,
"grad_norm": 1.5859375,
"learning_rate": 0.0004902774260411055,
"loss": 5.3062,
"mean_token_accuracy": 0.17076831310987473,
"num_tokens": 22297501.0,
"step": 12085
},
{
"entropy": 5.55194787979126,
"epoch": 1.0157109850871666,
"grad_norm": 1.59375,
"learning_rate": 0.0004902687161407126,
"loss": 5.2508,
"mean_token_accuracy": 0.17739553600549698,
"num_tokens": 22306181.0,
"step": 12090
},
{
"entropy": 5.574270343780517,
"epoch": 1.0161310649023314,
"grad_norm": 2.03125,
"learning_rate": 0.0004902600024269454,
"loss": 5.4038,
"mean_token_accuracy": 0.17074478268623353,
"num_tokens": 22315762.0,
"step": 12095
},
{
"entropy": 5.56434326171875,
"epoch": 1.0165511447174964,
"grad_norm": 1.9140625,
"learning_rate": 0.000490251284899958,
"loss": 5.3602,
"mean_token_accuracy": 0.166584350168705,
"num_tokens": 22325127.0,
"step": 12100
},
{
"entropy": 5.566229295730591,
"epoch": 1.0169712245326612,
"grad_norm": 1.625,
"learning_rate": 0.000490242563559905,
"loss": 5.4461,
"mean_token_accuracy": 0.1646648034453392,
"num_tokens": 22334038.0,
"step": 12105
},
{
"entropy": 5.610032463073731,
"epoch": 1.017391304347826,
"grad_norm": 1.3671875,
"learning_rate": 0.0004902338384069408,
"loss": 5.2956,
"mean_token_accuracy": 0.17210006713867188,
"num_tokens": 22342658.0,
"step": 12110
},
{
"entropy": 5.700829744338989,
"epoch": 1.017811384162991,
"grad_norm": 1.375,
"learning_rate": 0.00049022510944122,
"loss": 5.4687,
"mean_token_accuracy": 0.1618720069527626,
"num_tokens": 22352559.0,
"step": 12115
},
{
"entropy": 5.652797079086303,
"epoch": 1.0182314639781558,
"grad_norm": 1.421875,
"learning_rate": 0.0004902163766628972,
"loss": 5.3819,
"mean_token_accuracy": 0.16738210171461104,
"num_tokens": 22361455.0,
"step": 12120
},
{
"entropy": 5.668959331512451,
"epoch": 1.0186515437933208,
"grad_norm": 1.5625,
"learning_rate": 0.0004902076400721271,
"loss": 5.4144,
"mean_token_accuracy": 0.1660313591361046,
"num_tokens": 22371163.0,
"step": 12125
},
{
"entropy": 5.692332792282104,
"epoch": 1.0190716236084856,
"grad_norm": 1.8046875,
"learning_rate": 0.0004901988996690645,
"loss": 5.4026,
"mean_token_accuracy": 0.17311506420373918,
"num_tokens": 22379975.0,
"step": 12130
},
{
"entropy": 5.713879871368408,
"epoch": 1.0194917034236506,
"grad_norm": 1.5390625,
"learning_rate": 0.0004901901554538641,
"loss": 5.4513,
"mean_token_accuracy": 0.16618536561727523,
"num_tokens": 22389657.0,
"step": 12135
},
{
"entropy": 5.539191389083863,
"epoch": 1.0199117832388154,
"grad_norm": 1.4609375,
"learning_rate": 0.000490181407426681,
"loss": 5.289,
"mean_token_accuracy": 0.17534529268741608,
"num_tokens": 22398320.0,
"step": 12140
},
{
"entropy": 5.5694207668304445,
"epoch": 1.0203318630539802,
"grad_norm": 1.953125,
"learning_rate": 0.0004901726555876701,
"loss": 5.4911,
"mean_token_accuracy": 0.16052723973989486,
"num_tokens": 22406634.0,
"step": 12145
},
{
"entropy": 5.658481025695801,
"epoch": 1.0207519428691452,
"grad_norm": 1.640625,
"learning_rate": 0.0004901638999369862,
"loss": 5.5276,
"mean_token_accuracy": 0.1623757913708687,
"num_tokens": 22415939.0,
"step": 12150
},
{
"entropy": 5.691515064239502,
"epoch": 1.02117202268431,
"grad_norm": 1.4921875,
"learning_rate": 0.0004901551404747847,
"loss": 5.4431,
"mean_token_accuracy": 0.16109129637479783,
"num_tokens": 22425256.0,
"step": 12155
},
{
"entropy": 5.649143648147583,
"epoch": 1.021592102499475,
"grad_norm": 1.515625,
"learning_rate": 0.0004901463772012209,
"loss": 5.5128,
"mean_token_accuracy": 0.16139682829380037,
"num_tokens": 22434750.0,
"step": 12160
},
{
"entropy": 5.584879207611084,
"epoch": 1.0220121823146397,
"grad_norm": 1.578125,
"learning_rate": 0.0004901376101164495,
"loss": 5.3978,
"mean_token_accuracy": 0.16362105011940004,
"num_tokens": 22443426.0,
"step": 12165
},
{
"entropy": 5.607901620864868,
"epoch": 1.0224322621298048,
"grad_norm": 1.734375,
"learning_rate": 0.0004901288392206263,
"loss": 5.4015,
"mean_token_accuracy": 0.16088145673274995,
"num_tokens": 22452778.0,
"step": 12170
},
{
"entropy": 5.598963737487793,
"epoch": 1.0228523419449695,
"grad_norm": 1.5,
"learning_rate": 0.0004901200645139064,
"loss": 5.3614,
"mean_token_accuracy": 0.1733039990067482,
"num_tokens": 22462864.0,
"step": 12175
},
{
"entropy": 5.589771032333374,
"epoch": 1.0232724217601343,
"grad_norm": 1.75,
"learning_rate": 0.0004901112859964454,
"loss": 5.4078,
"mean_token_accuracy": 0.1707241028547287,
"num_tokens": 22472849.0,
"step": 12180
},
{
"entropy": 5.596211242675781,
"epoch": 1.0236925015752993,
"grad_norm": 1.7109375,
"learning_rate": 0.0004901025036683987,
"loss": 5.3457,
"mean_token_accuracy": 0.16777551621198655,
"num_tokens": 22481693.0,
"step": 12185
},
{
"entropy": 5.585362195968628,
"epoch": 1.0241125813904641,
"grad_norm": 1.390625,
"learning_rate": 0.0004900937175299219,
"loss": 5.3664,
"mean_token_accuracy": 0.17000767588615417,
"num_tokens": 22490934.0,
"step": 12190
},
{
"entropy": 5.606501674652099,
"epoch": 1.0245326612056291,
"grad_norm": 1.515625,
"learning_rate": 0.0004900849275811707,
"loss": 5.4103,
"mean_token_accuracy": 0.1638326808810234,
"num_tokens": 22500457.0,
"step": 12195
},
{
"entropy": 5.653303146362305,
"epoch": 1.024952741020794,
"grad_norm": 1.6328125,
"learning_rate": 0.0004900761338223007,
"loss": 5.3294,
"mean_token_accuracy": 0.16317504793405532,
"num_tokens": 22509641.0,
"step": 12200
},
{
"entropy": 5.549904870986938,
"epoch": 1.025372820835959,
"grad_norm": 1.4453125,
"learning_rate": 0.0004900673362534677,
"loss": 5.2831,
"mean_token_accuracy": 0.17752161622047424,
"num_tokens": 22518616.0,
"step": 12205
},
{
"entropy": 5.593005084991455,
"epoch": 1.0257929006511237,
"grad_norm": 1.390625,
"learning_rate": 0.0004900585348748277,
"loss": 5.4345,
"mean_token_accuracy": 0.1716150164604187,
"num_tokens": 22527599.0,
"step": 12210
},
{
"entropy": 5.5883626461029055,
"epoch": 1.0262129804662885,
"grad_norm": 1.6015625,
"learning_rate": 0.0004900497296865365,
"loss": 5.4513,
"mean_token_accuracy": 0.15790560841560364,
"num_tokens": 22537399.0,
"step": 12215
},
{
"entropy": 5.835514545440674,
"epoch": 1.0266330602814535,
"grad_norm": 1.5234375,
"learning_rate": 0.0004900409206887499,
"loss": 5.6197,
"mean_token_accuracy": 0.15950456708669664,
"num_tokens": 22546746.0,
"step": 12220
},
{
"entropy": 5.738766622543335,
"epoch": 1.0270531400966183,
"grad_norm": 1.4296875,
"learning_rate": 0.0004900321078816243,
"loss": 5.4261,
"mean_token_accuracy": 0.1695483461022377,
"num_tokens": 22555735.0,
"step": 12225
},
{
"entropy": 5.64575343132019,
"epoch": 1.0274732199117833,
"grad_norm": 1.3359375,
"learning_rate": 0.0004900232912653156,
"loss": 5.4061,
"mean_token_accuracy": 0.16903394013643264,
"num_tokens": 22565010.0,
"step": 12230
},
{
"entropy": 5.622378730773926,
"epoch": 1.027893299726948,
"grad_norm": 1.4296875,
"learning_rate": 0.00049001447083998,
"loss": 5.4049,
"mean_token_accuracy": 0.16650903224945068,
"num_tokens": 22573565.0,
"step": 12235
},
{
"entropy": 5.621281671524048,
"epoch": 1.028313379542113,
"grad_norm": 1.484375,
"learning_rate": 0.0004900056466057737,
"loss": 5.3948,
"mean_token_accuracy": 0.1670803725719452,
"num_tokens": 22582549.0,
"step": 12240
},
{
"entropy": 5.609878969192505,
"epoch": 1.028733459357278,
"grad_norm": 1.5,
"learning_rate": 0.0004899968185628531,
"loss": 5.4517,
"mean_token_accuracy": 0.16443442553281784,
"num_tokens": 22592112.0,
"step": 12245
},
{
"entropy": 5.614514493942261,
"epoch": 1.0291535391724427,
"grad_norm": 1.40625,
"learning_rate": 0.0004899879867113746,
"loss": 5.2862,
"mean_token_accuracy": 0.17238962799310684,
"num_tokens": 22600581.0,
"step": 12250
},
{
"entropy": 5.687255477905273,
"epoch": 1.0295736189876077,
"grad_norm": 1.4921875,
"learning_rate": 0.0004899791510514945,
"loss": 5.4855,
"mean_token_accuracy": 0.16246309280395507,
"num_tokens": 22610822.0,
"step": 12255
},
{
"entropy": 5.670682430267334,
"epoch": 1.0299936988027725,
"grad_norm": 1.4453125,
"learning_rate": 0.0004899703115833696,
"loss": 5.5025,
"mean_token_accuracy": 0.16369097977876662,
"num_tokens": 22619484.0,
"step": 12260
},
{
"entropy": 5.6433234214782715,
"epoch": 1.0304137786179375,
"grad_norm": 1.546875,
"learning_rate": 0.0004899614683071563,
"loss": 5.3418,
"mean_token_accuracy": 0.16978320479393005,
"num_tokens": 22629038.0,
"step": 12265
},
{
"entropy": 5.630164051055909,
"epoch": 1.0308338584331023,
"grad_norm": 1.953125,
"learning_rate": 0.0004899526212230112,
"loss": 5.4258,
"mean_token_accuracy": 0.15771337747573852,
"num_tokens": 22638619.0,
"step": 12270
},
{
"entropy": 5.556281185150146,
"epoch": 1.0312539382482673,
"grad_norm": 1.6875,
"learning_rate": 0.0004899437703310912,
"loss": 5.4118,
"mean_token_accuracy": 0.16302530169487,
"num_tokens": 22648065.0,
"step": 12275
},
{
"entropy": 5.7112713813781735,
"epoch": 1.031674018063432,
"grad_norm": 1.6328125,
"learning_rate": 0.0004899349156315529,
"loss": 5.4726,
"mean_token_accuracy": 0.16393186151981354,
"num_tokens": 22658107.0,
"step": 12280
},
{
"entropy": 5.662258052825928,
"epoch": 1.0320940978785969,
"grad_norm": 1.3828125,
"learning_rate": 0.0004899260571245533,
"loss": 5.3623,
"mean_token_accuracy": 0.17062537670135497,
"num_tokens": 22667103.0,
"step": 12285
},
{
"entropy": 5.555991840362549,
"epoch": 1.0325141776937619,
"grad_norm": 1.34375,
"learning_rate": 0.0004899171948102492,
"loss": 5.3425,
"mean_token_accuracy": 0.16608135551214218,
"num_tokens": 22676792.0,
"step": 12290
},
{
"entropy": 5.612425088882446,
"epoch": 1.0329342575089266,
"grad_norm": 1.53125,
"learning_rate": 0.0004899083286887977,
"loss": 5.3765,
"mean_token_accuracy": 0.16904007345438005,
"num_tokens": 22685344.0,
"step": 12295
},
{
"entropy": 5.665998268127441,
"epoch": 1.0333543373240917,
"grad_norm": 1.484375,
"learning_rate": 0.0004898994587603559,
"loss": 5.4213,
"mean_token_accuracy": 0.1652013510465622,
"num_tokens": 22694387.0,
"step": 12300
},
{
"entropy": 5.581104946136475,
"epoch": 1.0337744171392564,
"grad_norm": 1.4765625,
"learning_rate": 0.0004898905850250807,
"loss": 5.4686,
"mean_token_accuracy": 0.16657924205064772,
"num_tokens": 22704203.0,
"step": 12305
},
{
"entropy": 5.690647602081299,
"epoch": 1.0341944969544214,
"grad_norm": 1.484375,
"learning_rate": 0.0004898817074831295,
"loss": 5.5006,
"mean_token_accuracy": 0.16644016206264495,
"num_tokens": 22713518.0,
"step": 12310
},
{
"entropy": 5.730755805969238,
"epoch": 1.0346145767695862,
"grad_norm": 1.3203125,
"learning_rate": 0.0004898728261346595,
"loss": 5.5162,
"mean_token_accuracy": 0.15961872935295104,
"num_tokens": 22722997.0,
"step": 12315
},
{
"entropy": 5.690422677993775,
"epoch": 1.035034656584751,
"grad_norm": 1.4140625,
"learning_rate": 0.000489863940979828,
"loss": 5.448,
"mean_token_accuracy": 0.1639517143368721,
"num_tokens": 22732385.0,
"step": 12320
},
{
"entropy": 5.5706113338470455,
"epoch": 1.035454736399916,
"grad_norm": 1.2890625,
"learning_rate": 0.0004898550520187925,
"loss": 5.3195,
"mean_token_accuracy": 0.17500197887420654,
"num_tokens": 22741148.0,
"step": 12325
},
{
"entropy": 5.583190250396728,
"epoch": 1.0358748162150808,
"grad_norm": 1.640625,
"learning_rate": 0.0004898461592517103,
"loss": 5.3239,
"mean_token_accuracy": 0.17272863388061524,
"num_tokens": 22750239.0,
"step": 12330
},
{
"entropy": 5.684148597717285,
"epoch": 1.0362948960302458,
"grad_norm": 1.5078125,
"learning_rate": 0.0004898372626787391,
"loss": 5.4773,
"mean_token_accuracy": 0.16207296401262283,
"num_tokens": 22759290.0,
"step": 12335
},
{
"entropy": 5.689363956451416,
"epoch": 1.0367149758454106,
"grad_norm": 1.4140625,
"learning_rate": 0.0004898283623000364,
"loss": 5.4982,
"mean_token_accuracy": 0.15504724234342576,
"num_tokens": 22768450.0,
"step": 12340
},
{
"entropy": 5.665671014785767,
"epoch": 1.0371350556605754,
"grad_norm": 1.578125,
"learning_rate": 0.0004898194581157598,
"loss": 5.3457,
"mean_token_accuracy": 0.16443716287612914,
"num_tokens": 22777711.0,
"step": 12345
},
{
"entropy": 5.654232358932495,
"epoch": 1.0375551354757404,
"grad_norm": 1.546875,
"learning_rate": 0.0004898105501260671,
"loss": 5.446,
"mean_token_accuracy": 0.16645502150058747,
"num_tokens": 22787153.0,
"step": 12350
},
{
"entropy": 5.686613702774048,
"epoch": 1.0379752152909052,
"grad_norm": 1.4921875,
"learning_rate": 0.0004898016383311163,
"loss": 5.4173,
"mean_token_accuracy": 0.1745339259505272,
"num_tokens": 22797125.0,
"step": 12355
},
{
"entropy": 5.6651740074157715,
"epoch": 1.0383952951060702,
"grad_norm": 1.46875,
"learning_rate": 0.000489792722731065,
"loss": 5.4277,
"mean_token_accuracy": 0.16517363041639327,
"num_tokens": 22806478.0,
"step": 12360
},
{
"entropy": 5.676910877227783,
"epoch": 1.038815374921235,
"grad_norm": 1.453125,
"learning_rate": 0.0004897838033260712,
"loss": 5.4351,
"mean_token_accuracy": 0.15935799479484558,
"num_tokens": 22815375.0,
"step": 12365
},
{
"entropy": 5.680210399627685,
"epoch": 1.0392354547364,
"grad_norm": 2.359375,
"learning_rate": 0.0004897748801162929,
"loss": 5.4069,
"mean_token_accuracy": 0.16446397304534913,
"num_tokens": 22824401.0,
"step": 12370
},
{
"entropy": 5.650290250778198,
"epoch": 1.0396555345515648,
"grad_norm": 1.4765625,
"learning_rate": 0.0004897659531018882,
"loss": 5.5155,
"mean_token_accuracy": 0.1638687551021576,
"num_tokens": 22833933.0,
"step": 12375
},
{
"entropy": 5.5616044998168945,
"epoch": 1.0400756143667296,
"grad_norm": 1.46875,
"learning_rate": 0.0004897570222830152,
"loss": 5.3881,
"mean_token_accuracy": 0.16457553654909135,
"num_tokens": 22843779.0,
"step": 12380
},
{
"entropy": 5.750216913223267,
"epoch": 1.0404956941818946,
"grad_norm": 1.4140625,
"learning_rate": 0.0004897480876598322,
"loss": 5.4905,
"mean_token_accuracy": 0.16095164567232131,
"num_tokens": 22852951.0,
"step": 12385
},
{
"entropy": 5.728332042694092,
"epoch": 1.0409157739970594,
"grad_norm": 1.4453125,
"learning_rate": 0.0004897391492324974,
"loss": 5.4784,
"mean_token_accuracy": 0.16274571269750596,
"num_tokens": 22861398.0,
"step": 12390
},
{
"entropy": 5.6263368129730225,
"epoch": 1.0413358538122244,
"grad_norm": 1.8046875,
"learning_rate": 0.0004897302070011691,
"loss": 5.3824,
"mean_token_accuracy": 0.17079239189624787,
"num_tokens": 22870518.0,
"step": 12395
},
{
"entropy": 5.606213474273682,
"epoch": 1.0417559336273892,
"grad_norm": 1.3515625,
"learning_rate": 0.0004897212609660058,
"loss": 5.4476,
"mean_token_accuracy": 0.1651351511478424,
"num_tokens": 22879389.0,
"step": 12400
},
{
"entropy": 5.606588840484619,
"epoch": 1.0421760134425542,
"grad_norm": 1.59375,
"learning_rate": 0.0004897123111271659,
"loss": 5.4434,
"mean_token_accuracy": 0.17210057824850084,
"num_tokens": 22888977.0,
"step": 12405
},
{
"entropy": 5.714909410476684,
"epoch": 1.042596093257719,
"grad_norm": 1.46875,
"learning_rate": 0.0004897033574848079,
"loss": 5.4599,
"mean_token_accuracy": 0.16815780401229857,
"num_tokens": 22898446.0,
"step": 12410
},
{
"entropy": 5.635931825637817,
"epoch": 1.0430161730728837,
"grad_norm": 1.5703125,
"learning_rate": 0.0004896944000390907,
"loss": 5.4467,
"mean_token_accuracy": 0.16541572213172911,
"num_tokens": 22908044.0,
"step": 12415
},
{
"entropy": 5.68047137260437,
"epoch": 1.0434362528880488,
"grad_norm": 1.421875,
"learning_rate": 0.0004896854387901725,
"loss": 5.5176,
"mean_token_accuracy": 0.15805390030145644,
"num_tokens": 22917330.0,
"step": 12420
},
{
"entropy": 5.704816675186157,
"epoch": 1.0438563327032135,
"grad_norm": 1.4453125,
"learning_rate": 0.0004896764737382124,
"loss": 5.4724,
"mean_token_accuracy": 0.17054068595170974,
"num_tokens": 22927160.0,
"step": 12425
},
{
"entropy": 5.693249273300171,
"epoch": 1.0442764125183785,
"grad_norm": 1.2890625,
"learning_rate": 0.0004896675048833691,
"loss": 5.4172,
"mean_token_accuracy": 0.16629046201705933,
"num_tokens": 22936755.0,
"step": 12430
},
{
"entropy": 5.65311713218689,
"epoch": 1.0446964923335433,
"grad_norm": 1.484375,
"learning_rate": 0.0004896585322258014,
"loss": 5.4201,
"mean_token_accuracy": 0.16618072092533112,
"num_tokens": 22945699.0,
"step": 12435
},
{
"entropy": 5.583677911758423,
"epoch": 1.0451165721487083,
"grad_norm": 1.5078125,
"learning_rate": 0.0004896495557656685,
"loss": 5.3931,
"mean_token_accuracy": 0.17709663063287734,
"num_tokens": 22954001.0,
"step": 12440
},
{
"entropy": 5.696814155578613,
"epoch": 1.0455366519638731,
"grad_norm": 1.453125,
"learning_rate": 0.0004896405755031293,
"loss": 5.4766,
"mean_token_accuracy": 0.16204729974269866,
"num_tokens": 22963805.0,
"step": 12445
},
{
"entropy": 5.61180157661438,
"epoch": 1.045956731779038,
"grad_norm": 1.6015625,
"learning_rate": 0.0004896315914383427,
"loss": 5.4318,
"mean_token_accuracy": 0.15509928911924362,
"num_tokens": 22973542.0,
"step": 12450
},
{
"entropy": 5.540133476257324,
"epoch": 1.046376811594203,
"grad_norm": 1.421875,
"learning_rate": 0.0004896226035714679,
"loss": 5.3062,
"mean_token_accuracy": 0.17043613642454147,
"num_tokens": 22982417.0,
"step": 12455
},
{
"entropy": 5.597504425048828,
"epoch": 1.0467968914093677,
"grad_norm": 1.6328125,
"learning_rate": 0.0004896136119026642,
"loss": 5.41,
"mean_token_accuracy": 0.16592884063720703,
"num_tokens": 22992879.0,
"step": 12460
},
{
"entropy": 5.59436526298523,
"epoch": 1.0472169712245327,
"grad_norm": 1.5078125,
"learning_rate": 0.0004896046164320911,
"loss": 5.3045,
"mean_token_accuracy": 0.17583135962486268,
"num_tokens": 23001344.0,
"step": 12465
},
{
"entropy": 5.55146484375,
"epoch": 1.0476370510396975,
"grad_norm": 1.390625,
"learning_rate": 0.0004895956171599075,
"loss": 5.3568,
"mean_token_accuracy": 0.1739198938012123,
"num_tokens": 23010007.0,
"step": 12470
},
{
"entropy": 5.65653715133667,
"epoch": 1.0480571308548625,
"grad_norm": 1.4375,
"learning_rate": 0.0004895866140862731,
"loss": 5.4797,
"mean_token_accuracy": 0.16263213604688645,
"num_tokens": 23019120.0,
"step": 12475
},
{
"entropy": 5.662118196487427,
"epoch": 1.0484772106700273,
"grad_norm": 1.3515625,
"learning_rate": 0.0004895776072113473,
"loss": 5.4555,
"mean_token_accuracy": 0.1633812740445137,
"num_tokens": 23028562.0,
"step": 12480
},
{
"entropy": 5.583648824691773,
"epoch": 1.048897290485192,
"grad_norm": 1.546875,
"learning_rate": 0.0004895685965352898,
"loss": 5.4124,
"mean_token_accuracy": 0.1627751737833023,
"num_tokens": 23037687.0,
"step": 12485
},
{
"entropy": 5.637997770309449,
"epoch": 1.049317370300357,
"grad_norm": 1.2890625,
"learning_rate": 0.0004895595820582601,
"loss": 5.3911,
"mean_token_accuracy": 0.16933690309524535,
"num_tokens": 23047475.0,
"step": 12490
},
{
"entropy": 5.622006130218506,
"epoch": 1.0497374501155219,
"grad_norm": 1.5,
"learning_rate": 0.0004895505637804177,
"loss": 5.4334,
"mean_token_accuracy": 0.164679816365242,
"num_tokens": 23057475.0,
"step": 12495
},
{
"entropy": 5.546265506744385,
"epoch": 1.050157529930687,
"grad_norm": 1.484375,
"learning_rate": 0.0004895415417019227,
"loss": 5.3892,
"mean_token_accuracy": 0.16429776102304458,
"num_tokens": 23066419.0,
"step": 12500
},
{
"entropy": 5.677910614013672,
"epoch": 1.0505776097458517,
"grad_norm": 1.4921875,
"learning_rate": 0.0004895325158229346,
"loss": 5.456,
"mean_token_accuracy": 0.16541764587163926,
"num_tokens": 23075516.0,
"step": 12505
},
{
"entropy": 5.615484428405762,
"epoch": 1.0509976895610167,
"grad_norm": 1.5703125,
"learning_rate": 0.0004895234861436136,
"loss": 5.345,
"mean_token_accuracy": 0.1746780276298523,
"num_tokens": 23084132.0,
"step": 12510
},
{
"entropy": 5.671827888488769,
"epoch": 1.0514177693761815,
"grad_norm": 1.5703125,
"learning_rate": 0.0004895144526641194,
"loss": 5.4084,
"mean_token_accuracy": 0.16155407577753067,
"num_tokens": 23093958.0,
"step": 12515
},
{
"entropy": 5.699568223953247,
"epoch": 1.0518378491913463,
"grad_norm": 1.8046875,
"learning_rate": 0.0004895054153846123,
"loss": 5.452,
"mean_token_accuracy": 0.15948740541934966,
"num_tokens": 23103524.0,
"step": 12520
},
{
"entropy": 5.5832483768463135,
"epoch": 1.0522579290065113,
"grad_norm": 1.4609375,
"learning_rate": 0.0004894963743052521,
"loss": 5.366,
"mean_token_accuracy": 0.1638748675584793,
"num_tokens": 23112445.0,
"step": 12525
},
{
"entropy": 5.661851692199707,
"epoch": 1.052678008821676,
"grad_norm": 1.4453125,
"learning_rate": 0.0004894873294261991,
"loss": 5.428,
"mean_token_accuracy": 0.16422830671072006,
"num_tokens": 23121299.0,
"step": 12530
},
{
"entropy": 5.722447490692138,
"epoch": 1.053098088636841,
"grad_norm": 1.6640625,
"learning_rate": 0.0004894782807476134,
"loss": 5.4326,
"mean_token_accuracy": 0.16310823261737822,
"num_tokens": 23130260.0,
"step": 12535
},
{
"entropy": 5.64654655456543,
"epoch": 1.0535181684520059,
"grad_norm": 1.390625,
"learning_rate": 0.0004894692282696555,
"loss": 5.3685,
"mean_token_accuracy": 0.16636543869972228,
"num_tokens": 23139335.0,
"step": 12540
},
{
"entropy": 5.5218949794769285,
"epoch": 1.0539382482671709,
"grad_norm": 1.59375,
"learning_rate": 0.0004894601719924857,
"loss": 5.3766,
"mean_token_accuracy": 0.17038656622171403,
"num_tokens": 23149299.0,
"step": 12545
},
{
"entropy": 5.543263053894043,
"epoch": 1.0543583280823356,
"grad_norm": 1.34375,
"learning_rate": 0.0004894511119162644,
"loss": 5.3341,
"mean_token_accuracy": 0.17221007347106934,
"num_tokens": 23158651.0,
"step": 12550
},
{
"entropy": 5.678027200698852,
"epoch": 1.0547784078975004,
"grad_norm": 1.390625,
"learning_rate": 0.000489442048041152,
"loss": 5.4296,
"mean_token_accuracy": 0.15821332037448882,
"num_tokens": 23167629.0,
"step": 12555
},
{
"entropy": 5.635254526138306,
"epoch": 1.0551984877126654,
"grad_norm": 1.625,
"learning_rate": 0.0004894329803673092,
"loss": 5.416,
"mean_token_accuracy": 0.16229169964790344,
"num_tokens": 23177026.0,
"step": 12560
},
{
"entropy": 5.622007846832275,
"epoch": 1.0556185675278302,
"grad_norm": 1.65625,
"learning_rate": 0.0004894239088948964,
"loss": 5.3761,
"mean_token_accuracy": 0.16643615067005157,
"num_tokens": 23185297.0,
"step": 12565
},
{
"entropy": 5.585867691040039,
"epoch": 1.0560386473429952,
"grad_norm": 1.625,
"learning_rate": 0.0004894148336240747,
"loss": 5.3981,
"mean_token_accuracy": 0.16961814016103743,
"num_tokens": 23194804.0,
"step": 12570
},
{
"entropy": 5.6445718765258786,
"epoch": 1.05645872715816,
"grad_norm": 1.375,
"learning_rate": 0.0004894057545550045,
"loss": 5.4304,
"mean_token_accuracy": 0.1647735506296158,
"num_tokens": 23205063.0,
"step": 12575
},
{
"entropy": 5.633014106750489,
"epoch": 1.056878806973325,
"grad_norm": 1.84375,
"learning_rate": 0.0004893966716878467,
"loss": 5.3754,
"mean_token_accuracy": 0.16172896772623063,
"num_tokens": 23215038.0,
"step": 12580
},
{
"entropy": 5.695857429504395,
"epoch": 1.0572988867884898,
"grad_norm": 1.4609375,
"learning_rate": 0.0004893875850227624,
"loss": 5.5218,
"mean_token_accuracy": 0.16041227877140046,
"num_tokens": 23223530.0,
"step": 12585
},
{
"entropy": 5.65571699142456,
"epoch": 1.0577189666036546,
"grad_norm": 1.5,
"learning_rate": 0.0004893784945599124,
"loss": 5.4589,
"mean_token_accuracy": 0.16575109511613845,
"num_tokens": 23232547.0,
"step": 12590
},
{
"entropy": 5.610851764678955,
"epoch": 1.0581390464188196,
"grad_norm": 1.7265625,
"learning_rate": 0.0004893694002994577,
"loss": 5.4865,
"mean_token_accuracy": 0.16560802906751632,
"num_tokens": 23241305.0,
"step": 12595
},
{
"entropy": 5.7720519542694095,
"epoch": 1.0585591262339844,
"grad_norm": 1.4140625,
"learning_rate": 0.0004893603022415595,
"loss": 5.5079,
"mean_token_accuracy": 0.16662406772375107,
"num_tokens": 23250708.0,
"step": 12600
},
{
"entropy": 5.691384363174438,
"epoch": 1.0589792060491494,
"grad_norm": 1.5390625,
"learning_rate": 0.0004893512003863788,
"loss": 5.4175,
"mean_token_accuracy": 0.15989451110363007,
"num_tokens": 23260161.0,
"step": 12605
},
{
"entropy": 5.570341491699219,
"epoch": 1.0593992858643142,
"grad_norm": 1.40625,
"learning_rate": 0.0004893420947340771,
"loss": 5.3169,
"mean_token_accuracy": 0.16430822163820266,
"num_tokens": 23268932.0,
"step": 12610
},
{
"entropy": 5.551045656204224,
"epoch": 1.0598193656794792,
"grad_norm": 2.046875,
"learning_rate": 0.0004893329852848155,
"loss": 5.4217,
"mean_token_accuracy": 0.1701897993683815,
"num_tokens": 23277741.0,
"step": 12615
},
{
"entropy": 5.573561477661133,
"epoch": 1.060239445494644,
"grad_norm": 1.484375,
"learning_rate": 0.0004893238720387555,
"loss": 5.4199,
"mean_token_accuracy": 0.16308623701334,
"num_tokens": 23286982.0,
"step": 12620
},
{
"entropy": 5.668636322021484,
"epoch": 1.0606595253098088,
"grad_norm": 1.609375,
"learning_rate": 0.0004893147549960584,
"loss": 5.3587,
"mean_token_accuracy": 0.16592552214860917,
"num_tokens": 23296902.0,
"step": 12625
},
{
"entropy": 5.609336996078492,
"epoch": 1.0610796051249738,
"grad_norm": 1.796875,
"learning_rate": 0.0004893056341568857,
"loss": 5.3749,
"mean_token_accuracy": 0.174351204931736,
"num_tokens": 23305443.0,
"step": 12630
},
{
"entropy": 5.561859083175659,
"epoch": 1.0614996849401386,
"grad_norm": 1.53125,
"learning_rate": 0.0004892965095213992,
"loss": 5.3316,
"mean_token_accuracy": 0.1718878448009491,
"num_tokens": 23315420.0,
"step": 12635
},
{
"entropy": 5.622679424285889,
"epoch": 1.0619197647553036,
"grad_norm": 1.6875,
"learning_rate": 0.0004892873810897604,
"loss": 5.4359,
"mean_token_accuracy": 0.16049171537160872,
"num_tokens": 23324540.0,
"step": 12640
},
{
"entropy": 5.616107416152954,
"epoch": 1.0623398445704684,
"grad_norm": 1.7421875,
"learning_rate": 0.0004892782488621308,
"loss": 5.3718,
"mean_token_accuracy": 0.1706625148653984,
"num_tokens": 23334282.0,
"step": 12645
},
{
"entropy": 5.657419586181641,
"epoch": 1.0627599243856332,
"grad_norm": 2.109375,
"learning_rate": 0.0004892691128386725,
"loss": 5.3693,
"mean_token_accuracy": 0.16586438268423082,
"num_tokens": 23342836.0,
"step": 12650
},
{
"entropy": 5.64911413192749,
"epoch": 1.0631800042007982,
"grad_norm": 1.578125,
"learning_rate": 0.0004892599730195471,
"loss": 5.3567,
"mean_token_accuracy": 0.1705513373017311,
"num_tokens": 23351863.0,
"step": 12655
},
{
"entropy": 5.720103025436401,
"epoch": 1.063600084015963,
"grad_norm": 1.421875,
"learning_rate": 0.0004892508294049167,
"loss": 5.4967,
"mean_token_accuracy": 0.1687561333179474,
"num_tokens": 23361788.0,
"step": 12660
},
{
"entropy": 5.59014482498169,
"epoch": 1.064020163831128,
"grad_norm": 2.046875,
"learning_rate": 0.0004892416819949431,
"loss": 5.3525,
"mean_token_accuracy": 0.16308819353580475,
"num_tokens": 23370175.0,
"step": 12665
},
{
"entropy": 5.52400312423706,
"epoch": 1.0644402436462927,
"grad_norm": 1.3203125,
"learning_rate": 0.0004892325307897886,
"loss": 5.4032,
"mean_token_accuracy": 0.16833132803440093,
"num_tokens": 23378835.0,
"step": 12670
},
{
"entropy": 5.621381092071533,
"epoch": 1.0648603234614578,
"grad_norm": 1.5,
"learning_rate": 0.0004892233757896149,
"loss": 5.4064,
"mean_token_accuracy": 0.16472738981246948,
"num_tokens": 23389390.0,
"step": 12675
},
{
"entropy": 5.648614740371704,
"epoch": 1.0652804032766225,
"grad_norm": 2.1875,
"learning_rate": 0.0004892142169945845,
"loss": 5.4098,
"mean_token_accuracy": 0.1647957906126976,
"num_tokens": 23398802.0,
"step": 12680
},
{
"entropy": 5.630923318862915,
"epoch": 1.0657004830917876,
"grad_norm": 1.5234375,
"learning_rate": 0.0004892050544048596,
"loss": 5.3867,
"mean_token_accuracy": 0.16518343836069108,
"num_tokens": 23407731.0,
"step": 12685
},
{
"entropy": 5.618378114700318,
"epoch": 1.0661205629069523,
"grad_norm": 1.4609375,
"learning_rate": 0.0004891958880206024,
"loss": 5.4326,
"mean_token_accuracy": 0.16275406628847122,
"num_tokens": 23417046.0,
"step": 12690
},
{
"entropy": 5.613988065719605,
"epoch": 1.0665406427221171,
"grad_norm": 1.40625,
"learning_rate": 0.0004891867178419753,
"loss": 5.4201,
"mean_token_accuracy": 0.1696673572063446,
"num_tokens": 23426107.0,
"step": 12695
},
{
"entropy": 5.662841749191284,
"epoch": 1.0669607225372821,
"grad_norm": 1.4453125,
"learning_rate": 0.0004891775438691408,
"loss": 5.4467,
"mean_token_accuracy": 0.1709218829870224,
"num_tokens": 23435523.0,
"step": 12700
},
{
"entropy": 5.624844980239868,
"epoch": 1.067380802352447,
"grad_norm": 1.609375,
"learning_rate": 0.0004891683661022615,
"loss": 5.4013,
"mean_token_accuracy": 0.17338791787624358,
"num_tokens": 23444185.0,
"step": 12705
},
{
"entropy": 5.768259763717651,
"epoch": 1.067800882167612,
"grad_norm": 1.4609375,
"learning_rate": 0.0004891591845414997,
"loss": 5.6144,
"mean_token_accuracy": 0.14980095773935317,
"num_tokens": 23454100.0,
"step": 12710
},
{
"entropy": 5.727353096008301,
"epoch": 1.0682209619827767,
"grad_norm": 1.421875,
"learning_rate": 0.0004891499991870184,
"loss": 5.4873,
"mean_token_accuracy": 0.16231284141540528,
"num_tokens": 23463415.0,
"step": 12715
},
{
"entropy": 5.651056814193725,
"epoch": 1.0686410417979415,
"grad_norm": 1.5390625,
"learning_rate": 0.00048914081003898,
"loss": 5.3824,
"mean_token_accuracy": 0.16605685353279115,
"num_tokens": 23471515.0,
"step": 12720
},
{
"entropy": 5.669055986404419,
"epoch": 1.0690611216131065,
"grad_norm": 1.4375,
"learning_rate": 0.0004891316170975475,
"loss": 5.4333,
"mean_token_accuracy": 0.16459901630878448,
"num_tokens": 23481696.0,
"step": 12725
},
{
"entropy": 5.720963811874389,
"epoch": 1.0694812014282713,
"grad_norm": 1.7890625,
"learning_rate": 0.0004891224203628836,
"loss": 5.4151,
"mean_token_accuracy": 0.17059700489044188,
"num_tokens": 23490714.0,
"step": 12730
},
{
"entropy": 5.59767165184021,
"epoch": 1.0699012812434363,
"grad_norm": 1.34375,
"learning_rate": 0.0004891132198351514,
"loss": 5.3754,
"mean_token_accuracy": 0.17071119844913482,
"num_tokens": 23500368.0,
"step": 12735
},
{
"entropy": 5.492298889160156,
"epoch": 1.070321361058601,
"grad_norm": 1.6796875,
"learning_rate": 0.0004891040155145137,
"loss": 5.3248,
"mean_token_accuracy": 0.17299245446920394,
"num_tokens": 23508857.0,
"step": 12740
},
{
"entropy": 5.555435657501221,
"epoch": 1.070741440873766,
"grad_norm": 1.46875,
"learning_rate": 0.0004890948074011335,
"loss": 5.3092,
"mean_token_accuracy": 0.17295463085174562,
"num_tokens": 23518128.0,
"step": 12745
},
{
"entropy": 5.670081424713135,
"epoch": 1.071161520688931,
"grad_norm": 1.421875,
"learning_rate": 0.0004890855954951741,
"loss": 5.4175,
"mean_token_accuracy": 0.17026563137769699,
"num_tokens": 23527292.0,
"step": 12750
},
{
"entropy": 5.697415733337403,
"epoch": 1.0715816005040957,
"grad_norm": 1.484375,
"learning_rate": 0.0004890763797967987,
"loss": 5.4211,
"mean_token_accuracy": 0.16530265808105468,
"num_tokens": 23535694.0,
"step": 12755
},
{
"entropy": 5.633253765106201,
"epoch": 1.0720016803192607,
"grad_norm": 1.5,
"learning_rate": 0.0004890671603061704,
"loss": 5.41,
"mean_token_accuracy": 0.16988277137279512,
"num_tokens": 23544766.0,
"step": 12760
},
{
"entropy": 5.59822449684143,
"epoch": 1.0724217601344255,
"grad_norm": 1.5546875,
"learning_rate": 0.0004890579370234526,
"loss": 5.3867,
"mean_token_accuracy": 0.17475670129060744,
"num_tokens": 23554037.0,
"step": 12765
},
{
"entropy": 5.654851055145263,
"epoch": 1.0728418399495905,
"grad_norm": 1.4609375,
"learning_rate": 0.0004890487099488086,
"loss": 5.4579,
"mean_token_accuracy": 0.16293169558048248,
"num_tokens": 23562282.0,
"step": 12770
},
{
"entropy": 5.716164493560791,
"epoch": 1.0732619197647553,
"grad_norm": 1.5390625,
"learning_rate": 0.000489039479082402,
"loss": 5.496,
"mean_token_accuracy": 0.15957446098327638,
"num_tokens": 23571955.0,
"step": 12775
},
{
"entropy": 5.642240190505982,
"epoch": 1.0736819995799203,
"grad_norm": 1.3984375,
"learning_rate": 0.0004890302444243962,
"loss": 5.3917,
"mean_token_accuracy": 0.16422138661146163,
"num_tokens": 23580996.0,
"step": 12780
},
{
"entropy": 5.6740892887115475,
"epoch": 1.074102079395085,
"grad_norm": 1.484375,
"learning_rate": 0.0004890210059749549,
"loss": 5.5027,
"mean_token_accuracy": 0.15336771160364152,
"num_tokens": 23589618.0,
"step": 12785
},
{
"entropy": 5.63664698600769,
"epoch": 1.0745221592102498,
"grad_norm": 1.4921875,
"learning_rate": 0.0004890117637342416,
"loss": 5.34,
"mean_token_accuracy": 0.16738679707050325,
"num_tokens": 23599574.0,
"step": 12790
},
{
"entropy": 5.629761743545532,
"epoch": 1.0749422390254149,
"grad_norm": 1.296875,
"learning_rate": 0.0004890025177024202,
"loss": 5.4003,
"mean_token_accuracy": 0.16000826209783553,
"num_tokens": 23609205.0,
"step": 12795
},
{
"entropy": 5.607318305969239,
"epoch": 1.0753623188405796,
"grad_norm": 1.5078125,
"learning_rate": 0.0004889932678796543,
"loss": 5.4182,
"mean_token_accuracy": 0.15812145173549652,
"num_tokens": 23617554.0,
"step": 12800
},
{
"entropy": 5.6223344802856445,
"epoch": 1.0757823986557447,
"grad_norm": 1.359375,
"learning_rate": 0.0004889840142661078,
"loss": 5.4894,
"mean_token_accuracy": 0.16154404506087303,
"num_tokens": 23626757.0,
"step": 12805
},
{
"entropy": 5.682391214370727,
"epoch": 1.0762024784709094,
"grad_norm": 1.359375,
"learning_rate": 0.0004889747568619447,
"loss": 5.422,
"mean_token_accuracy": 0.16666488647460936,
"num_tokens": 23636111.0,
"step": 12810
},
{
"entropy": 5.675402879714966,
"epoch": 1.0766225582860744,
"grad_norm": 1.328125,
"learning_rate": 0.0004889654956673291,
"loss": 5.413,
"mean_token_accuracy": 0.16374361217021943,
"num_tokens": 23644579.0,
"step": 12815
},
{
"entropy": 5.6301075458526615,
"epoch": 1.0770426381012392,
"grad_norm": 1.421875,
"learning_rate": 0.0004889562306824248,
"loss": 5.3285,
"mean_token_accuracy": 0.16753521859645842,
"num_tokens": 23653263.0,
"step": 12820
},
{
"entropy": 5.493171119689942,
"epoch": 1.077462717916404,
"grad_norm": 1.5234375,
"learning_rate": 0.000488946961907396,
"loss": 5.2889,
"mean_token_accuracy": 0.17843078672885895,
"num_tokens": 23662529.0,
"step": 12825
},
{
"entropy": 5.500955486297608,
"epoch": 1.077882797731569,
"grad_norm": 1.3671875,
"learning_rate": 0.0004889376893424071,
"loss": 5.3314,
"mean_token_accuracy": 0.17671893090009688,
"num_tokens": 23671491.0,
"step": 12830
},
{
"entropy": 5.594093990325928,
"epoch": 1.0783028775467338,
"grad_norm": 1.3984375,
"learning_rate": 0.0004889284129876221,
"loss": 5.3215,
"mean_token_accuracy": 0.17135681062936783,
"num_tokens": 23680121.0,
"step": 12835
},
{
"entropy": 5.652432584762574,
"epoch": 1.0787229573618988,
"grad_norm": 1.4140625,
"learning_rate": 0.0004889191328432054,
"loss": 5.3861,
"mean_token_accuracy": 0.16644245833158494,
"num_tokens": 23689008.0,
"step": 12840
},
{
"entropy": 5.633866977691651,
"epoch": 1.0791430371770636,
"grad_norm": 1.6015625,
"learning_rate": 0.0004889098489093215,
"loss": 5.4145,
"mean_token_accuracy": 0.1661999359726906,
"num_tokens": 23698551.0,
"step": 12845
},
{
"entropy": 5.737457704544068,
"epoch": 1.0795631169922286,
"grad_norm": 1.5625,
"learning_rate": 0.0004889005611861347,
"loss": 5.5652,
"mean_token_accuracy": 0.16424480825662613,
"num_tokens": 23707438.0,
"step": 12850
},
{
"entropy": 5.6380579471588135,
"epoch": 1.0799831968073934,
"grad_norm": 1.6171875,
"learning_rate": 0.0004888912696738096,
"loss": 5.4216,
"mean_token_accuracy": 0.16508191749453544,
"num_tokens": 23715822.0,
"step": 12855
},
{
"entropy": 5.665700483322143,
"epoch": 1.0804032766225582,
"grad_norm": 1.4609375,
"learning_rate": 0.0004888819743725108,
"loss": 5.4439,
"mean_token_accuracy": 0.1673838436603546,
"num_tokens": 23725426.0,
"step": 12860
},
{
"entropy": 5.660395002365112,
"epoch": 1.0808233564377232,
"grad_norm": 1.390625,
"learning_rate": 0.000488872675282403,
"loss": 5.4191,
"mean_token_accuracy": 0.16672058403491974,
"num_tokens": 23735092.0,
"step": 12865
},
{
"entropy": 5.6598817825317385,
"epoch": 1.081243436252888,
"grad_norm": 1.3984375,
"learning_rate": 0.0004888633724036509,
"loss": 5.4285,
"mean_token_accuracy": 0.16880684196949006,
"num_tokens": 23744255.0,
"step": 12870
},
{
"entropy": 5.572845077514648,
"epoch": 1.081663516068053,
"grad_norm": 1.5078125,
"learning_rate": 0.0004888540657364192,
"loss": 5.2793,
"mean_token_accuracy": 0.17684861570596694,
"num_tokens": 23752978.0,
"step": 12875
},
{
"entropy": 5.612951946258545,
"epoch": 1.0820835958832178,
"grad_norm": 1.4140625,
"learning_rate": 0.0004888447552808729,
"loss": 5.3621,
"mean_token_accuracy": 0.16552072912454605,
"num_tokens": 23761051.0,
"step": 12880
},
{
"entropy": 5.67154974937439,
"epoch": 1.0825036756983828,
"grad_norm": 1.546875,
"learning_rate": 0.0004888354410371768,
"loss": 5.4789,
"mean_token_accuracy": 0.15792061686515807,
"num_tokens": 23770818.0,
"step": 12885
},
{
"entropy": 5.746588897705078,
"epoch": 1.0829237555135476,
"grad_norm": 1.4140625,
"learning_rate": 0.000488826123005496,
"loss": 5.4892,
"mean_token_accuracy": 0.16944090873003007,
"num_tokens": 23780597.0,
"step": 12890
},
{
"entropy": 5.60015287399292,
"epoch": 1.0833438353287124,
"grad_norm": 1.3984375,
"learning_rate": 0.0004888168011859957,
"loss": 5.3266,
"mean_token_accuracy": 0.16784373819828033,
"num_tokens": 23790119.0,
"step": 12895
},
{
"entropy": 5.59284782409668,
"epoch": 1.0837639151438774,
"grad_norm": 1.296875,
"learning_rate": 0.0004888074755788407,
"loss": 5.4074,
"mean_token_accuracy": 0.1683492362499237,
"num_tokens": 23798972.0,
"step": 12900
},
{
"entropy": 5.622417688369751,
"epoch": 1.0841839949590422,
"grad_norm": 1.75,
"learning_rate": 0.0004887981461841963,
"loss": 5.3743,
"mean_token_accuracy": 0.17346876859664917,
"num_tokens": 23808685.0,
"step": 12905
},
{
"entropy": 5.64064130783081,
"epoch": 1.0846040747742072,
"grad_norm": 1.453125,
"learning_rate": 0.0004887888130022279,
"loss": 5.3748,
"mean_token_accuracy": 0.16767643988132477,
"num_tokens": 23817721.0,
"step": 12910
},
{
"entropy": 5.5321691036224365,
"epoch": 1.085024154589372,
"grad_norm": 1.8046875,
"learning_rate": 0.0004887794760331008,
"loss": 5.3404,
"mean_token_accuracy": 0.16723188161849975,
"num_tokens": 23826892.0,
"step": 12915
},
{
"entropy": 5.568603563308716,
"epoch": 1.085444234404537,
"grad_norm": 1.7109375,
"learning_rate": 0.0004887701352769804,
"loss": 5.2879,
"mean_token_accuracy": 0.1780572459101677,
"num_tokens": 23835717.0,
"step": 12920
},
{
"entropy": 5.583135890960693,
"epoch": 1.0858643142197018,
"grad_norm": 1.7578125,
"learning_rate": 0.000488760790734032,
"loss": 5.4029,
"mean_token_accuracy": 0.1701557457447052,
"num_tokens": 23845814.0,
"step": 12925
},
{
"entropy": 5.669702625274658,
"epoch": 1.0862843940348665,
"grad_norm": 1.8046875,
"learning_rate": 0.0004887514424044214,
"loss": 5.3864,
"mean_token_accuracy": 0.16005658805370332,
"num_tokens": 23854779.0,
"step": 12930
},
{
"entropy": 5.590376663208008,
"epoch": 1.0867044738500315,
"grad_norm": 1.4609375,
"learning_rate": 0.000488742090288314,
"loss": 5.421,
"mean_token_accuracy": 0.16315811723470688,
"num_tokens": 23863533.0,
"step": 12935
},
{
"entropy": 5.5997072696685795,
"epoch": 1.0871245536651963,
"grad_norm": 1.484375,
"learning_rate": 0.0004887327343858755,
"loss": 5.4511,
"mean_token_accuracy": 0.16439618468284606,
"num_tokens": 23872725.0,
"step": 12940
},
{
"entropy": 5.6145484924316404,
"epoch": 1.0875446334803613,
"grad_norm": 1.53125,
"learning_rate": 0.0004887233746972717,
"loss": 5.4415,
"mean_token_accuracy": 0.16163852512836457,
"num_tokens": 23881799.0,
"step": 12945
},
{
"entropy": 5.652968406677246,
"epoch": 1.0879647132955261,
"grad_norm": 1.546875,
"learning_rate": 0.0004887140112226684,
"loss": 5.4582,
"mean_token_accuracy": 0.16318671107292176,
"num_tokens": 23890628.0,
"step": 12950
},
{
"entropy": 5.651101064682007,
"epoch": 1.088384793110691,
"grad_norm": 1.5625,
"learning_rate": 0.0004887046439622314,
"loss": 5.4392,
"mean_token_accuracy": 0.17248670607805253,
"num_tokens": 23899968.0,
"step": 12955
},
{
"entropy": 5.665786266326904,
"epoch": 1.088804872925856,
"grad_norm": 1.5859375,
"learning_rate": 0.0004886952729161267,
"loss": 5.3178,
"mean_token_accuracy": 0.16974089592695235,
"num_tokens": 23908634.0,
"step": 12960
},
{
"entropy": 5.6458399295806885,
"epoch": 1.0892249527410207,
"grad_norm": 1.515625,
"learning_rate": 0.0004886858980845202,
"loss": 5.4628,
"mean_token_accuracy": 0.16651467829942704,
"num_tokens": 23917925.0,
"step": 12965
},
{
"entropy": 5.533213710784912,
"epoch": 1.0896450325561857,
"grad_norm": 1.515625,
"learning_rate": 0.0004886765194675782,
"loss": 5.3655,
"mean_token_accuracy": 0.17183977067470552,
"num_tokens": 23927173.0,
"step": 12970
},
{
"entropy": 5.6211940288543705,
"epoch": 1.0900651123713505,
"grad_norm": 1.8125,
"learning_rate": 0.0004886671370654665,
"loss": 5.3276,
"mean_token_accuracy": 0.17467257082462312,
"num_tokens": 23936258.0,
"step": 12975
},
{
"entropy": 5.6253503322601315,
"epoch": 1.0904851921865155,
"grad_norm": 1.7734375,
"learning_rate": 0.0004886577508783516,
"loss": 5.3035,
"mean_token_accuracy": 0.17414060682058335,
"num_tokens": 23944215.0,
"step": 12980
},
{
"entropy": 5.640400552749634,
"epoch": 1.0909052720016803,
"grad_norm": 1.546875,
"learning_rate": 0.0004886483609063997,
"loss": 5.3807,
"mean_token_accuracy": 0.16728059202432632,
"num_tokens": 23953151.0,
"step": 12985
},
{
"entropy": 5.515423250198364,
"epoch": 1.0913253518168453,
"grad_norm": 1.5234375,
"learning_rate": 0.0004886389671497769,
"loss": 5.3847,
"mean_token_accuracy": 0.17380398660898208,
"num_tokens": 23962919.0,
"step": 12990
},
{
"entropy": 5.668184280395508,
"epoch": 1.09174543163201,
"grad_norm": 1.4609375,
"learning_rate": 0.00048862956960865,
"loss": 5.3966,
"mean_token_accuracy": 0.16712536364793779,
"num_tokens": 23971900.0,
"step": 12995
},
{
"entropy": 5.6605853080749515,
"epoch": 1.0921655114471749,
"grad_norm": 1.5390625,
"learning_rate": 0.0004886201682831852,
"loss": 5.3723,
"mean_token_accuracy": 0.17093730568885804,
"num_tokens": 23980945.0,
"step": 13000
},
{
"entropy": 5.574681234359741,
"epoch": 1.09258559126234,
"grad_norm": 1.3046875,
"learning_rate": 0.0004886107631735491,
"loss": 5.3296,
"mean_token_accuracy": 0.16935485750436782,
"num_tokens": 23990460.0,
"step": 13005
},
{
"entropy": 5.643135356903076,
"epoch": 1.0930056710775047,
"grad_norm": 2.28125,
"learning_rate": 0.0004886013542799083,
"loss": 5.5015,
"mean_token_accuracy": 0.15477217584848404,
"num_tokens": 23999925.0,
"step": 13010
},
{
"entropy": 5.581091022491455,
"epoch": 1.0934257508926697,
"grad_norm": 1.5234375,
"learning_rate": 0.0004885919416024296,
"loss": 5.3405,
"mean_token_accuracy": 0.16890386044979094,
"num_tokens": 24009039.0,
"step": 13015
},
{
"entropy": 5.655356359481812,
"epoch": 1.0938458307078345,
"grad_norm": 1.515625,
"learning_rate": 0.0004885825251412796,
"loss": 5.3971,
"mean_token_accuracy": 0.17056007087230682,
"num_tokens": 24017725.0,
"step": 13020
},
{
"entropy": 5.652591800689697,
"epoch": 1.0942659105229993,
"grad_norm": 1.8515625,
"learning_rate": 0.0004885731048966252,
"loss": 5.4265,
"mean_token_accuracy": 0.1609223946928978,
"num_tokens": 24027158.0,
"step": 13025
},
{
"entropy": 5.652519559860229,
"epoch": 1.0946859903381643,
"grad_norm": 1.8515625,
"learning_rate": 0.0004885636808686331,
"loss": 5.4738,
"mean_token_accuracy": 0.1661706432700157,
"num_tokens": 24037224.0,
"step": 13030
},
{
"entropy": 5.654554319381714,
"epoch": 1.095106070153329,
"grad_norm": 1.4609375,
"learning_rate": 0.0004885542530574705,
"loss": 5.4282,
"mean_token_accuracy": 0.1631125405430794,
"num_tokens": 24046097.0,
"step": 13035
},
{
"entropy": 5.627096652984619,
"epoch": 1.095526149968494,
"grad_norm": 1.4296875,
"learning_rate": 0.0004885448214633042,
"loss": 5.3062,
"mean_token_accuracy": 0.1716091126203537,
"num_tokens": 24055270.0,
"step": 13040
},
{
"entropy": 5.673101949691772,
"epoch": 1.0959462297836589,
"grad_norm": 1.859375,
"learning_rate": 0.0004885353860863013,
"loss": 5.4825,
"mean_token_accuracy": 0.15423855185508728,
"num_tokens": 24064995.0,
"step": 13045
},
{
"entropy": 5.692544794082641,
"epoch": 1.0963663095988239,
"grad_norm": 1.8515625,
"learning_rate": 0.000488525946926629,
"loss": 5.5296,
"mean_token_accuracy": 0.15968327820301056,
"num_tokens": 24075523.0,
"step": 13050
},
{
"entropy": 5.633663845062256,
"epoch": 1.0967863894139886,
"grad_norm": 1.4375,
"learning_rate": 0.0004885165039844545,
"loss": 5.4164,
"mean_token_accuracy": 0.17032116651535034,
"num_tokens": 24084933.0,
"step": 13055
},
{
"entropy": 5.626227521896363,
"epoch": 1.0972064692291534,
"grad_norm": 1.65625,
"learning_rate": 0.0004885070572599452,
"loss": 5.436,
"mean_token_accuracy": 0.15822404623031616,
"num_tokens": 24093964.0,
"step": 13060
},
{
"entropy": 5.616829252243042,
"epoch": 1.0976265490443184,
"grad_norm": 1.3828125,
"learning_rate": 0.0004884976067532681,
"loss": 5.3764,
"mean_token_accuracy": 0.15617975145578383,
"num_tokens": 24103951.0,
"step": 13065
},
{
"entropy": 5.588425588607788,
"epoch": 1.0980466288594832,
"grad_norm": 1.4296875,
"learning_rate": 0.000488488152464591,
"loss": 5.4879,
"mean_token_accuracy": 0.15913376361131668,
"num_tokens": 24113392.0,
"step": 13070
},
{
"entropy": 5.615650415420532,
"epoch": 1.0984667086746482,
"grad_norm": 1.609375,
"learning_rate": 0.0004884786943940812,
"loss": 5.3558,
"mean_token_accuracy": 0.16624071300029755,
"num_tokens": 24123165.0,
"step": 13075
},
{
"entropy": 5.608291578292847,
"epoch": 1.098886788489813,
"grad_norm": 1.375,
"learning_rate": 0.0004884692325419063,
"loss": 5.3988,
"mean_token_accuracy": 0.16643917858600615,
"num_tokens": 24132176.0,
"step": 13080
},
{
"entropy": 5.581974601745605,
"epoch": 1.099306868304978,
"grad_norm": 1.3828125,
"learning_rate": 0.0004884597669082336,
"loss": 5.458,
"mean_token_accuracy": 0.1615513488650322,
"num_tokens": 24141737.0,
"step": 13085
},
{
"entropy": 5.628212356567383,
"epoch": 1.0997269481201428,
"grad_norm": 1.40625,
"learning_rate": 0.0004884502974932313,
"loss": 5.3815,
"mean_token_accuracy": 0.16735443249344825,
"num_tokens": 24150477.0,
"step": 13090
},
{
"entropy": 5.739729356765747,
"epoch": 1.1001470279353076,
"grad_norm": 1.4453125,
"learning_rate": 0.0004884408242970668,
"loss": 5.4863,
"mean_token_accuracy": 0.1622896149754524,
"num_tokens": 24158739.0,
"step": 13095
},
{
"entropy": 5.5542552947998045,
"epoch": 1.1005671077504726,
"grad_norm": 1.4140625,
"learning_rate": 0.0004884313473199081,
"loss": 5.3319,
"mean_token_accuracy": 0.17589967846870422,
"num_tokens": 24167511.0,
"step": 13100
},
{
"entropy": 5.561865520477295,
"epoch": 1.1009871875656374,
"grad_norm": 1.4140625,
"learning_rate": 0.0004884218665619229,
"loss": 5.3163,
"mean_token_accuracy": 0.16600671410560608,
"num_tokens": 24176413.0,
"step": 13105
},
{
"entropy": 5.584618663787841,
"epoch": 1.1014072673808024,
"grad_norm": 1.578125,
"learning_rate": 0.0004884123820232792,
"loss": 5.2803,
"mean_token_accuracy": 0.17590437978506088,
"num_tokens": 24185135.0,
"step": 13110
},
{
"entropy": 5.556184577941894,
"epoch": 1.1018273471959672,
"grad_norm": 1.359375,
"learning_rate": 0.0004884028937041451,
"loss": 5.3612,
"mean_token_accuracy": 0.17218423038721084,
"num_tokens": 24193273.0,
"step": 13115
},
{
"entropy": 5.674712181091309,
"epoch": 1.1022474270111322,
"grad_norm": 1.34375,
"learning_rate": 0.0004883934016046886,
"loss": 5.4296,
"mean_token_accuracy": 0.16387299448251724,
"num_tokens": 24202509.0,
"step": 13120
},
{
"entropy": 5.638438415527344,
"epoch": 1.102667506826297,
"grad_norm": 1.484375,
"learning_rate": 0.000488383905725078,
"loss": 5.4361,
"mean_token_accuracy": 0.15972956120967866,
"num_tokens": 24212644.0,
"step": 13125
},
{
"entropy": 5.581481313705444,
"epoch": 1.1030875866414618,
"grad_norm": 1.3984375,
"learning_rate": 0.0004883744060654811,
"loss": 5.3231,
"mean_token_accuracy": 0.166090789437294,
"num_tokens": 24221838.0,
"step": 13130
},
{
"entropy": 5.599612712860107,
"epoch": 1.1035076664566268,
"grad_norm": 1.3359375,
"learning_rate": 0.0004883649026260667,
"loss": 5.4107,
"mean_token_accuracy": 0.1680230289697647,
"num_tokens": 24230987.0,
"step": 13135
},
{
"entropy": 5.583003854751587,
"epoch": 1.1039277462717916,
"grad_norm": 1.3671875,
"learning_rate": 0.0004883553954070028,
"loss": 5.3501,
"mean_token_accuracy": 0.16952130943536758,
"num_tokens": 24240523.0,
"step": 13140
},
{
"entropy": 5.640346050262451,
"epoch": 1.1043478260869566,
"grad_norm": 1.5078125,
"learning_rate": 0.000488345884408458,
"loss": 5.4524,
"mean_token_accuracy": 0.17038542330265044,
"num_tokens": 24249799.0,
"step": 13145
},
{
"entropy": 5.63459324836731,
"epoch": 1.1047679059021214,
"grad_norm": 1.4375,
"learning_rate": 0.0004883363696306007,
"loss": 5.3915,
"mean_token_accuracy": 0.17235672175884248,
"num_tokens": 24259361.0,
"step": 13150
},
{
"entropy": 5.598242950439453,
"epoch": 1.1051879857172864,
"grad_norm": 1.59375,
"learning_rate": 0.0004883268510735995,
"loss": 5.3733,
"mean_token_accuracy": 0.16741275489330293,
"num_tokens": 24268010.0,
"step": 13155
},
{
"entropy": 5.520478200912476,
"epoch": 1.1056080655324512,
"grad_norm": 1.609375,
"learning_rate": 0.0004883173287376229,
"loss": 5.4047,
"mean_token_accuracy": 0.1646900400519371,
"num_tokens": 24277416.0,
"step": 13160
},
{
"entropy": 5.688388013839722,
"epoch": 1.106028145347616,
"grad_norm": 1.6484375,
"learning_rate": 0.0004883078026228397,
"loss": 5.4788,
"mean_token_accuracy": 0.16411201059818267,
"num_tokens": 24286185.0,
"step": 13165
},
{
"entropy": 5.653516483306885,
"epoch": 1.106448225162781,
"grad_norm": 1.359375,
"learning_rate": 0.0004882982727294187,
"loss": 5.3392,
"mean_token_accuracy": 0.16574593335390092,
"num_tokens": 24295382.0,
"step": 13170
},
{
"entropy": 5.580877017974854,
"epoch": 1.1068683049779457,
"grad_norm": 1.53125,
"learning_rate": 0.0004882887390575284,
"loss": 5.3869,
"mean_token_accuracy": 0.16956889778375625,
"num_tokens": 24305197.0,
"step": 13175
},
{
"entropy": 5.642392110824585,
"epoch": 1.1072883847931108,
"grad_norm": 1.4453125,
"learning_rate": 0.0004882792016073381,
"loss": 5.4575,
"mean_token_accuracy": 0.1588519960641861,
"num_tokens": 24314149.0,
"step": 13180
},
{
"entropy": 5.680260229110718,
"epoch": 1.1077084646082755,
"grad_norm": 1.4375,
"learning_rate": 0.00048826966037901655,
"loss": 5.3905,
"mean_token_accuracy": 0.16521805226802827,
"num_tokens": 24323737.0,
"step": 13185
},
{
"entropy": 5.5753613948822025,
"epoch": 1.1081285444234406,
"grad_norm": 1.90625,
"learning_rate": 0.00048826011537273276,
"loss": 5.3475,
"mean_token_accuracy": 0.17049191743135453,
"num_tokens": 24332853.0,
"step": 13190
},
{
"entropy": 5.601475477218628,
"epoch": 1.1085486242386053,
"grad_norm": 1.640625,
"learning_rate": 0.0004882505665886558,
"loss": 5.4938,
"mean_token_accuracy": 0.16070522665977477,
"num_tokens": 24342632.0,
"step": 13195
},
{
"entropy": 5.597259950637818,
"epoch": 1.1089687040537701,
"grad_norm": 1.34375,
"learning_rate": 0.00048824101402695493,
"loss": 5.3294,
"mean_token_accuracy": 0.16487249583005906,
"num_tokens": 24351659.0,
"step": 13200
},
{
"entropy": 5.573502779006958,
"epoch": 1.1093887838689351,
"grad_norm": 1.3359375,
"learning_rate": 0.0004882314576877993,
"loss": 5.3573,
"mean_token_accuracy": 0.16396226584911347,
"num_tokens": 24360938.0,
"step": 13205
},
{
"entropy": 5.590566873550415,
"epoch": 1.1098088636841,
"grad_norm": 1.265625,
"learning_rate": 0.0004882218975713581,
"loss": 5.4325,
"mean_token_accuracy": 0.1662053346633911,
"num_tokens": 24369603.0,
"step": 13210
},
{
"entropy": 5.55758228302002,
"epoch": 1.110228943499265,
"grad_norm": 1.4609375,
"learning_rate": 0.0004882123336778009,
"loss": 5.3622,
"mean_token_accuracy": 0.16560360342264174,
"num_tokens": 24377605.0,
"step": 13215
},
{
"entropy": 5.629861640930176,
"epoch": 1.1106490233144297,
"grad_norm": 2.109375,
"learning_rate": 0.0004882027660072969,
"loss": 5.411,
"mean_token_accuracy": 0.16419745087623597,
"num_tokens": 24386930.0,
"step": 13220
},
{
"entropy": 5.585262393951416,
"epoch": 1.1110691031295947,
"grad_norm": 1.625,
"learning_rate": 0.0004881931945600157,
"loss": 5.3849,
"mean_token_accuracy": 0.17502808570861816,
"num_tokens": 24396473.0,
"step": 13225
},
{
"entropy": 5.652954816818237,
"epoch": 1.1114891829447595,
"grad_norm": 1.421875,
"learning_rate": 0.0004881836193361269,
"loss": 5.4586,
"mean_token_accuracy": 0.1681210294365883,
"num_tokens": 24405461.0,
"step": 13230
},
{
"entropy": 5.652054214477539,
"epoch": 1.1119092627599243,
"grad_norm": 1.328125,
"learning_rate": 0.0004881740403358,
"loss": 5.4116,
"mean_token_accuracy": 0.1679125174880028,
"num_tokens": 24414138.0,
"step": 13235
},
{
"entropy": 5.619665193557739,
"epoch": 1.1123293425750893,
"grad_norm": 1.5625,
"learning_rate": 0.00048816445755920474,
"loss": 5.4036,
"mean_token_accuracy": 0.17271453738212586,
"num_tokens": 24423386.0,
"step": 13240
},
{
"entropy": 5.601930093765259,
"epoch": 1.112749422390254,
"grad_norm": 1.3828125,
"learning_rate": 0.0004881548710065109,
"loss": 5.3854,
"mean_token_accuracy": 0.16733854860067368,
"num_tokens": 24433637.0,
"step": 13245
},
{
"entropy": 5.606436014175415,
"epoch": 1.113169502205419,
"grad_norm": 1.4296875,
"learning_rate": 0.0004881452806778883,
"loss": 5.4658,
"mean_token_accuracy": 0.1666820153594017,
"num_tokens": 24443677.0,
"step": 13250
},
{
"entropy": 5.591290473937988,
"epoch": 1.113589582020584,
"grad_norm": 1.4296875,
"learning_rate": 0.00048813568657350676,
"loss": 5.3565,
"mean_token_accuracy": 0.17067422717809677,
"num_tokens": 24452317.0,
"step": 13255
},
{
"entropy": 5.629110860824585,
"epoch": 1.1140096618357487,
"grad_norm": 1.5546875,
"learning_rate": 0.0004881260886935363,
"loss": 5.3715,
"mean_token_accuracy": 0.16931205689907075,
"num_tokens": 24460626.0,
"step": 13260
},
{
"entropy": 5.678458547592163,
"epoch": 1.1144297416509137,
"grad_norm": 1.6953125,
"learning_rate": 0.00048811648703814693,
"loss": 5.4479,
"mean_token_accuracy": 0.15688713639974594,
"num_tokens": 24469583.0,
"step": 13265
},
{
"entropy": 5.631380414962768,
"epoch": 1.1148498214660785,
"grad_norm": 1.40625,
"learning_rate": 0.0004881068816075087,
"loss": 5.401,
"mean_token_accuracy": 0.16380657255649567,
"num_tokens": 24478811.0,
"step": 13270
},
{
"entropy": 5.622896528244018,
"epoch": 1.1152699012812435,
"grad_norm": 1.671875,
"learning_rate": 0.00048809727240179193,
"loss": 5.4255,
"mean_token_accuracy": 0.1649166464805603,
"num_tokens": 24487818.0,
"step": 13275
},
{
"entropy": 5.600995302200317,
"epoch": 1.1156899810964083,
"grad_norm": 1.390625,
"learning_rate": 0.0004880876594211665,
"loss": 5.4096,
"mean_token_accuracy": 0.16489541977643968,
"num_tokens": 24497087.0,
"step": 13280
},
{
"entropy": 5.6360047340393065,
"epoch": 1.1161100609115733,
"grad_norm": 1.5703125,
"learning_rate": 0.00048807804266580304,
"loss": 5.3632,
"mean_token_accuracy": 0.16394616216421126,
"num_tokens": 24505347.0,
"step": 13285
},
{
"entropy": 5.695936107635498,
"epoch": 1.116530140726738,
"grad_norm": 1.3203125,
"learning_rate": 0.0004880684221358717,
"loss": 5.3831,
"mean_token_accuracy": 0.1704442098736763,
"num_tokens": 24514732.0,
"step": 13290
},
{
"entropy": 5.634682989120483,
"epoch": 1.116950220541903,
"grad_norm": 1.3359375,
"learning_rate": 0.00048805879783154305,
"loss": 5.4159,
"mean_token_accuracy": 0.1660299375653267,
"num_tokens": 24523295.0,
"step": 13295
},
{
"entropy": 5.53900842666626,
"epoch": 1.1173703003570679,
"grad_norm": 1.2421875,
"learning_rate": 0.00048804916975298744,
"loss": 5.3212,
"mean_token_accuracy": 0.1719472512602806,
"num_tokens": 24532415.0,
"step": 13300
},
{
"entropy": 5.6734466552734375,
"epoch": 1.1177903801722326,
"grad_norm": 1.2890625,
"learning_rate": 0.0004880395379003755,
"loss": 5.4459,
"mean_token_accuracy": 0.16593600958585739,
"num_tokens": 24541856.0,
"step": 13305
},
{
"entropy": 5.593643999099731,
"epoch": 1.1182104599873977,
"grad_norm": 1.390625,
"learning_rate": 0.00048802990227387797,
"loss": 5.4327,
"mean_token_accuracy": 0.16149331778287887,
"num_tokens": 24550982.0,
"step": 13310
},
{
"entropy": 5.648898267745972,
"epoch": 1.1186305398025624,
"grad_norm": 1.3125,
"learning_rate": 0.00048802026287366525,
"loss": 5.5156,
"mean_token_accuracy": 0.15624148845672609,
"num_tokens": 24561176.0,
"step": 13315
},
{
"entropy": 5.700360822677612,
"epoch": 1.1190506196177274,
"grad_norm": 1.3671875,
"learning_rate": 0.00048801061969990834,
"loss": 5.3903,
"mean_token_accuracy": 0.16208946257829665,
"num_tokens": 24570741.0,
"step": 13320
},
{
"entropy": 5.6200910091400145,
"epoch": 1.1194706994328922,
"grad_norm": 1.2890625,
"learning_rate": 0.00048800097275277795,
"loss": 5.3888,
"mean_token_accuracy": 0.16930052489042283,
"num_tokens": 24580175.0,
"step": 13325
},
{
"entropy": 5.646196603775024,
"epoch": 1.119890779248057,
"grad_norm": 2.140625,
"learning_rate": 0.000487991322032445,
"loss": 5.3924,
"mean_token_accuracy": 0.17066533267498016,
"num_tokens": 24588754.0,
"step": 13330
},
{
"entropy": 5.754522705078125,
"epoch": 1.120310859063222,
"grad_norm": 1.28125,
"learning_rate": 0.0004879816675390805,
"loss": 5.5695,
"mean_token_accuracy": 0.1605545163154602,
"num_tokens": 24599429.0,
"step": 13335
},
{
"entropy": 5.567851495742798,
"epoch": 1.1207309388783868,
"grad_norm": 1.4296875,
"learning_rate": 0.00048797200927285547,
"loss": 5.3155,
"mean_token_accuracy": 0.16831441819667817,
"num_tokens": 24608767.0,
"step": 13340
},
{
"entropy": 5.608456087112427,
"epoch": 1.1211510186935518,
"grad_norm": 1.40625,
"learning_rate": 0.0004879623472339409,
"loss": 5.4713,
"mean_token_accuracy": 0.16665953397750854,
"num_tokens": 24618232.0,
"step": 13345
},
{
"entropy": 5.660943508148193,
"epoch": 1.1215710985087166,
"grad_norm": 1.3046875,
"learning_rate": 0.000487952681422508,
"loss": 5.3619,
"mean_token_accuracy": 0.16547955572605133,
"num_tokens": 24626986.0,
"step": 13350
},
{
"entropy": 5.511749696731568,
"epoch": 1.1219911783238816,
"grad_norm": 1.34375,
"learning_rate": 0.000487943011838728,
"loss": 5.2437,
"mean_token_accuracy": 0.17598886042833328,
"num_tokens": 24635283.0,
"step": 13355
},
{
"entropy": 5.4977783203125,
"epoch": 1.1224112581390464,
"grad_norm": 1.5,
"learning_rate": 0.0004879333384827722,
"loss": 5.3552,
"mean_token_accuracy": 0.16658277064561844,
"num_tokens": 24644451.0,
"step": 13360
},
{
"entropy": 5.711081838607788,
"epoch": 1.1228313379542114,
"grad_norm": 1.2734375,
"learning_rate": 0.0004879236613548119,
"loss": 5.4771,
"mean_token_accuracy": 0.16327747851610183,
"num_tokens": 24654811.0,
"step": 13365
},
{
"entropy": 5.649907445907592,
"epoch": 1.1232514177693762,
"grad_norm": 1.3671875,
"learning_rate": 0.0004879139804550187,
"loss": 5.4013,
"mean_token_accuracy": 0.16892933398485183,
"num_tokens": 24663712.0,
"step": 13370
},
{
"entropy": 5.661358642578125,
"epoch": 1.123671497584541,
"grad_norm": 1.375,
"learning_rate": 0.00048790429578356387,
"loss": 5.5134,
"mean_token_accuracy": 0.16258434057235718,
"num_tokens": 24672518.0,
"step": 13375
},
{
"entropy": 5.632959175109863,
"epoch": 1.124091577399706,
"grad_norm": 1.296875,
"learning_rate": 0.00048789460734061915,
"loss": 5.3936,
"mean_token_accuracy": 0.16919077038764954,
"num_tokens": 24681900.0,
"step": 13380
},
{
"entropy": 5.604445552825927,
"epoch": 1.1245116572148708,
"grad_norm": 1.296875,
"learning_rate": 0.0004878849151263561,
"loss": 5.379,
"mean_token_accuracy": 0.17044458240270616,
"num_tokens": 24691760.0,
"step": 13385
},
{
"entropy": 5.671233224868774,
"epoch": 1.1249317370300358,
"grad_norm": 1.5234375,
"learning_rate": 0.0004878752191409463,
"loss": 5.3569,
"mean_token_accuracy": 0.17419452518224715,
"num_tokens": 24700742.0,
"step": 13390
},
{
"entropy": 5.60303225517273,
"epoch": 1.1253518168452006,
"grad_norm": 1.265625,
"learning_rate": 0.0004878655193845616,
"loss": 5.4102,
"mean_token_accuracy": 0.16591497808694838,
"num_tokens": 24709329.0,
"step": 13395
},
{
"entropy": 5.610186529159546,
"epoch": 1.1257718966603654,
"grad_norm": 1.6640625,
"learning_rate": 0.00048785581585737394,
"loss": 5.5499,
"mean_token_accuracy": 0.160182985663414,
"num_tokens": 24718475.0,
"step": 13400
},
{
"entropy": 5.656254863739013,
"epoch": 1.1261919764755304,
"grad_norm": 1.265625,
"learning_rate": 0.000487846108559555,
"loss": 5.4206,
"mean_token_accuracy": 0.17357761263847352,
"num_tokens": 24727817.0,
"step": 13405
},
{
"entropy": 5.617264366149902,
"epoch": 1.1266120562906952,
"grad_norm": 1.4921875,
"learning_rate": 0.00048783639749127694,
"loss": 5.4067,
"mean_token_accuracy": 0.164626245200634,
"num_tokens": 24737057.0,
"step": 13410
},
{
"entropy": 5.627209663391113,
"epoch": 1.1270321361058602,
"grad_norm": 1.3515625,
"learning_rate": 0.0004878266826527116,
"loss": 5.446,
"mean_token_accuracy": 0.15796937346458434,
"num_tokens": 24746016.0,
"step": 13415
},
{
"entropy": 5.673505258560181,
"epoch": 1.127452215921025,
"grad_norm": 1.3828125,
"learning_rate": 0.00048781696404403126,
"loss": 5.4348,
"mean_token_accuracy": 0.16286305636167525,
"num_tokens": 24755978.0,
"step": 13420
},
{
"entropy": 5.607972860336304,
"epoch": 1.12787229573619,
"grad_norm": 1.4453125,
"learning_rate": 0.00048780724166540794,
"loss": 5.3502,
"mean_token_accuracy": 0.17005843669176102,
"num_tokens": 24765255.0,
"step": 13425
},
{
"entropy": 5.568961668014526,
"epoch": 1.1282923755513548,
"grad_norm": 1.2890625,
"learning_rate": 0.0004877975155170139,
"loss": 5.4052,
"mean_token_accuracy": 0.16267422288656236,
"num_tokens": 24774339.0,
"step": 13430
},
{
"entropy": 5.621396780014038,
"epoch": 1.1287124553665198,
"grad_norm": 1.4921875,
"learning_rate": 0.0004877877855990215,
"loss": 5.4085,
"mean_token_accuracy": 0.16280805170536042,
"num_tokens": 24783236.0,
"step": 13435
},
{
"entropy": 5.57719235420227,
"epoch": 1.1291325351816845,
"grad_norm": 1.46875,
"learning_rate": 0.000487778051911603,
"loss": 5.3125,
"mean_token_accuracy": 0.17248792499303817,
"num_tokens": 24792168.0,
"step": 13440
},
{
"entropy": 5.6991523742675785,
"epoch": 1.1295526149968493,
"grad_norm": 1.3671875,
"learning_rate": 0.0004877683144549308,
"loss": 5.4719,
"mean_token_accuracy": 0.1690712794661522,
"num_tokens": 24800843.0,
"step": 13445
},
{
"entropy": 5.594174861907959,
"epoch": 1.1299726948120143,
"grad_norm": 1.6484375,
"learning_rate": 0.00048775857322917753,
"loss": 5.357,
"mean_token_accuracy": 0.1660827800631523,
"num_tokens": 24810475.0,
"step": 13450
},
{
"entropy": 5.524617290496826,
"epoch": 1.1303927746271791,
"grad_norm": 1.234375,
"learning_rate": 0.0004877488282345158,
"loss": 5.4315,
"mean_token_accuracy": 0.16663852632045745,
"num_tokens": 24820486.0,
"step": 13455
},
{
"entropy": 5.7005352020263675,
"epoch": 1.1308128544423441,
"grad_norm": 1.734375,
"learning_rate": 0.000487739079471118,
"loss": 5.4918,
"mean_token_accuracy": 0.16643402948975564,
"num_tokens": 24830243.0,
"step": 13460
},
{
"entropy": 5.7090178489685055,
"epoch": 1.131232934257509,
"grad_norm": 1.2578125,
"learning_rate": 0.000487729326939157,
"loss": 5.4034,
"mean_token_accuracy": 0.1664557173848152,
"num_tokens": 24839090.0,
"step": 13465
},
{
"entropy": 5.620492267608642,
"epoch": 1.1316530140726737,
"grad_norm": 1.5390625,
"learning_rate": 0.00048771957063880553,
"loss": 5.3924,
"mean_token_accuracy": 0.163766011595726,
"num_tokens": 24847933.0,
"step": 13470
},
{
"entropy": 5.647629690170288,
"epoch": 1.1320730938878387,
"grad_norm": 1.5234375,
"learning_rate": 0.0004877098105702363,
"loss": 5.3983,
"mean_token_accuracy": 0.16802556663751603,
"num_tokens": 24857037.0,
"step": 13475
},
{
"entropy": 5.5166588306427,
"epoch": 1.1324931737030035,
"grad_norm": 1.4609375,
"learning_rate": 0.00048770004673362243,
"loss": 5.2202,
"mean_token_accuracy": 0.17570091038942337,
"num_tokens": 24866042.0,
"step": 13480
},
{
"entropy": 5.475602483749389,
"epoch": 1.1329132535181685,
"grad_norm": 1.4765625,
"learning_rate": 0.00048769027912913673,
"loss": 5.1931,
"mean_token_accuracy": 0.17908476293087006,
"num_tokens": 24873735.0,
"step": 13485
},
{
"entropy": 5.45988130569458,
"epoch": 1.1333333333333333,
"grad_norm": 1.625,
"learning_rate": 0.0004876805077569522,
"loss": 5.28,
"mean_token_accuracy": 0.17178382575511933,
"num_tokens": 24882277.0,
"step": 13490
},
{
"entropy": 5.546327066421509,
"epoch": 1.133753413148498,
"grad_norm": 1.34375,
"learning_rate": 0.00048767073261724204,
"loss": 5.3993,
"mean_token_accuracy": 0.16492031663656234,
"num_tokens": 24891354.0,
"step": 13495
},
{
"entropy": 5.649689531326294,
"epoch": 1.134173492963663,
"grad_norm": 1.5546875,
"learning_rate": 0.0004876609537101793,
"loss": 5.387,
"mean_token_accuracy": 0.16514926701784133,
"num_tokens": 24899887.0,
"step": 13500
},
{
"entropy": 5.702745819091797,
"epoch": 1.1345935727788279,
"grad_norm": 1.4296875,
"learning_rate": 0.0004876511710359374,
"loss": 5.4035,
"mean_token_accuracy": 0.16728077232837676,
"num_tokens": 24908616.0,
"step": 13505
},
{
"entropy": 5.617979431152344,
"epoch": 1.135013652593993,
"grad_norm": 2.53125,
"learning_rate": 0.00048764138459468935,
"loss": 5.4362,
"mean_token_accuracy": 0.16735305339097978,
"num_tokens": 24917864.0,
"step": 13510
},
{
"entropy": 5.677032327651977,
"epoch": 1.1354337324091577,
"grad_norm": 1.4375,
"learning_rate": 0.00048763159438660876,
"loss": 5.4697,
"mean_token_accuracy": 0.16366630792617798,
"num_tokens": 24927864.0,
"step": 13515
},
{
"entropy": 5.5547889232635494,
"epoch": 1.1358538122243227,
"grad_norm": 1.46875,
"learning_rate": 0.00048762180041186893,
"loss": 5.3571,
"mean_token_accuracy": 0.17089581340551377,
"num_tokens": 24937146.0,
"step": 13520
},
{
"entropy": 5.657744407653809,
"epoch": 1.1362738920394875,
"grad_norm": 1.515625,
"learning_rate": 0.0004876120026706434,
"loss": 5.4354,
"mean_token_accuracy": 0.1635855630040169,
"num_tokens": 24945694.0,
"step": 13525
},
{
"entropy": 5.61405930519104,
"epoch": 1.1366939718546525,
"grad_norm": 1.4609375,
"learning_rate": 0.0004876022011631057,
"loss": 5.3377,
"mean_token_accuracy": 0.17318065762519835,
"num_tokens": 24955325.0,
"step": 13530
},
{
"entropy": 5.529127979278565,
"epoch": 1.1371140516698173,
"grad_norm": 1.3671875,
"learning_rate": 0.0004875923958894295,
"loss": 5.228,
"mean_token_accuracy": 0.17365142405033113,
"num_tokens": 24964028.0,
"step": 13535
},
{
"entropy": 5.591036605834961,
"epoch": 1.137534131484982,
"grad_norm": 1.40625,
"learning_rate": 0.00048758258684978846,
"loss": 5.413,
"mean_token_accuracy": 0.1684279128909111,
"num_tokens": 24972923.0,
"step": 13540
},
{
"entropy": 5.604875659942627,
"epoch": 1.137954211300147,
"grad_norm": 1.5078125,
"learning_rate": 0.00048757277404435636,
"loss": 5.301,
"mean_token_accuracy": 0.16950101405382156,
"num_tokens": 24982156.0,
"step": 13545
},
{
"entropy": 5.595570707321167,
"epoch": 1.1383742911153119,
"grad_norm": 1.515625,
"learning_rate": 0.000487562957473307,
"loss": 5.3656,
"mean_token_accuracy": 0.16865910440683365,
"num_tokens": 24991616.0,
"step": 13550
},
{
"entropy": 5.593280410766601,
"epoch": 1.1387943709304769,
"grad_norm": 1.328125,
"learning_rate": 0.0004875531371368144,
"loss": 5.4201,
"mean_token_accuracy": 0.16665563136339187,
"num_tokens": 25001140.0,
"step": 13555
},
{
"entropy": 5.606621646881104,
"epoch": 1.1392144507456416,
"grad_norm": 1.4609375,
"learning_rate": 0.00048754331303505236,
"loss": 5.3333,
"mean_token_accuracy": 0.17235686779022216,
"num_tokens": 25010863.0,
"step": 13560
},
{
"entropy": 5.632106494903565,
"epoch": 1.1396345305608064,
"grad_norm": 1.6171875,
"learning_rate": 0.00048753348516819496,
"loss": 5.4277,
"mean_token_accuracy": 0.1671194389462471,
"num_tokens": 25019770.0,
"step": 13565
},
{
"entropy": 5.715825510025025,
"epoch": 1.1400546103759714,
"grad_norm": 1.359375,
"learning_rate": 0.0004875236535364163,
"loss": 5.4702,
"mean_token_accuracy": 0.16148280426859857,
"num_tokens": 25029900.0,
"step": 13570
},
{
"entropy": 5.705228042602539,
"epoch": 1.1404746901911362,
"grad_norm": 1.296875,
"learning_rate": 0.0004875138181398906,
"loss": 5.4363,
"mean_token_accuracy": 0.165011228621006,
"num_tokens": 25039428.0,
"step": 13575
},
{
"entropy": 5.6060789108276365,
"epoch": 1.1408947700063012,
"grad_norm": 1.7734375,
"learning_rate": 0.000487503978978792,
"loss": 5.3975,
"mean_token_accuracy": 0.16259874999523163,
"num_tokens": 25049145.0,
"step": 13580
},
{
"entropy": 5.631357717514038,
"epoch": 1.141314849821466,
"grad_norm": 1.3671875,
"learning_rate": 0.00048749413605329487,
"loss": 5.4398,
"mean_token_accuracy": 0.1695747569203377,
"num_tokens": 25058772.0,
"step": 13585
},
{
"entropy": 5.635422277450561,
"epoch": 1.141734929636631,
"grad_norm": 1.3828125,
"learning_rate": 0.00048748428936357346,
"loss": 5.3524,
"mean_token_accuracy": 0.1705583453178406,
"num_tokens": 25067249.0,
"step": 13590
},
{
"entropy": 5.56702938079834,
"epoch": 1.1421550094517958,
"grad_norm": 1.484375,
"learning_rate": 0.0004874744389098024,
"loss": 5.3114,
"mean_token_accuracy": 0.16375732421875,
"num_tokens": 25076893.0,
"step": 13595
},
{
"entropy": 5.507349300384521,
"epoch": 1.1425750892669608,
"grad_norm": 1.4453125,
"learning_rate": 0.0004874645846921559,
"loss": 5.3208,
"mean_token_accuracy": 0.1698176808655262,
"num_tokens": 25086238.0,
"step": 13600
},
{
"entropy": 5.549192476272583,
"epoch": 1.1429951690821256,
"grad_norm": 1.625,
"learning_rate": 0.00048745472671080884,
"loss": 5.3589,
"mean_token_accuracy": 0.16234881430864334,
"num_tokens": 25095334.0,
"step": 13605
},
{
"entropy": 5.610916471481323,
"epoch": 1.1434152488972904,
"grad_norm": 1.3671875,
"learning_rate": 0.00048744486496593565,
"loss": 5.3356,
"mean_token_accuracy": 0.16913952976465224,
"num_tokens": 25104136.0,
"step": 13610
},
{
"entropy": 5.624828624725342,
"epoch": 1.1438353287124554,
"grad_norm": 1.65625,
"learning_rate": 0.000487434999457711,
"loss": 5.3358,
"mean_token_accuracy": 0.17875144481658936,
"num_tokens": 25112629.0,
"step": 13615
},
{
"entropy": 5.611724948883056,
"epoch": 1.1442554085276202,
"grad_norm": 1.640625,
"learning_rate": 0.0004874251301863098,
"loss": 5.3789,
"mean_token_accuracy": 0.1664452612400055,
"num_tokens": 25121014.0,
"step": 13620
},
{
"entropy": 5.567342233657837,
"epoch": 1.1446754883427852,
"grad_norm": 1.4375,
"learning_rate": 0.00048741525715190675,
"loss": 5.4235,
"mean_token_accuracy": 0.16376064270734786,
"num_tokens": 25130097.0,
"step": 13625
},
{
"entropy": 5.6526110649108885,
"epoch": 1.14509556815795,
"grad_norm": 1.234375,
"learning_rate": 0.0004874053803546769,
"loss": 5.4046,
"mean_token_accuracy": 0.17253290265798568,
"num_tokens": 25139065.0,
"step": 13630
},
{
"entropy": 5.585218048095703,
"epoch": 1.1455156479731148,
"grad_norm": 1.609375,
"learning_rate": 0.000487395499794795,
"loss": 5.3932,
"mean_token_accuracy": 0.16701920330524445,
"num_tokens": 25148852.0,
"step": 13635
},
{
"entropy": 5.5763102054595945,
"epoch": 1.1459357277882798,
"grad_norm": 1.46875,
"learning_rate": 0.0004873856154724362,
"loss": 5.2999,
"mean_token_accuracy": 0.17658825665712358,
"num_tokens": 25157580.0,
"step": 13640
},
{
"entropy": 5.624576044082642,
"epoch": 1.1463558076034446,
"grad_norm": 1.65625,
"learning_rate": 0.0004873757273877756,
"loss": 5.3987,
"mean_token_accuracy": 0.16335485577583314,
"num_tokens": 25166243.0,
"step": 13645
},
{
"entropy": 5.665804243087768,
"epoch": 1.1467758874186096,
"grad_norm": 1.4609375,
"learning_rate": 0.00048736583554098836,
"loss": 5.4097,
"mean_token_accuracy": 0.16634505987167358,
"num_tokens": 25174674.0,
"step": 13650
},
{
"entropy": 5.593222188949585,
"epoch": 1.1471959672337744,
"grad_norm": 1.734375,
"learning_rate": 0.00048735593993224973,
"loss": 5.3078,
"mean_token_accuracy": 0.17730923742055893,
"num_tokens": 25183892.0,
"step": 13655
},
{
"entropy": 5.549776411056518,
"epoch": 1.1476160470489394,
"grad_norm": 1.6640625,
"learning_rate": 0.00048734604056173495,
"loss": 5.3598,
"mean_token_accuracy": 0.16848810911178588,
"num_tokens": 25192731.0,
"step": 13660
},
{
"entropy": 5.613305473327637,
"epoch": 1.1480361268641042,
"grad_norm": 1.5,
"learning_rate": 0.00048733613742961933,
"loss": 5.4558,
"mean_token_accuracy": 0.16849016547203063,
"num_tokens": 25201280.0,
"step": 13665
},
{
"entropy": 5.5975159168243405,
"epoch": 1.1484562066792692,
"grad_norm": 1.5859375,
"learning_rate": 0.00048732623053607846,
"loss": 5.3408,
"mean_token_accuracy": 0.1680586501955986,
"num_tokens": 25209929.0,
"step": 13670
},
{
"entropy": 5.595581865310669,
"epoch": 1.148876286494434,
"grad_norm": 1.3515625,
"learning_rate": 0.0004873163198812877,
"loss": 5.2593,
"mean_token_accuracy": 0.17326510548591614,
"num_tokens": 25218583.0,
"step": 13675
},
{
"entropy": 5.670962858200073,
"epoch": 1.1492963663095987,
"grad_norm": 1.5390625,
"learning_rate": 0.0004873064054654227,
"loss": 5.4711,
"mean_token_accuracy": 0.16238503828644751,
"num_tokens": 25228949.0,
"step": 13680
},
{
"entropy": 5.6353504180908205,
"epoch": 1.1497164461247638,
"grad_norm": 1.3828125,
"learning_rate": 0.00048729648728865904,
"loss": 5.3113,
"mean_token_accuracy": 0.1810284286737442,
"num_tokens": 25238603.0,
"step": 13685
},
{
"entropy": 5.592605924606323,
"epoch": 1.1501365259399285,
"grad_norm": 1.4296875,
"learning_rate": 0.00048728656535117237,
"loss": 5.4358,
"mean_token_accuracy": 0.15900354832410812,
"num_tokens": 25248265.0,
"step": 13690
},
{
"entropy": 5.600039768218994,
"epoch": 1.1505566057550936,
"grad_norm": 1.5,
"learning_rate": 0.0004872766396531386,
"loss": 5.4156,
"mean_token_accuracy": 0.17142994403839112,
"num_tokens": 25258195.0,
"step": 13695
},
{
"entropy": 5.654155158996582,
"epoch": 1.1509766855702583,
"grad_norm": 1.328125,
"learning_rate": 0.00048726671019473335,
"loss": 5.3735,
"mean_token_accuracy": 0.17005601674318313,
"num_tokens": 25267886.0,
"step": 13700
},
{
"entropy": 5.640446376800537,
"epoch": 1.1513967653854231,
"grad_norm": 1.4375,
"learning_rate": 0.00048725677697613267,
"loss": 5.4046,
"mean_token_accuracy": 0.16572085916996002,
"num_tokens": 25277304.0,
"step": 13705
},
{
"entropy": 5.649771642684937,
"epoch": 1.1518168452005881,
"grad_norm": 1.40625,
"learning_rate": 0.0004872468399975125,
"loss": 5.421,
"mean_token_accuracy": 0.15717388838529586,
"num_tokens": 25286771.0,
"step": 13710
},
{
"entropy": 5.703133583068848,
"epoch": 1.152236925015753,
"grad_norm": 1.46875,
"learning_rate": 0.00048723689925904884,
"loss": 5.4695,
"mean_token_accuracy": 0.16371950656175613,
"num_tokens": 25296018.0,
"step": 13715
},
{
"entropy": 5.61578860282898,
"epoch": 1.152657004830918,
"grad_norm": 1.453125,
"learning_rate": 0.0004872269547609179,
"loss": 5.4264,
"mean_token_accuracy": 0.1729790225625038,
"num_tokens": 25305737.0,
"step": 13720
},
{
"entropy": 5.544830274581909,
"epoch": 1.1530770846460827,
"grad_norm": 1.328125,
"learning_rate": 0.0004872170065032956,
"loss": 5.2474,
"mean_token_accuracy": 0.17219720929861068,
"num_tokens": 25314625.0,
"step": 13725
},
{
"entropy": 5.61985330581665,
"epoch": 1.1534971644612477,
"grad_norm": 1.4453125,
"learning_rate": 0.0004872070544863584,
"loss": 5.3848,
"mean_token_accuracy": 0.1666283816099167,
"num_tokens": 25323453.0,
"step": 13730
},
{
"entropy": 5.626963329315186,
"epoch": 1.1539172442764125,
"grad_norm": 1.2578125,
"learning_rate": 0.0004871970987102824,
"loss": 5.4085,
"mean_token_accuracy": 0.16893158257007598,
"num_tokens": 25333236.0,
"step": 13735
},
{
"entropy": 5.609739208221436,
"epoch": 1.1543373240915775,
"grad_norm": 1.5703125,
"learning_rate": 0.0004871871391752442,
"loss": 5.3066,
"mean_token_accuracy": 0.16515172719955445,
"num_tokens": 25341993.0,
"step": 13740
},
{
"entropy": 5.62867693901062,
"epoch": 1.1547574039067423,
"grad_norm": 1.5703125,
"learning_rate": 0.00048717717588141993,
"loss": 5.3267,
"mean_token_accuracy": 0.1705668330192566,
"num_tokens": 25350695.0,
"step": 13745
},
{
"entropy": 5.608707094192505,
"epoch": 1.155177483721907,
"grad_norm": 1.40625,
"learning_rate": 0.0004871672088289863,
"loss": 5.39,
"mean_token_accuracy": 0.17076995223760605,
"num_tokens": 25359044.0,
"step": 13750
},
{
"entropy": 5.566635465621948,
"epoch": 1.155597563537072,
"grad_norm": 1.328125,
"learning_rate": 0.00048715723801811986,
"loss": 5.3938,
"mean_token_accuracy": 0.1646006867289543,
"num_tokens": 25367959.0,
"step": 13755
},
{
"entropy": 5.59621958732605,
"epoch": 1.156017643352237,
"grad_norm": 1.5390625,
"learning_rate": 0.00048714726344899716,
"loss": 5.4061,
"mean_token_accuracy": 0.16776042878627778,
"num_tokens": 25376968.0,
"step": 13760
},
{
"entropy": 5.534719324111938,
"epoch": 1.156437723167402,
"grad_norm": 1.3359375,
"learning_rate": 0.0004871372851217949,
"loss": 5.303,
"mean_token_accuracy": 0.17216388881206512,
"num_tokens": 25385381.0,
"step": 13765
},
{
"entropy": 5.618631601333618,
"epoch": 1.1568578029825667,
"grad_norm": 1.2734375,
"learning_rate": 0.0004871273030366899,
"loss": 5.4117,
"mean_token_accuracy": 0.1639874517917633,
"num_tokens": 25394647.0,
"step": 13770
},
{
"entropy": 5.5666650295257565,
"epoch": 1.1572778827977315,
"grad_norm": 1.34375,
"learning_rate": 0.0004871173171938589,
"loss": 5.3698,
"mean_token_accuracy": 0.17487365901470184,
"num_tokens": 25403973.0,
"step": 13775
},
{
"entropy": 5.593236780166626,
"epoch": 1.1576979626128965,
"grad_norm": 1.6953125,
"learning_rate": 0.0004871073275934789,
"loss": 5.3521,
"mean_token_accuracy": 0.16769609302282334,
"num_tokens": 25412319.0,
"step": 13780
},
{
"entropy": 5.55703592300415,
"epoch": 1.1581180424280613,
"grad_norm": 2.0,
"learning_rate": 0.00048709733423572685,
"loss": 5.3784,
"mean_token_accuracy": 0.16842261999845504,
"num_tokens": 25420558.0,
"step": 13785
},
{
"entropy": 5.518136262893677,
"epoch": 1.1585381222432263,
"grad_norm": 1.3125,
"learning_rate": 0.00048708733712077973,
"loss": 5.3248,
"mean_token_accuracy": 0.16903136074543,
"num_tokens": 25429258.0,
"step": 13790
},
{
"entropy": 5.607215785980225,
"epoch": 1.158958202058391,
"grad_norm": 1.59375,
"learning_rate": 0.0004870773362488146,
"loss": 5.2969,
"mean_token_accuracy": 0.1754231795668602,
"num_tokens": 25438005.0,
"step": 13795
},
{
"entropy": 5.542892122268677,
"epoch": 1.159378281873556,
"grad_norm": 1.4296875,
"learning_rate": 0.0004870673316200087,
"loss": 5.3058,
"mean_token_accuracy": 0.17361946552991867,
"num_tokens": 25447120.0,
"step": 13800
},
{
"entropy": 5.529282331466675,
"epoch": 1.1597983616887209,
"grad_norm": 1.359375,
"learning_rate": 0.0004870573232345392,
"loss": 5.295,
"mean_token_accuracy": 0.17088494449853897,
"num_tokens": 25456216.0,
"step": 13805
},
{
"entropy": 5.784336519241333,
"epoch": 1.1602184415038856,
"grad_norm": 1.296875,
"learning_rate": 0.0004870473110925834,
"loss": 5.5661,
"mean_token_accuracy": 0.16316404044628144,
"num_tokens": 25466456.0,
"step": 13810
},
{
"entropy": 5.544894504547119,
"epoch": 1.1606385213190507,
"grad_norm": 1.3125,
"learning_rate": 0.0004870372951943187,
"loss": 5.2263,
"mean_token_accuracy": 0.17741502523422242,
"num_tokens": 25475217.0,
"step": 13815
},
{
"entropy": 5.584645557403564,
"epoch": 1.1610586011342154,
"grad_norm": 1.453125,
"learning_rate": 0.00048702727553992243,
"loss": 5.5097,
"mean_token_accuracy": 0.15981806218624114,
"num_tokens": 25484617.0,
"step": 13820
},
{
"entropy": 5.6074260711669925,
"epoch": 1.1614786809493804,
"grad_norm": 1.390625,
"learning_rate": 0.00048701725212957223,
"loss": 5.3281,
"mean_token_accuracy": 0.17539868801832198,
"num_tokens": 25493936.0,
"step": 13825
},
{
"entropy": 5.609130477905273,
"epoch": 1.1618987607645452,
"grad_norm": 1.4375,
"learning_rate": 0.0004870072249634455,
"loss": 5.2952,
"mean_token_accuracy": 0.17726642042398452,
"num_tokens": 25502306.0,
"step": 13830
},
{
"entropy": 5.567359161376953,
"epoch": 1.1623188405797102,
"grad_norm": 1.296875,
"learning_rate": 0.00048699719404172006,
"loss": 5.3697,
"mean_token_accuracy": 0.166751691699028,
"num_tokens": 25511247.0,
"step": 13835
},
{
"entropy": 5.600587511062622,
"epoch": 1.162738920394875,
"grad_norm": 1.375,
"learning_rate": 0.00048698715936457344,
"loss": 5.4214,
"mean_token_accuracy": 0.164691025018692,
"num_tokens": 25520482.0,
"step": 13840
},
{
"entropy": 5.6135289669036865,
"epoch": 1.1631590002100398,
"grad_norm": 1.359375,
"learning_rate": 0.00048697712093218336,
"loss": 5.3012,
"mean_token_accuracy": 0.17024406641721726,
"num_tokens": 25529854.0,
"step": 13845
},
{
"entropy": 5.528973960876465,
"epoch": 1.1635790800252048,
"grad_norm": 1.34375,
"learning_rate": 0.0004869670787447279,
"loss": 5.2452,
"mean_token_accuracy": 0.1754808947443962,
"num_tokens": 25538251.0,
"step": 13850
},
{
"entropy": 5.520005512237549,
"epoch": 1.1639991598403696,
"grad_norm": 1.4453125,
"learning_rate": 0.0004869570328023846,
"loss": 5.3322,
"mean_token_accuracy": 0.17265767753124237,
"num_tokens": 25546889.0,
"step": 13855
},
{
"entropy": 5.559149742126465,
"epoch": 1.1644192396555346,
"grad_norm": 1.578125,
"learning_rate": 0.00048694698310533177,
"loss": 5.3699,
"mean_token_accuracy": 0.167795892059803,
"num_tokens": 25557040.0,
"step": 13860
},
{
"entropy": 5.695978212356567,
"epoch": 1.1648393194706994,
"grad_norm": 1.421875,
"learning_rate": 0.0004869369296537472,
"loss": 5.5365,
"mean_token_accuracy": 0.15896812081336975,
"num_tokens": 25565798.0,
"step": 13865
},
{
"entropy": 5.715288591384888,
"epoch": 1.1652593992858642,
"grad_norm": 1.4296875,
"learning_rate": 0.0004869268724478091,
"loss": 5.386,
"mean_token_accuracy": 0.1702848941087723,
"num_tokens": 25575039.0,
"step": 13870
},
{
"entropy": 5.666198587417602,
"epoch": 1.1656794791010292,
"grad_norm": 1.3046875,
"learning_rate": 0.00048691681148769545,
"loss": 5.382,
"mean_token_accuracy": 0.16508956253528595,
"num_tokens": 25584635.0,
"step": 13875
},
{
"entropy": 5.506886911392212,
"epoch": 1.166099558916194,
"grad_norm": 1.4453125,
"learning_rate": 0.0004869067467735847,
"loss": 5.2984,
"mean_token_accuracy": 0.17627811133861543,
"num_tokens": 25593736.0,
"step": 13880
},
{
"entropy": 5.5315882682800295,
"epoch": 1.166519638731359,
"grad_norm": 1.640625,
"learning_rate": 0.0004868966783056551,
"loss": 5.2671,
"mean_token_accuracy": 0.18442590683698654,
"num_tokens": 25602685.0,
"step": 13885
},
{
"entropy": 5.558185482025147,
"epoch": 1.1669397185465238,
"grad_norm": 1.40625,
"learning_rate": 0.00048688660608408484,
"loss": 5.3657,
"mean_token_accuracy": 0.1673026517033577,
"num_tokens": 25610690.0,
"step": 13890
},
{
"entropy": 5.519009828567505,
"epoch": 1.1673597983616888,
"grad_norm": 1.3671875,
"learning_rate": 0.00048687653010905254,
"loss": 5.2548,
"mean_token_accuracy": 0.1778439462184906,
"num_tokens": 25619805.0,
"step": 13895
},
{
"entropy": 5.68157844543457,
"epoch": 1.1677798781768536,
"grad_norm": 1.3671875,
"learning_rate": 0.00048686645038073664,
"loss": 5.457,
"mean_token_accuracy": 0.16299156993627548,
"num_tokens": 25629447.0,
"step": 13900
},
{
"entropy": 5.638311815261841,
"epoch": 1.1681999579920186,
"grad_norm": 1.359375,
"learning_rate": 0.00048685636689931554,
"loss": 5.3085,
"mean_token_accuracy": 0.1740482419729233,
"num_tokens": 25638619.0,
"step": 13905
},
{
"entropy": 5.6293586730957035,
"epoch": 1.1686200378071834,
"grad_norm": 1.3359375,
"learning_rate": 0.00048684627966496803,
"loss": 5.3962,
"mean_token_accuracy": 0.17413543313741683,
"num_tokens": 25648255.0,
"step": 13910
},
{
"entropy": 5.560601282119751,
"epoch": 1.1690401176223482,
"grad_norm": 1.421875,
"learning_rate": 0.00048683618867787284,
"loss": 5.4378,
"mean_token_accuracy": 0.1661060631275177,
"num_tokens": 25657881.0,
"step": 13915
},
{
"entropy": 5.648235702514649,
"epoch": 1.1694601974375132,
"grad_norm": 1.5390625,
"learning_rate": 0.0004868260939382086,
"loss": 5.452,
"mean_token_accuracy": 0.1665305510163307,
"num_tokens": 25666773.0,
"step": 13920
},
{
"entropy": 5.632356214523315,
"epoch": 1.169880277252678,
"grad_norm": 1.3515625,
"learning_rate": 0.0004868159954461542,
"loss": 5.3471,
"mean_token_accuracy": 0.1685002401471138,
"num_tokens": 25675152.0,
"step": 13925
},
{
"entropy": 5.725224542617798,
"epoch": 1.170300357067843,
"grad_norm": 1.5703125,
"learning_rate": 0.00048680589320188847,
"loss": 5.4441,
"mean_token_accuracy": 0.16157277077436447,
"num_tokens": 25684962.0,
"step": 13930
},
{
"entropy": 5.576214838027954,
"epoch": 1.1707204368830078,
"grad_norm": 1.7265625,
"learning_rate": 0.0004867957872055904,
"loss": 5.3605,
"mean_token_accuracy": 0.16613447070121765,
"num_tokens": 25693782.0,
"step": 13935
},
{
"entropy": 5.543605089187622,
"epoch": 1.1711405166981725,
"grad_norm": 1.46875,
"learning_rate": 0.00048678567745743905,
"loss": 5.325,
"mean_token_accuracy": 0.1713411509990692,
"num_tokens": 25703081.0,
"step": 13940
},
{
"entropy": 5.59487566947937,
"epoch": 1.1715605965133375,
"grad_norm": 1.328125,
"learning_rate": 0.0004867755639576135,
"loss": 5.3436,
"mean_token_accuracy": 0.1717812567949295,
"num_tokens": 25711628.0,
"step": 13945
},
{
"entropy": 5.524281072616577,
"epoch": 1.1719806763285023,
"grad_norm": 1.375,
"learning_rate": 0.0004867654467062928,
"loss": 5.3767,
"mean_token_accuracy": 0.17248852252960206,
"num_tokens": 25720676.0,
"step": 13950
},
{
"entropy": 5.5976708889007565,
"epoch": 1.1724007561436673,
"grad_norm": 1.3515625,
"learning_rate": 0.00048675532570365633,
"loss": 5.3365,
"mean_token_accuracy": 0.17378393113613128,
"num_tokens": 25729920.0,
"step": 13955
},
{
"entropy": 5.5958537578582765,
"epoch": 1.1728208359588321,
"grad_norm": 1.359375,
"learning_rate": 0.00048674520094988327,
"loss": 5.3109,
"mean_token_accuracy": 0.17501317262649535,
"num_tokens": 25739745.0,
"step": 13960
},
{
"entropy": 5.550080633163452,
"epoch": 1.1732409157739971,
"grad_norm": 1.4140625,
"learning_rate": 0.00048673507244515303,
"loss": 5.3275,
"mean_token_accuracy": 0.17092910557985305,
"num_tokens": 25748636.0,
"step": 13965
},
{
"entropy": 5.649429512023926,
"epoch": 1.173660995589162,
"grad_norm": 1.4140625,
"learning_rate": 0.000486724940189645,
"loss": 5.4664,
"mean_token_accuracy": 0.16761331260204315,
"num_tokens": 25758393.0,
"step": 13970
},
{
"entropy": 5.626720714569092,
"epoch": 1.174081075404327,
"grad_norm": 1.359375,
"learning_rate": 0.0004867148041835386,
"loss": 5.439,
"mean_token_accuracy": 0.1595470979809761,
"num_tokens": 25768520.0,
"step": 13975
},
{
"entropy": 5.5244852066040036,
"epoch": 1.1745011552194917,
"grad_norm": 1.4375,
"learning_rate": 0.0004867046644270136,
"loss": 5.2576,
"mean_token_accuracy": 0.1772868111729622,
"num_tokens": 25777168.0,
"step": 13980
},
{
"entropy": 5.71324348449707,
"epoch": 1.1749212350346565,
"grad_norm": 1.328125,
"learning_rate": 0.0004866945209202494,
"loss": 5.5505,
"mean_token_accuracy": 0.15432295054197312,
"num_tokens": 25787042.0,
"step": 13985
},
{
"entropy": 5.648117446899414,
"epoch": 1.1753413148498215,
"grad_norm": 1.453125,
"learning_rate": 0.0004866843736634258,
"loss": 5.4295,
"mean_token_accuracy": 0.1675956055521965,
"num_tokens": 25796784.0,
"step": 13990
},
{
"entropy": 5.692070960998535,
"epoch": 1.1757613946649863,
"grad_norm": 1.515625,
"learning_rate": 0.0004866742226567225,
"loss": 5.4677,
"mean_token_accuracy": 0.15732361227273942,
"num_tokens": 25806285.0,
"step": 13995
},
{
"entropy": 5.618652057647705,
"epoch": 1.1761814744801513,
"grad_norm": 1.4140625,
"learning_rate": 0.00048666406790031936,
"loss": 5.3144,
"mean_token_accuracy": 0.1640985354781151,
"num_tokens": 25814889.0,
"step": 14000
},
{
"entropy": 5.572588062286377,
"epoch": 1.176601554295316,
"grad_norm": 1.4140625,
"learning_rate": 0.0004866539093943962,
"loss": 5.3716,
"mean_token_accuracy": 0.1675340011715889,
"num_tokens": 25824551.0,
"step": 14005
},
{
"entropy": 5.628482294082642,
"epoch": 1.1770216341104809,
"grad_norm": 1.4921875,
"learning_rate": 0.00048664374713913304,
"loss": 5.4206,
"mean_token_accuracy": 0.16298734694719313,
"num_tokens": 25834482.0,
"step": 14010
},
{
"entropy": 5.671939754486084,
"epoch": 1.177441713925646,
"grad_norm": 1.4921875,
"learning_rate": 0.0004866335811347099,
"loss": 5.4331,
"mean_token_accuracy": 0.16675578504800798,
"num_tokens": 25843274.0,
"step": 14015
},
{
"entropy": 5.684998893737793,
"epoch": 1.1778617937408107,
"grad_norm": 1.8828125,
"learning_rate": 0.00048662341138130683,
"loss": 5.4321,
"mean_token_accuracy": 0.1594082310795784,
"num_tokens": 25852482.0,
"step": 14020
},
{
"entropy": 5.665592050552368,
"epoch": 1.1782818735559757,
"grad_norm": 1.3125,
"learning_rate": 0.00048661323787910405,
"loss": 5.4246,
"mean_token_accuracy": 0.16257014721632004,
"num_tokens": 25862657.0,
"step": 14025
},
{
"entropy": 5.586136817932129,
"epoch": 1.1787019533711405,
"grad_norm": 1.390625,
"learning_rate": 0.0004866030606282817,
"loss": 5.3726,
"mean_token_accuracy": 0.16970161497592925,
"num_tokens": 25871492.0,
"step": 14030
},
{
"entropy": 5.62667407989502,
"epoch": 1.1791220331863055,
"grad_norm": 1.5,
"learning_rate": 0.00048659287962902006,
"loss": 5.3666,
"mean_token_accuracy": 0.17091266363859176,
"num_tokens": 25880979.0,
"step": 14035
},
{
"entropy": 5.580191278457642,
"epoch": 1.1795421130014703,
"grad_norm": 1.46875,
"learning_rate": 0.00048658269488149945,
"loss": 5.3535,
"mean_token_accuracy": 0.16362757086753846,
"num_tokens": 25891060.0,
"step": 14040
},
{
"entropy": 5.705811500549316,
"epoch": 1.1799621928166353,
"grad_norm": 1.4375,
"learning_rate": 0.0004865725063859005,
"loss": 5.488,
"mean_token_accuracy": 0.1669595867395401,
"num_tokens": 25900421.0,
"step": 14045
},
{
"entropy": 5.618188953399658,
"epoch": 1.1803822726318,
"grad_norm": 1.3984375,
"learning_rate": 0.00048656231414240345,
"loss": 5.3797,
"mean_token_accuracy": 0.16654564589262008,
"num_tokens": 25909614.0,
"step": 14050
},
{
"entropy": 5.541171455383301,
"epoch": 1.1808023524469649,
"grad_norm": 1.3203125,
"learning_rate": 0.000486552118151189,
"loss": 5.4244,
"mean_token_accuracy": 0.16173808872699738,
"num_tokens": 25919324.0,
"step": 14055
},
{
"entropy": 5.612274122238159,
"epoch": 1.1812224322621299,
"grad_norm": 1.234375,
"learning_rate": 0.00048654191841243763,
"loss": 5.4059,
"mean_token_accuracy": 0.1739427775144577,
"num_tokens": 25928818.0,
"step": 14060
},
{
"entropy": 5.616962432861328,
"epoch": 1.1816425120772946,
"grad_norm": 1.5,
"learning_rate": 0.0004865317149263301,
"loss": 5.4399,
"mean_token_accuracy": 0.16519006937742234,
"num_tokens": 25938148.0,
"step": 14065
},
{
"entropy": 5.56295223236084,
"epoch": 1.1820625918924597,
"grad_norm": 1.546875,
"learning_rate": 0.0004865215076930473,
"loss": 5.3754,
"mean_token_accuracy": 0.16697040051221848,
"num_tokens": 25947210.0,
"step": 14070
},
{
"entropy": 5.593865633010864,
"epoch": 1.1824826717076244,
"grad_norm": 1.328125,
"learning_rate": 0.0004865112967127697,
"loss": 5.3583,
"mean_token_accuracy": 0.170071779191494,
"num_tokens": 25955949.0,
"step": 14075
},
{
"entropy": 5.5404736518859865,
"epoch": 1.1829027515227892,
"grad_norm": 1.5625,
"learning_rate": 0.0004865010819856786,
"loss": 5.2969,
"mean_token_accuracy": 0.16847216039896012,
"num_tokens": 25964193.0,
"step": 14080
},
{
"entropy": 5.551244688034058,
"epoch": 1.1833228313379542,
"grad_norm": 1.4765625,
"learning_rate": 0.0004864908635119546,
"loss": 5.3896,
"mean_token_accuracy": 0.16945898830890654,
"num_tokens": 25973141.0,
"step": 14085
},
{
"entropy": 5.681430816650391,
"epoch": 1.183742911153119,
"grad_norm": 1.390625,
"learning_rate": 0.0004864806412917788,
"loss": 5.4456,
"mean_token_accuracy": 0.17200513929128647,
"num_tokens": 25982650.0,
"step": 14090
},
{
"entropy": 5.736534643173218,
"epoch": 1.184162990968284,
"grad_norm": 1.359375,
"learning_rate": 0.0004864704153253325,
"loss": 5.4406,
"mean_token_accuracy": 0.1592070296406746,
"num_tokens": 25992096.0,
"step": 14095
},
{
"entropy": 5.6888525009155275,
"epoch": 1.1845830707834488,
"grad_norm": 1.640625,
"learning_rate": 0.00048646018561279665,
"loss": 5.4168,
"mean_token_accuracy": 0.16635299921035768,
"num_tokens": 26002063.0,
"step": 14100
},
{
"entropy": 5.502923154830933,
"epoch": 1.1850031505986138,
"grad_norm": 1.3125,
"learning_rate": 0.00048644995215435245,
"loss": 5.2612,
"mean_token_accuracy": 0.1743235930800438,
"num_tokens": 26010716.0,
"step": 14105
},
{
"entropy": 5.574075698852539,
"epoch": 1.1854232304137786,
"grad_norm": 1.359375,
"learning_rate": 0.0004864397149501812,
"loss": 5.3586,
"mean_token_accuracy": 0.167898553609848,
"num_tokens": 26019136.0,
"step": 14110
},
{
"entropy": 5.629922819137573,
"epoch": 1.1858433102289434,
"grad_norm": 1.7421875,
"learning_rate": 0.00048642947400046434,
"loss": 5.3827,
"mean_token_accuracy": 0.17956559509038925,
"num_tokens": 26028029.0,
"step": 14115
},
{
"entropy": 5.686328029632568,
"epoch": 1.1862633900441084,
"grad_norm": 1.5390625,
"learning_rate": 0.00048641922930538325,
"loss": 5.536,
"mean_token_accuracy": 0.16046979129314423,
"num_tokens": 26038025.0,
"step": 14120
},
{
"entropy": 5.6663109302520756,
"epoch": 1.1866834698592732,
"grad_norm": 1.3046875,
"learning_rate": 0.0004864089808651193,
"loss": 5.493,
"mean_token_accuracy": 0.1532455489039421,
"num_tokens": 26048427.0,
"step": 14125
},
{
"entropy": 5.605689144134521,
"epoch": 1.1871035496744382,
"grad_norm": 1.796875,
"learning_rate": 0.0004863987286798541,
"loss": 5.3066,
"mean_token_accuracy": 0.16599408239126207,
"num_tokens": 26057682.0,
"step": 14130
},
{
"entropy": 5.567851543426514,
"epoch": 1.187523629489603,
"grad_norm": 1.5,
"learning_rate": 0.0004863884727497693,
"loss": 5.3759,
"mean_token_accuracy": 0.1674002528190613,
"num_tokens": 26066562.0,
"step": 14135
},
{
"entropy": 5.563521385192871,
"epoch": 1.187943709304768,
"grad_norm": 1.4453125,
"learning_rate": 0.0004863782130750466,
"loss": 5.2897,
"mean_token_accuracy": 0.17027911990880967,
"num_tokens": 26075633.0,
"step": 14140
},
{
"entropy": 5.6153826236724855,
"epoch": 1.1883637891199328,
"grad_norm": 1.40625,
"learning_rate": 0.00048636794965586764,
"loss": 5.4471,
"mean_token_accuracy": 0.16176531463861465,
"num_tokens": 26085160.0,
"step": 14145
},
{
"entropy": 5.546812963485718,
"epoch": 1.1887838689350976,
"grad_norm": 1.3203125,
"learning_rate": 0.00048635768249241434,
"loss": 5.3516,
"mean_token_accuracy": 0.17043941468000412,
"num_tokens": 26094157.0,
"step": 14150
},
{
"entropy": 5.674842119216919,
"epoch": 1.1892039487502626,
"grad_norm": 1.3828125,
"learning_rate": 0.0004863474115848685,
"loss": 5.4592,
"mean_token_accuracy": 0.16983729153871535,
"num_tokens": 26104459.0,
"step": 14155
},
{
"entropy": 5.590853452682495,
"epoch": 1.1896240285654274,
"grad_norm": 1.3125,
"learning_rate": 0.00048633713693341214,
"loss": 5.3959,
"mean_token_accuracy": 0.1641297832131386,
"num_tokens": 26114468.0,
"step": 14160
},
{
"entropy": 5.568353366851807,
"epoch": 1.1900441083805924,
"grad_norm": 1.328125,
"learning_rate": 0.00048632685853822714,
"loss": 5.3698,
"mean_token_accuracy": 0.16373370140790938,
"num_tokens": 26123408.0,
"step": 14165
},
{
"entropy": 5.572306394577026,
"epoch": 1.1904641881957572,
"grad_norm": 1.3515625,
"learning_rate": 0.0004863165763994957,
"loss": 5.3678,
"mean_token_accuracy": 0.1610241025686264,
"num_tokens": 26132692.0,
"step": 14170
},
{
"entropy": 5.696545028686524,
"epoch": 1.190884268010922,
"grad_norm": 1.453125,
"learning_rate": 0.0004863062905173999,
"loss": 5.5264,
"mean_token_accuracy": 0.16376537829637527,
"num_tokens": 26142259.0,
"step": 14175
},
{
"entropy": 5.65267333984375,
"epoch": 1.191304347826087,
"grad_norm": 1.4453125,
"learning_rate": 0.000486296000892122,
"loss": 5.3862,
"mean_token_accuracy": 0.16619327515363694,
"num_tokens": 26151782.0,
"step": 14180
},
{
"entropy": 5.531172466278076,
"epoch": 1.1917244276412517,
"grad_norm": 1.53125,
"learning_rate": 0.00048628570752384424,
"loss": 5.2208,
"mean_token_accuracy": 0.17376811504364015,
"num_tokens": 26160449.0,
"step": 14185
},
{
"entropy": 5.609526109695435,
"epoch": 1.1921445074564168,
"grad_norm": 1.375,
"learning_rate": 0.00048627541041274897,
"loss": 5.4643,
"mean_token_accuracy": 0.16051455587148666,
"num_tokens": 26169764.0,
"step": 14190
},
{
"entropy": 5.644153261184693,
"epoch": 1.1925645872715815,
"grad_norm": 1.5078125,
"learning_rate": 0.00048626510955901854,
"loss": 5.3284,
"mean_token_accuracy": 0.16696933507919312,
"num_tokens": 26178759.0,
"step": 14195
},
{
"entropy": 5.687915563583374,
"epoch": 1.1929846670867466,
"grad_norm": 1.3359375,
"learning_rate": 0.0004862548049628356,
"loss": 5.4591,
"mean_token_accuracy": 0.16625112295150757,
"num_tokens": 26187904.0,
"step": 14200
},
{
"entropy": 5.636867952346802,
"epoch": 1.1934047469019113,
"grad_norm": 1.359375,
"learning_rate": 0.0004862444966243824,
"loss": 5.3852,
"mean_token_accuracy": 0.17131132632493973,
"num_tokens": 26196563.0,
"step": 14205
},
{
"entropy": 5.701703214645386,
"epoch": 1.1938248267170763,
"grad_norm": 2.328125,
"learning_rate": 0.0004862341845438419,
"loss": 5.3917,
"mean_token_accuracy": 0.1661001428961754,
"num_tokens": 26206573.0,
"step": 14210
},
{
"entropy": 5.6225780010223385,
"epoch": 1.1942449065322411,
"grad_norm": 1.90625,
"learning_rate": 0.00048622386872139645,
"loss": 5.3207,
"mean_token_accuracy": 0.16739476919174195,
"num_tokens": 26215308.0,
"step": 14215
},
{
"entropy": 5.550726795196534,
"epoch": 1.194664986347406,
"grad_norm": 1.3828125,
"learning_rate": 0.000486213549157229,
"loss": 5.3796,
"mean_token_accuracy": 0.16657185107469558,
"num_tokens": 26224379.0,
"step": 14220
},
{
"entropy": 5.580405950546265,
"epoch": 1.195085066162571,
"grad_norm": 1.421875,
"learning_rate": 0.0004862032258515222,
"loss": 5.3572,
"mean_token_accuracy": 0.17274150252342224,
"num_tokens": 26233620.0,
"step": 14225
},
{
"entropy": 5.579659271240234,
"epoch": 1.1955051459777357,
"grad_norm": 1.3671875,
"learning_rate": 0.0004861928988044592,
"loss": 5.4351,
"mean_token_accuracy": 0.16125431805849075,
"num_tokens": 26242556.0,
"step": 14230
},
{
"entropy": 5.631987619400024,
"epoch": 1.1959252257929007,
"grad_norm": 1.4453125,
"learning_rate": 0.0004861825680162226,
"loss": 5.4119,
"mean_token_accuracy": 0.16598995327949523,
"num_tokens": 26251561.0,
"step": 14235
},
{
"entropy": 5.5751265525817875,
"epoch": 1.1963453056080655,
"grad_norm": 1.4296875,
"learning_rate": 0.00048617223348699546,
"loss": 5.3389,
"mean_token_accuracy": 0.17048846334218978,
"num_tokens": 26261115.0,
"step": 14240
},
{
"entropy": 5.708127737045288,
"epoch": 1.1967653854232303,
"grad_norm": 1.5859375,
"learning_rate": 0.0004861618952169611,
"loss": 5.4989,
"mean_token_accuracy": 0.17341294065117835,
"num_tokens": 26271165.0,
"step": 14245
},
{
"entropy": 5.619239473342896,
"epoch": 1.1971854652383953,
"grad_norm": 1.34375,
"learning_rate": 0.0004861515532063025,
"loss": 5.4541,
"mean_token_accuracy": 0.1659351631999016,
"num_tokens": 26280822.0,
"step": 14250
},
{
"entropy": 5.605314683914185,
"epoch": 1.19760554505356,
"grad_norm": 1.421875,
"learning_rate": 0.00048614120745520275,
"loss": 5.3373,
"mean_token_accuracy": 0.17087636142969131,
"num_tokens": 26288747.0,
"step": 14255
},
{
"entropy": 5.633758163452148,
"epoch": 1.198025624868725,
"grad_norm": 1.5078125,
"learning_rate": 0.00048613085796384524,
"loss": 5.4121,
"mean_token_accuracy": 0.16057505309581757,
"num_tokens": 26298387.0,
"step": 14260
},
{
"entropy": 5.604875898361206,
"epoch": 1.19844570468389,
"grad_norm": 1.46875,
"learning_rate": 0.00048612050473241335,
"loss": 5.3026,
"mean_token_accuracy": 0.17800578474998474,
"num_tokens": 26307016.0,
"step": 14265
},
{
"entropy": 5.608559846878052,
"epoch": 1.198865784499055,
"grad_norm": 1.2890625,
"learning_rate": 0.0004861101477610905,
"loss": 5.4087,
"mean_token_accuracy": 0.16609525978565215,
"num_tokens": 26316296.0,
"step": 14270
},
{
"entropy": 5.620381259918213,
"epoch": 1.1992858643142197,
"grad_norm": 1.3671875,
"learning_rate": 0.00048609978705006,
"loss": 5.4088,
"mean_token_accuracy": 0.16030900180339813,
"num_tokens": 26325525.0,
"step": 14275
},
{
"entropy": 5.588323783874512,
"epoch": 1.1997059441293847,
"grad_norm": 1.4453125,
"learning_rate": 0.0004860894225995055,
"loss": 5.2936,
"mean_token_accuracy": 0.17848464250564575,
"num_tokens": 26334195.0,
"step": 14280
},
{
"entropy": 5.591770601272583,
"epoch": 1.2001260239445495,
"grad_norm": 1.4765625,
"learning_rate": 0.00048607905440961054,
"loss": 5.4193,
"mean_token_accuracy": 0.16965034008026122,
"num_tokens": 26343933.0,
"step": 14285
},
{
"entropy": 5.585553359985352,
"epoch": 1.2005461037597143,
"grad_norm": 1.5859375,
"learning_rate": 0.00048606868248055887,
"loss": 5.3655,
"mean_token_accuracy": 0.17240449637174607,
"num_tokens": 26353455.0,
"step": 14290
},
{
"entropy": 5.625445175170898,
"epoch": 1.2009661835748793,
"grad_norm": 1.5625,
"learning_rate": 0.0004860583068125341,
"loss": 5.3595,
"mean_token_accuracy": 0.1763713002204895,
"num_tokens": 26362662.0,
"step": 14295
},
{
"entropy": 5.617782211303711,
"epoch": 1.201386263390044,
"grad_norm": 1.7109375,
"learning_rate": 0.0004860479274057202,
"loss": 5.3823,
"mean_token_accuracy": 0.16244335919618608,
"num_tokens": 26371536.0,
"step": 14300
},
{
"entropy": 5.688301944732666,
"epoch": 1.201806343205209,
"grad_norm": 1.625,
"learning_rate": 0.00048603754426030087,
"loss": 5.4778,
"mean_token_accuracy": 0.16066131889820098,
"num_tokens": 26381925.0,
"step": 14305
},
{
"entropy": 5.561420583724976,
"epoch": 1.2022264230203739,
"grad_norm": 1.5078125,
"learning_rate": 0.00048602715737646016,
"loss": 5.3225,
"mean_token_accuracy": 0.1739985778927803,
"num_tokens": 26391111.0,
"step": 14310
},
{
"entropy": 5.65993103981018,
"epoch": 1.2026465028355386,
"grad_norm": 1.375,
"learning_rate": 0.00048601676675438197,
"loss": 5.468,
"mean_token_accuracy": 0.15840389132499694,
"num_tokens": 26401667.0,
"step": 14315
},
{
"entropy": 5.5718177318572994,
"epoch": 1.2030665826507037,
"grad_norm": 1.546875,
"learning_rate": 0.00048600637239425045,
"loss": 5.2976,
"mean_token_accuracy": 0.1749270662665367,
"num_tokens": 26411261.0,
"step": 14320
},
{
"entropy": 5.594204330444336,
"epoch": 1.2034866624658684,
"grad_norm": 1.4140625,
"learning_rate": 0.00048599597429624966,
"loss": 5.4523,
"mean_token_accuracy": 0.1678444340825081,
"num_tokens": 26419808.0,
"step": 14325
},
{
"entropy": 5.619553756713867,
"epoch": 1.2039067422810334,
"grad_norm": 1.4453125,
"learning_rate": 0.00048598557246056385,
"loss": 5.3716,
"mean_token_accuracy": 0.17062055170536042,
"num_tokens": 26429160.0,
"step": 14330
},
{
"entropy": 5.646615076065063,
"epoch": 1.2043268220961982,
"grad_norm": 1.3046875,
"learning_rate": 0.00048597516688737727,
"loss": 5.323,
"mean_token_accuracy": 0.16848173439502717,
"num_tokens": 26437675.0,
"step": 14335
},
{
"entropy": 5.617009687423706,
"epoch": 1.2047469019113632,
"grad_norm": 1.4375,
"learning_rate": 0.00048596475757687425,
"loss": 5.3784,
"mean_token_accuracy": 0.16564128547906876,
"num_tokens": 26446317.0,
"step": 14340
},
{
"entropy": 5.646167993545532,
"epoch": 1.205166981726528,
"grad_norm": 1.5234375,
"learning_rate": 0.00048595434452923915,
"loss": 5.4281,
"mean_token_accuracy": 0.1695418119430542,
"num_tokens": 26456183.0,
"step": 14345
},
{
"entropy": 5.607500267028809,
"epoch": 1.205587061541693,
"grad_norm": 1.375,
"learning_rate": 0.00048594392774465656,
"loss": 5.3788,
"mean_token_accuracy": 0.16330490559339522,
"num_tokens": 26466324.0,
"step": 14350
},
{
"entropy": 5.617480373382568,
"epoch": 1.2060071413568578,
"grad_norm": 1.6328125,
"learning_rate": 0.00048593350722331074,
"loss": 5.3833,
"mean_token_accuracy": 0.1691987097263336,
"num_tokens": 26475560.0,
"step": 14355
},
{
"entropy": 5.594552421569825,
"epoch": 1.2064272211720226,
"grad_norm": 1.5625,
"learning_rate": 0.00048592308296538654,
"loss": 5.3589,
"mean_token_accuracy": 0.1720819890499115,
"num_tokens": 26484955.0,
"step": 14360
},
{
"entropy": 5.626417207717895,
"epoch": 1.2068473009871876,
"grad_norm": 1.28125,
"learning_rate": 0.0004859126549710686,
"loss": 5.3285,
"mean_token_accuracy": 0.1755720019340515,
"num_tokens": 26494306.0,
"step": 14365
},
{
"entropy": 5.5393060684204105,
"epoch": 1.2072673808023524,
"grad_norm": 1.421875,
"learning_rate": 0.00048590222324054153,
"loss": 5.3148,
"mean_token_accuracy": 0.1715349316596985,
"num_tokens": 26503871.0,
"step": 14370
},
{
"entropy": 5.69525637626648,
"epoch": 1.2076874606175174,
"grad_norm": 1.5,
"learning_rate": 0.0004858917877739901,
"loss": 5.4348,
"mean_token_accuracy": 0.16756290048360825,
"num_tokens": 26511929.0,
"step": 14375
},
{
"entropy": 5.635641384124756,
"epoch": 1.2081075404326822,
"grad_norm": 1.6484375,
"learning_rate": 0.0004858813485715994,
"loss": 5.4164,
"mean_token_accuracy": 0.1570291668176651,
"num_tokens": 26520469.0,
"step": 14380
},
{
"entropy": 5.578332471847534,
"epoch": 1.208527620247847,
"grad_norm": 1.40625,
"learning_rate": 0.0004858709056335541,
"loss": 5.3824,
"mean_token_accuracy": 0.17001325488090516,
"num_tokens": 26530102.0,
"step": 14385
},
{
"entropy": 5.59905424118042,
"epoch": 1.208947700063012,
"grad_norm": 1.453125,
"learning_rate": 0.00048586045896003926,
"loss": 5.4087,
"mean_token_accuracy": 0.16630999445915223,
"num_tokens": 26538705.0,
"step": 14390
},
{
"entropy": 5.69228982925415,
"epoch": 1.2093677798781768,
"grad_norm": 1.484375,
"learning_rate": 0.0004858500085512401,
"loss": 5.4881,
"mean_token_accuracy": 0.1661173954606056,
"num_tokens": 26548315.0,
"step": 14395
},
{
"entropy": 5.649106121063232,
"epoch": 1.2097878596933418,
"grad_norm": 1.3984375,
"learning_rate": 0.00048583955440734144,
"loss": 5.3282,
"mean_token_accuracy": 0.17240462452173233,
"num_tokens": 26556412.0,
"step": 14400
},
{
"entropy": 5.625435066223145,
"epoch": 1.2102079395085066,
"grad_norm": 1.265625,
"learning_rate": 0.00048582909652852873,
"loss": 5.4968,
"mean_token_accuracy": 0.16067309379577638,
"num_tokens": 26566146.0,
"step": 14405
},
{
"entropy": 5.625951051712036,
"epoch": 1.2106280193236716,
"grad_norm": 1.515625,
"learning_rate": 0.0004858186349149871,
"loss": 5.3675,
"mean_token_accuracy": 0.17302857339382172,
"num_tokens": 26576019.0,
"step": 14410
},
{
"entropy": 5.502639245986939,
"epoch": 1.2110480991388364,
"grad_norm": 1.765625,
"learning_rate": 0.000485808169566902,
"loss": 5.2506,
"mean_token_accuracy": 0.1733314648270607,
"num_tokens": 26585461.0,
"step": 14415
},
{
"entropy": 5.525038766860962,
"epoch": 1.2114681789540014,
"grad_norm": 1.375,
"learning_rate": 0.00048579770048445863,
"loss": 5.2694,
"mean_token_accuracy": 0.18916076570749282,
"num_tokens": 26594021.0,
"step": 14420
},
{
"entropy": 5.684835815429688,
"epoch": 1.2118882587691662,
"grad_norm": 1.3828125,
"learning_rate": 0.00048578722766784253,
"loss": 5.4226,
"mean_token_accuracy": 0.17051784992218016,
"num_tokens": 26602712.0,
"step": 14425
},
{
"entropy": 5.488742208480835,
"epoch": 1.212308338584331,
"grad_norm": 1.578125,
"learning_rate": 0.00048577675111723925,
"loss": 5.126,
"mean_token_accuracy": 0.18584653735160828,
"num_tokens": 26610970.0,
"step": 14430
},
{
"entropy": 5.540399980545044,
"epoch": 1.212728418399496,
"grad_norm": 1.3671875,
"learning_rate": 0.00048576627083283435,
"loss": 5.4163,
"mean_token_accuracy": 0.1722578689455986,
"num_tokens": 26619840.0,
"step": 14435
},
{
"entropy": 5.57360143661499,
"epoch": 1.2131484982146608,
"grad_norm": 1.5,
"learning_rate": 0.0004857557868148136,
"loss": 5.2954,
"mean_token_accuracy": 0.1812925085425377,
"num_tokens": 26629271.0,
"step": 14440
},
{
"entropy": 5.578191137313842,
"epoch": 1.2135685780298258,
"grad_norm": 1.4140625,
"learning_rate": 0.0004857452990633625,
"loss": 5.3284,
"mean_token_accuracy": 0.16607633531093596,
"num_tokens": 26638610.0,
"step": 14445
},
{
"entropy": 5.695629978179932,
"epoch": 1.2139886578449905,
"grad_norm": 1.4296875,
"learning_rate": 0.00048573480757866695,
"loss": 5.4857,
"mean_token_accuracy": 0.1657136708498001,
"num_tokens": 26648504.0,
"step": 14450
},
{
"entropy": 5.607351779937744,
"epoch": 1.2144087376601553,
"grad_norm": 1.3671875,
"learning_rate": 0.00048572431236091284,
"loss": 5.3873,
"mean_token_accuracy": 0.16882047355175017,
"num_tokens": 26658084.0,
"step": 14455
},
{
"entropy": 5.644829940795899,
"epoch": 1.2148288174753203,
"grad_norm": 1.7265625,
"learning_rate": 0.00048571381341028604,
"loss": 5.4706,
"mean_token_accuracy": 0.1678345263004303,
"num_tokens": 26666933.0,
"step": 14460
},
{
"entropy": 5.644724893569946,
"epoch": 1.2152488972904851,
"grad_norm": 1.4609375,
"learning_rate": 0.0004857033107269725,
"loss": 5.3232,
"mean_token_accuracy": 0.1732431948184967,
"num_tokens": 26675049.0,
"step": 14465
},
{
"entropy": 5.52176342010498,
"epoch": 1.2156689771056501,
"grad_norm": 1.3671875,
"learning_rate": 0.00048569280431115823,
"loss": 5.4043,
"mean_token_accuracy": 0.16803978532552719,
"num_tokens": 26684223.0,
"step": 14470
},
{
"entropy": 5.621050262451172,
"epoch": 1.216089056920815,
"grad_norm": 1.328125,
"learning_rate": 0.0004856822941630296,
"loss": 5.3435,
"mean_token_accuracy": 0.16737112104892732,
"num_tokens": 26693605.0,
"step": 14475
},
{
"entropy": 5.710792303085327,
"epoch": 1.2165091367359797,
"grad_norm": 1.375,
"learning_rate": 0.00048567178028277255,
"loss": 5.4134,
"mean_token_accuracy": 0.1695218563079834,
"num_tokens": 26702829.0,
"step": 14480
},
{
"entropy": 5.642229652404785,
"epoch": 1.2169292165511447,
"grad_norm": 1.34375,
"learning_rate": 0.0004856612626705733,
"loss": 5.4595,
"mean_token_accuracy": 0.16188946068286897,
"num_tokens": 26712466.0,
"step": 14485
},
{
"entropy": 5.596743965148926,
"epoch": 1.2173492963663095,
"grad_norm": 1.5,
"learning_rate": 0.0004856507413266183,
"loss": 5.3383,
"mean_token_accuracy": 0.1772727981209755,
"num_tokens": 26721730.0,
"step": 14490
},
{
"entropy": 5.599400043487549,
"epoch": 1.2177693761814745,
"grad_norm": 1.390625,
"learning_rate": 0.000485640216251094,
"loss": 5.4108,
"mean_token_accuracy": 0.16504266113042831,
"num_tokens": 26731017.0,
"step": 14495
},
{
"entropy": 5.61780891418457,
"epoch": 1.2181894559966393,
"grad_norm": 1.40625,
"learning_rate": 0.00048562968744418665,
"loss": 5.3921,
"mean_token_accuracy": 0.16558818370103837,
"num_tokens": 26739588.0,
"step": 14500
},
{
"entropy": 5.671893072128296,
"epoch": 1.2186095358118043,
"grad_norm": 1.4921875,
"learning_rate": 0.0004856191549060828,
"loss": 5.4964,
"mean_token_accuracy": 0.1626824587583542,
"num_tokens": 26748889.0,
"step": 14505
},
{
"entropy": 5.732660531997681,
"epoch": 1.219029615626969,
"grad_norm": 1.484375,
"learning_rate": 0.00048560861863696913,
"loss": 5.4597,
"mean_token_accuracy": 0.16027596443891526,
"num_tokens": 26757979.0,
"step": 14510
},
{
"entropy": 5.643172311782837,
"epoch": 1.219449695442134,
"grad_norm": 1.40625,
"learning_rate": 0.0004855980786370322,
"loss": 5.3682,
"mean_token_accuracy": 0.17041491568088532,
"num_tokens": 26767225.0,
"step": 14515
},
{
"entropy": 5.515580940246582,
"epoch": 1.219869775257299,
"grad_norm": 1.34375,
"learning_rate": 0.0004855875349064588,
"loss": 5.3028,
"mean_token_accuracy": 0.17166967391967775,
"num_tokens": 26776289.0,
"step": 14520
},
{
"entropy": 5.628920555114746,
"epoch": 1.2202898550724637,
"grad_norm": 1.5390625,
"learning_rate": 0.0004855769874454356,
"loss": 5.4263,
"mean_token_accuracy": 0.16554307341575622,
"num_tokens": 26785631.0,
"step": 14525
},
{
"entropy": 5.6312535285949705,
"epoch": 1.2207099348876287,
"grad_norm": 1.390625,
"learning_rate": 0.0004855664362541495,
"loss": 5.4299,
"mean_token_accuracy": 0.1641710191965103,
"num_tokens": 26795285.0,
"step": 14530
},
{
"entropy": 5.590521383285522,
"epoch": 1.2211300147027935,
"grad_norm": 1.5390625,
"learning_rate": 0.00048555588133278744,
"loss": 5.3621,
"mean_token_accuracy": 0.16726839244365693,
"num_tokens": 26804584.0,
"step": 14535
},
{
"entropy": 5.49451699256897,
"epoch": 1.2215500945179585,
"grad_norm": 1.265625,
"learning_rate": 0.0004855453226815363,
"loss": 5.246,
"mean_token_accuracy": 0.17162280678749084,
"num_tokens": 26814354.0,
"step": 14540
},
{
"entropy": 5.510982656478882,
"epoch": 1.2219701743331233,
"grad_norm": 1.84375,
"learning_rate": 0.00048553476030058326,
"loss": 5.253,
"mean_token_accuracy": 0.17902337461709977,
"num_tokens": 26824274.0,
"step": 14545
},
{
"entropy": 5.539727735519409,
"epoch": 1.222390254148288,
"grad_norm": 1.3984375,
"learning_rate": 0.00048552419419011536,
"loss": 5.3939,
"mean_token_accuracy": 0.16667712926864625,
"num_tokens": 26833155.0,
"step": 14550
},
{
"entropy": 5.56984052658081,
"epoch": 1.222810333963453,
"grad_norm": 1.3359375,
"learning_rate": 0.0004855136243503196,
"loss": 5.3381,
"mean_token_accuracy": 0.1722673550248146,
"num_tokens": 26842545.0,
"step": 14555
},
{
"entropy": 5.685318660736084,
"epoch": 1.2232304137786179,
"grad_norm": 1.4921875,
"learning_rate": 0.00048550305078138363,
"loss": 5.3956,
"mean_token_accuracy": 0.17062428295612336,
"num_tokens": 26851772.0,
"step": 14560
},
{
"entropy": 5.568029260635376,
"epoch": 1.2236504935937829,
"grad_norm": 1.6171875,
"learning_rate": 0.00048549247348349435,
"loss": 5.2793,
"mean_token_accuracy": 0.174759641289711,
"num_tokens": 26860884.0,
"step": 14565
},
{
"entropy": 5.564513111114502,
"epoch": 1.2240705734089476,
"grad_norm": 1.3125,
"learning_rate": 0.00048548189245683934,
"loss": 5.4267,
"mean_token_accuracy": 0.16911133825778962,
"num_tokens": 26869435.0,
"step": 14570
},
{
"entropy": 5.592051601409912,
"epoch": 1.2244906532241127,
"grad_norm": 1.28125,
"learning_rate": 0.00048547130770160596,
"loss": 5.3124,
"mean_token_accuracy": 0.16932346522808076,
"num_tokens": 26878852.0,
"step": 14575
},
{
"entropy": 5.577689075469971,
"epoch": 1.2249107330392774,
"grad_norm": 1.421875,
"learning_rate": 0.0004854607192179817,
"loss": 5.2905,
"mean_token_accuracy": 0.17675579339265823,
"num_tokens": 26887532.0,
"step": 14580
},
{
"entropy": 5.715707492828369,
"epoch": 1.2253308128544425,
"grad_norm": 1.46875,
"learning_rate": 0.0004854501270061543,
"loss": 5.5296,
"mean_token_accuracy": 0.16146958619356155,
"num_tokens": 26897459.0,
"step": 14585
},
{
"entropy": 5.579214477539063,
"epoch": 1.2257508926696072,
"grad_norm": 1.3203125,
"learning_rate": 0.00048543953106631115,
"loss": 5.2892,
"mean_token_accuracy": 0.17801006734371186,
"num_tokens": 26907156.0,
"step": 14590
},
{
"entropy": 5.656476449966431,
"epoch": 1.226170972484772,
"grad_norm": 1.75,
"learning_rate": 0.0004854289313986401,
"loss": 5.3707,
"mean_token_accuracy": 0.17163809537887573,
"num_tokens": 26915764.0,
"step": 14595
},
{
"entropy": 5.503813743591309,
"epoch": 1.226591052299937,
"grad_norm": 1.3515625,
"learning_rate": 0.0004854183280033289,
"loss": 5.2498,
"mean_token_accuracy": 0.17583397030830383,
"num_tokens": 26924166.0,
"step": 14600
},
{
"entropy": 5.597974634170532,
"epoch": 1.2270111321151018,
"grad_norm": 1.3828125,
"learning_rate": 0.0004854077208805654,
"loss": 5.4538,
"mean_token_accuracy": 0.16351360082626343,
"num_tokens": 26933546.0,
"step": 14605
},
{
"entropy": 5.665269470214843,
"epoch": 1.2274312119302668,
"grad_norm": 1.390625,
"learning_rate": 0.0004853971100305374,
"loss": 5.4106,
"mean_token_accuracy": 0.17088524252176285,
"num_tokens": 26943213.0,
"step": 14610
},
{
"entropy": 5.642615509033203,
"epoch": 1.2278512917454316,
"grad_norm": 1.34375,
"learning_rate": 0.000485386495453433,
"loss": 5.3828,
"mean_token_accuracy": 0.17016868144273758,
"num_tokens": 26952968.0,
"step": 14615
},
{
"entropy": 5.602711772918701,
"epoch": 1.2282713715605964,
"grad_norm": 1.265625,
"learning_rate": 0.00048537587714944007,
"loss": 5.3617,
"mean_token_accuracy": 0.17121365219354628,
"num_tokens": 26962230.0,
"step": 14620
},
{
"entropy": 5.569738960266113,
"epoch": 1.2286914513757614,
"grad_norm": 1.75,
"learning_rate": 0.0004853652551187469,
"loss": 5.4234,
"mean_token_accuracy": 0.1704752415418625,
"num_tokens": 26970985.0,
"step": 14625
},
{
"entropy": 5.595652675628662,
"epoch": 1.2291115311909262,
"grad_norm": 1.328125,
"learning_rate": 0.00048535462936154147,
"loss": 5.4432,
"mean_token_accuracy": 0.16540713161230086,
"num_tokens": 26981138.0,
"step": 14630
},
{
"entropy": 5.571217012405396,
"epoch": 1.2295316110060912,
"grad_norm": 1.3515625,
"learning_rate": 0.0004853439998780122,
"loss": 5.2923,
"mean_token_accuracy": 0.17418538480997087,
"num_tokens": 26990158.0,
"step": 14635
},
{
"entropy": 5.555435848236084,
"epoch": 1.229951690821256,
"grad_norm": 1.5703125,
"learning_rate": 0.0004853333666683472,
"loss": 5.4524,
"mean_token_accuracy": 0.15979965180158615,
"num_tokens": 26998889.0,
"step": 14640
},
{
"entropy": 5.5809015274047855,
"epoch": 1.230371770636421,
"grad_norm": 1.453125,
"learning_rate": 0.00048532272973273496,
"loss": 5.3746,
"mean_token_accuracy": 0.1682994320988655,
"num_tokens": 27008912.0,
"step": 14645
},
{
"entropy": 5.584909915924072,
"epoch": 1.2307918504515858,
"grad_norm": 1.5234375,
"learning_rate": 0.00048531208907136384,
"loss": 5.2717,
"mean_token_accuracy": 0.1803266391158104,
"num_tokens": 27017573.0,
"step": 14650
},
{
"entropy": 5.545006895065308,
"epoch": 1.2312119302667508,
"grad_norm": 1.34375,
"learning_rate": 0.00048530144468442236,
"loss": 5.3466,
"mean_token_accuracy": 0.16579411923885345,
"num_tokens": 27027205.0,
"step": 14655
},
{
"entropy": 5.5912316799163815,
"epoch": 1.2316320100819156,
"grad_norm": 1.5,
"learning_rate": 0.00048529079657209906,
"loss": 5.2963,
"mean_token_accuracy": 0.17231107950210572,
"num_tokens": 27035882.0,
"step": 14660
},
{
"entropy": 5.567312002182007,
"epoch": 1.2320520898970804,
"grad_norm": 1.578125,
"learning_rate": 0.0004852801447345826,
"loss": 5.3731,
"mean_token_accuracy": 0.17508108913898468,
"num_tokens": 27044761.0,
"step": 14665
},
{
"entropy": 5.584702253341675,
"epoch": 1.2324721697122454,
"grad_norm": 1.5390625,
"learning_rate": 0.0004852694891720617,
"loss": 5.4038,
"mean_token_accuracy": 0.16734053641557695,
"num_tokens": 27054149.0,
"step": 14670
},
{
"entropy": 5.618700885772705,
"epoch": 1.2328922495274102,
"grad_norm": 1.3828125,
"learning_rate": 0.000485258829884725,
"loss": 5.4268,
"mean_token_accuracy": 0.16779596209526063,
"num_tokens": 27063145.0,
"step": 14675
},
{
"entropy": 5.663679790496826,
"epoch": 1.2333123293425752,
"grad_norm": 1.453125,
"learning_rate": 0.0004852481668727614,
"loss": 5.4075,
"mean_token_accuracy": 0.16674899011850358,
"num_tokens": 27072378.0,
"step": 14680
},
{
"entropy": 5.482738351821899,
"epoch": 1.23373240915774,
"grad_norm": 1.515625,
"learning_rate": 0.00048523750013635986,
"loss": 5.2766,
"mean_token_accuracy": 0.16852893084287643,
"num_tokens": 27082241.0,
"step": 14685
},
{
"entropy": 5.538816165924072,
"epoch": 1.2341524889729047,
"grad_norm": 1.484375,
"learning_rate": 0.0004852268296757092,
"loss": 5.2809,
"mean_token_accuracy": 0.1719914510846138,
"num_tokens": 27091488.0,
"step": 14690
},
{
"entropy": 5.646978235244751,
"epoch": 1.2345725687880698,
"grad_norm": 1.40625,
"learning_rate": 0.0004852161554909985,
"loss": 5.3468,
"mean_token_accuracy": 0.17167492508888244,
"num_tokens": 27100378.0,
"step": 14695
},
{
"entropy": 5.596823120117188,
"epoch": 1.2349926486032345,
"grad_norm": 1.3203125,
"learning_rate": 0.00048520547758241686,
"loss": 5.3657,
"mean_token_accuracy": 0.1682916134595871,
"num_tokens": 27110341.0,
"step": 14700
},
{
"entropy": 5.5891765594482425,
"epoch": 1.2354127284183996,
"grad_norm": 1.359375,
"learning_rate": 0.00048519479595015343,
"loss": 5.301,
"mean_token_accuracy": 0.16862540543079377,
"num_tokens": 27119381.0,
"step": 14705
},
{
"entropy": 5.552208662033081,
"epoch": 1.2358328082335643,
"grad_norm": 1.3359375,
"learning_rate": 0.00048518411059439746,
"loss": 5.4066,
"mean_token_accuracy": 0.16474692076444625,
"num_tokens": 27129167.0,
"step": 14710
},
{
"entropy": 5.632622337341308,
"epoch": 1.2362528880487293,
"grad_norm": 1.453125,
"learning_rate": 0.00048517342151533813,
"loss": 5.4217,
"mean_token_accuracy": 0.16225503087043763,
"num_tokens": 27138479.0,
"step": 14715
},
{
"entropy": 5.650741481781006,
"epoch": 1.2366729678638941,
"grad_norm": 1.421875,
"learning_rate": 0.0004851627287131649,
"loss": 5.3093,
"mean_token_accuracy": 0.17363323420286178,
"num_tokens": 27147197.0,
"step": 14720
},
{
"entropy": 5.559444570541382,
"epoch": 1.2370930476790591,
"grad_norm": 1.453125,
"learning_rate": 0.0004851520321880672,
"loss": 5.3244,
"mean_token_accuracy": 0.17298350632190704,
"num_tokens": 27155854.0,
"step": 14725
},
{
"entropy": 5.521408653259277,
"epoch": 1.237513127494224,
"grad_norm": 1.765625,
"learning_rate": 0.0004851413319402344,
"loss": 5.2974,
"mean_token_accuracy": 0.17052881568670272,
"num_tokens": 27165069.0,
"step": 14730
},
{
"entropy": 5.574164962768554,
"epoch": 1.2379332073093887,
"grad_norm": 1.5703125,
"learning_rate": 0.0004851306279698561,
"loss": 5.3423,
"mean_token_accuracy": 0.16862837523221968,
"num_tokens": 27174070.0,
"step": 14735
},
{
"entropy": 5.6579419612884525,
"epoch": 1.2383532871245537,
"grad_norm": 1.375,
"learning_rate": 0.0004851199202771219,
"loss": 5.4081,
"mean_token_accuracy": 0.17232888638973237,
"num_tokens": 27182903.0,
"step": 14740
},
{
"entropy": 5.600107097625733,
"epoch": 1.2387733669397185,
"grad_norm": 1.703125,
"learning_rate": 0.0004851092088622216,
"loss": 5.3261,
"mean_token_accuracy": 0.17474351972341537,
"num_tokens": 27192747.0,
"step": 14745
},
{
"entropy": 5.612848854064941,
"epoch": 1.2391934467548835,
"grad_norm": 2.484375,
"learning_rate": 0.0004850984937253448,
"loss": 5.3556,
"mean_token_accuracy": 0.17478209882974624,
"num_tokens": 27201657.0,
"step": 14750
},
{
"entropy": 5.615834140777588,
"epoch": 1.2396135265700483,
"grad_norm": 1.515625,
"learning_rate": 0.0004850877748666814,
"loss": 5.3847,
"mean_token_accuracy": 0.16818059235811234,
"num_tokens": 27211794.0,
"step": 14755
},
{
"entropy": 5.55868353843689,
"epoch": 1.240033606385213,
"grad_norm": 1.6015625,
"learning_rate": 0.00048507705228642117,
"loss": 5.3335,
"mean_token_accuracy": 0.1659922033548355,
"num_tokens": 27221852.0,
"step": 14760
},
{
"entropy": 5.535787153244018,
"epoch": 1.240453686200378,
"grad_norm": 1.3203125,
"learning_rate": 0.0004850663259847542,
"loss": 5.3732,
"mean_token_accuracy": 0.16893673092126846,
"num_tokens": 27231558.0,
"step": 14765
},
{
"entropy": 5.583010578155518,
"epoch": 1.240873766015543,
"grad_norm": 1.375,
"learning_rate": 0.00048505559596187037,
"loss": 5.367,
"mean_token_accuracy": 0.1675555646419525,
"num_tokens": 27241053.0,
"step": 14770
},
{
"entropy": 5.5506453037261965,
"epoch": 1.241293845830708,
"grad_norm": 1.40625,
"learning_rate": 0.0004850448622179599,
"loss": 5.2436,
"mean_token_accuracy": 0.1718742474913597,
"num_tokens": 27249770.0,
"step": 14775
},
{
"entropy": 5.689253664016723,
"epoch": 1.2417139256458727,
"grad_norm": 1.3984375,
"learning_rate": 0.0004850341247532128,
"loss": 5.4902,
"mean_token_accuracy": 0.16661347299814225,
"num_tokens": 27258883.0,
"step": 14780
},
{
"entropy": 5.706023788452148,
"epoch": 1.2421340054610377,
"grad_norm": 1.3828125,
"learning_rate": 0.0004850233835678194,
"loss": 5.4073,
"mean_token_accuracy": 0.16898039877414703,
"num_tokens": 27268056.0,
"step": 14785
},
{
"entropy": 5.5874651908874515,
"epoch": 1.2425540852762025,
"grad_norm": 1.2734375,
"learning_rate": 0.0004850126386619699,
"loss": 5.2741,
"mean_token_accuracy": 0.17798781841993333,
"num_tokens": 27276965.0,
"step": 14790
},
{
"entropy": 5.542857933044433,
"epoch": 1.2429741650913673,
"grad_norm": 1.234375,
"learning_rate": 0.0004850018900358545,
"loss": 5.3366,
"mean_token_accuracy": 0.172660693526268,
"num_tokens": 27286173.0,
"step": 14795
},
{
"entropy": 5.521266126632691,
"epoch": 1.2433942449065323,
"grad_norm": 1.2734375,
"learning_rate": 0.00048499113768966386,
"loss": 5.3327,
"mean_token_accuracy": 0.17302975356578826,
"num_tokens": 27294863.0,
"step": 14800
},
{
"entropy": 5.655036687850952,
"epoch": 1.243814324721697,
"grad_norm": 1.3203125,
"learning_rate": 0.0004849803816235884,
"loss": 5.3766,
"mean_token_accuracy": 0.16847763061523438,
"num_tokens": 27304427.0,
"step": 14805
},
{
"entropy": 5.688740253448486,
"epoch": 1.244234404536862,
"grad_norm": 1.3984375,
"learning_rate": 0.0004849696218378185,
"loss": 5.4333,
"mean_token_accuracy": 0.17155794352293013,
"num_tokens": 27313716.0,
"step": 14810
},
{
"entropy": 5.617390871047974,
"epoch": 1.2446544843520269,
"grad_norm": 1.421875,
"learning_rate": 0.0004849588583325449,
"loss": 5.3292,
"mean_token_accuracy": 0.18261649310588837,
"num_tokens": 27322342.0,
"step": 14815
},
{
"entropy": 5.616417503356933,
"epoch": 1.2450745641671919,
"grad_norm": 1.3671875,
"learning_rate": 0.0004849480911079583,
"loss": 5.4174,
"mean_token_accuracy": 0.15718486905097961,
"num_tokens": 27331892.0,
"step": 14820
},
{
"entropy": 5.610023641586304,
"epoch": 1.2454946439823567,
"grad_norm": 1.3671875,
"learning_rate": 0.0004849373201642493,
"loss": 5.3836,
"mean_token_accuracy": 0.1622963383793831,
"num_tokens": 27340428.0,
"step": 14825
},
{
"entropy": 5.621714544296265,
"epoch": 1.2459147237975214,
"grad_norm": 1.3046875,
"learning_rate": 0.0004849265455016088,
"loss": 5.3862,
"mean_token_accuracy": 0.1723904699087143,
"num_tokens": 27349224.0,
"step": 14830
},
{
"entropy": 5.606583881378174,
"epoch": 1.2463348036126864,
"grad_norm": 1.34375,
"learning_rate": 0.0004849157671202277,
"loss": 5.3828,
"mean_token_accuracy": 0.1659926727414131,
"num_tokens": 27357480.0,
"step": 14835
},
{
"entropy": 5.552874517440796,
"epoch": 1.2467548834278512,
"grad_norm": 1.375,
"learning_rate": 0.0004849049850202968,
"loss": 5.2868,
"mean_token_accuracy": 0.17431298196315764,
"num_tokens": 27366732.0,
"step": 14840
},
{
"entropy": 5.611221361160278,
"epoch": 1.2471749632430162,
"grad_norm": 1.5234375,
"learning_rate": 0.0004848941992020072,
"loss": 5.3869,
"mean_token_accuracy": 0.16717041432857513,
"num_tokens": 27375834.0,
"step": 14845
},
{
"entropy": 5.643020820617676,
"epoch": 1.247595043058181,
"grad_norm": 1.46875,
"learning_rate": 0.0004848834096655499,
"loss": 5.3826,
"mean_token_accuracy": 0.17003615349531173,
"num_tokens": 27385311.0,
"step": 14850
},
{
"entropy": 5.604761075973511,
"epoch": 1.2480151228733458,
"grad_norm": 1.3984375,
"learning_rate": 0.00048487261641111607,
"loss": 5.4224,
"mean_token_accuracy": 0.168009015917778,
"num_tokens": 27394587.0,
"step": 14855
},
{
"entropy": 5.527986001968384,
"epoch": 1.2484352026885108,
"grad_norm": 1.2578125,
"learning_rate": 0.000484861819438897,
"loss": 5.2958,
"mean_token_accuracy": 0.16891696453094482,
"num_tokens": 27403316.0,
"step": 14860
},
{
"entropy": 5.620045137405396,
"epoch": 1.2488552825036756,
"grad_norm": 1.390625,
"learning_rate": 0.0004848510187490838,
"loss": 5.3531,
"mean_token_accuracy": 0.1735784664750099,
"num_tokens": 27412709.0,
"step": 14865
},
{
"entropy": 5.631797504425049,
"epoch": 1.2492753623188406,
"grad_norm": 1.4453125,
"learning_rate": 0.0004848402143418679,
"loss": 5.3955,
"mean_token_accuracy": 0.16247193217277528,
"num_tokens": 27422004.0,
"step": 14870
},
{
"entropy": 5.619765853881836,
"epoch": 1.2496954421340054,
"grad_norm": 1.4375,
"learning_rate": 0.00048482940621744053,
"loss": 5.428,
"mean_token_accuracy": 0.1660287693142891,
"num_tokens": 27431931.0,
"step": 14875
},
{
"entropy": 5.5691510200500485,
"epoch": 1.2501155219491704,
"grad_norm": 1.1875,
"learning_rate": 0.0004848185943759934,
"loss": 5.2593,
"mean_token_accuracy": 0.17852565497159958,
"num_tokens": 27441527.0,
"step": 14880
},
{
"entropy": 5.679601383209229,
"epoch": 1.2505356017643352,
"grad_norm": 1.5,
"learning_rate": 0.00048480777881771786,
"loss": 5.4032,
"mean_token_accuracy": 0.17205158323049546,
"num_tokens": 27449964.0,
"step": 14885
},
{
"entropy": 5.577618885040283,
"epoch": 1.2509556815795002,
"grad_norm": 1.375,
"learning_rate": 0.0004847969595428056,
"loss": 5.4068,
"mean_token_accuracy": 0.16642248779535293,
"num_tokens": 27459044.0,
"step": 14890
},
{
"entropy": 5.516439294815063,
"epoch": 1.251375761394665,
"grad_norm": 1.34375,
"learning_rate": 0.00048478613655144817,
"loss": 5.3848,
"mean_token_accuracy": 0.17271924316883086,
"num_tokens": 27467644.0,
"step": 14895
},
{
"entropy": 5.647580671310425,
"epoch": 1.2517958412098298,
"grad_norm": 1.4609375,
"learning_rate": 0.0004847753098438374,
"loss": 5.4163,
"mean_token_accuracy": 0.1645412638783455,
"num_tokens": 27476899.0,
"step": 14900
},
{
"entropy": 5.652610969543457,
"epoch": 1.2522159210249948,
"grad_norm": 1.328125,
"learning_rate": 0.000484764479420165,
"loss": 5.3022,
"mean_token_accuracy": 0.17347924262285233,
"num_tokens": 27485167.0,
"step": 14905
},
{
"entropy": 5.605359888076782,
"epoch": 1.2526360008401596,
"grad_norm": 2.125,
"learning_rate": 0.00048475364528062287,
"loss": 5.3489,
"mean_token_accuracy": 0.16680347472429274,
"num_tokens": 27493986.0,
"step": 14910
},
{
"entropy": 5.598525142669677,
"epoch": 1.2530560806553246,
"grad_norm": 1.40625,
"learning_rate": 0.0004847428074254029,
"loss": 5.3958,
"mean_token_accuracy": 0.17133786529302597,
"num_tokens": 27503896.0,
"step": 14915
},
{
"entropy": 5.594638013839722,
"epoch": 1.2534761604704894,
"grad_norm": 1.484375,
"learning_rate": 0.00048473196585469713,
"loss": 5.3445,
"mean_token_accuracy": 0.17187246382236482,
"num_tokens": 27513485.0,
"step": 14920
},
{
"entropy": 5.675473594665528,
"epoch": 1.2538962402856542,
"grad_norm": 1.3359375,
"learning_rate": 0.00048472112056869763,
"loss": 5.4025,
"mean_token_accuracy": 0.16582921296358108,
"num_tokens": 27523164.0,
"step": 14925
},
{
"entropy": 5.656335401535034,
"epoch": 1.2543163201008192,
"grad_norm": 1.3359375,
"learning_rate": 0.0004847102715675964,
"loss": 5.3543,
"mean_token_accuracy": 0.1656375214457512,
"num_tokens": 27531387.0,
"step": 14930
},
{
"entropy": 5.529454326629638,
"epoch": 1.254736399915984,
"grad_norm": 1.546875,
"learning_rate": 0.0004846994188515857,
"loss": 5.3584,
"mean_token_accuracy": 0.1758265182375908,
"num_tokens": 27541754.0,
"step": 14935
},
{
"entropy": 5.71507420539856,
"epoch": 1.255156479731149,
"grad_norm": 1.2265625,
"learning_rate": 0.0004846885624208578,
"loss": 5.4257,
"mean_token_accuracy": 0.1660853862762451,
"num_tokens": 27551458.0,
"step": 14940
},
{
"entropy": 5.602577018737793,
"epoch": 1.2555765595463138,
"grad_norm": 1.546875,
"learning_rate": 0.000484677702275605,
"loss": 5.3479,
"mean_token_accuracy": 0.17095886170864105,
"num_tokens": 27560797.0,
"step": 14945
},
{
"entropy": 5.6042085647583,
"epoch": 1.2559966393614788,
"grad_norm": 1.421875,
"learning_rate": 0.00048466683841601963,
"loss": 5.3411,
"mean_token_accuracy": 0.17054116278886794,
"num_tokens": 27570166.0,
"step": 14950
},
{
"entropy": 5.517857694625855,
"epoch": 1.2564167191766435,
"grad_norm": 1.3203125,
"learning_rate": 0.00048465597084229416,
"loss": 5.2558,
"mean_token_accuracy": 0.17219745814800264,
"num_tokens": 27579411.0,
"step": 14955
},
{
"entropy": 5.601072359085083,
"epoch": 1.2568367989918086,
"grad_norm": 1.3984375,
"learning_rate": 0.0004846450995546212,
"loss": 5.4983,
"mean_token_accuracy": 0.16506696194410325,
"num_tokens": 27589124.0,
"step": 14960
},
{
"entropy": 5.6472714900970455,
"epoch": 1.2572568788069733,
"grad_norm": 1.5390625,
"learning_rate": 0.0004846342245531932,
"loss": 5.464,
"mean_token_accuracy": 0.16000686436891556,
"num_tokens": 27598664.0,
"step": 14965
},
{
"entropy": 5.678275871276855,
"epoch": 1.2576769586221381,
"grad_norm": 1.4921875,
"learning_rate": 0.0004846233458382029,
"loss": 5.3874,
"mean_token_accuracy": 0.1715864896774292,
"num_tokens": 27607189.0,
"step": 14970
},
{
"entropy": 5.669829320907593,
"epoch": 1.2580970384373031,
"grad_norm": 1.5703125,
"learning_rate": 0.00048461246340984293,
"loss": 5.4133,
"mean_token_accuracy": 0.16904016584157944,
"num_tokens": 27616415.0,
"step": 14975
},
{
"entropy": 5.568628215789795,
"epoch": 1.258517118252468,
"grad_norm": 1.375,
"learning_rate": 0.0004846015772683061,
"loss": 5.3917,
"mean_token_accuracy": 0.17225440591573715,
"num_tokens": 27624492.0,
"step": 14980
},
{
"entropy": 5.503430938720703,
"epoch": 1.258937198067633,
"grad_norm": 1.328125,
"learning_rate": 0.00048459068741378526,
"loss": 5.284,
"mean_token_accuracy": 0.17117979377508163,
"num_tokens": 27634243.0,
"step": 14985
},
{
"entropy": 5.620176553726196,
"epoch": 1.2593572778827977,
"grad_norm": 1.3515625,
"learning_rate": 0.0004845797938464734,
"loss": 5.3965,
"mean_token_accuracy": 0.17155803442001344,
"num_tokens": 27642887.0,
"step": 14990
},
{
"entropy": 5.611432361602783,
"epoch": 1.2597773576979625,
"grad_norm": 1.453125,
"learning_rate": 0.0004845688965665633,
"loss": 5.4007,
"mean_token_accuracy": 0.16408128738403321,
"num_tokens": 27652524.0,
"step": 14995
},
{
"entropy": 5.605330944061279,
"epoch": 1.2601974375131275,
"grad_norm": 1.4921875,
"learning_rate": 0.00048455799557424814,
"loss": 5.2526,
"mean_token_accuracy": 0.18249338418245314,
"num_tokens": 27661306.0,
"step": 15000
},
{
"epoch": 1.2601974375131275,
"eval_entropy": 5.487832695583898,
"eval_loss": 5.448311805725098,
"eval_mean_token_accuracy": 0.17504042461099753,
"eval_num_tokens": 27661306.0,
"eval_runtime": 27.5042,
"eval_samples_per_second": 1358.558,
"eval_steps_per_second": 169.829,
"step": 15000
},
{
"entropy": 5.662668323516845,
"epoch": 1.2606175173282923,
"grad_norm": 1.296875,
"learning_rate": 0.0004845470908697209,
"loss": 5.4405,
"mean_token_accuracy": 0.17284592539072036,
"num_tokens": 27671728.0,
"step": 15005
},
{
"entropy": 5.541242790222168,
"epoch": 1.2610375971434573,
"grad_norm": 1.390625,
"learning_rate": 0.000484536182453175,
"loss": 5.2564,
"mean_token_accuracy": 0.17675976157188417,
"num_tokens": 27680740.0,
"step": 15010
},
{
"entropy": 5.560620641708374,
"epoch": 1.261457676958622,
"grad_norm": 1.5546875,
"learning_rate": 0.0004845252703248035,
"loss": 5.3236,
"mean_token_accuracy": 0.1745853379368782,
"num_tokens": 27689865.0,
"step": 15015
},
{
"entropy": 5.602686214447021,
"epoch": 1.2618777567737869,
"grad_norm": 1.4140625,
"learning_rate": 0.0004845143544847997,
"loss": 5.344,
"mean_token_accuracy": 0.17240457534790038,
"num_tokens": 27700366.0,
"step": 15020
},
{
"entropy": 5.6088262557983395,
"epoch": 1.262297836588952,
"grad_norm": 1.4453125,
"learning_rate": 0.00048450343493335697,
"loss": 5.2669,
"mean_token_accuracy": 0.17327335923910142,
"num_tokens": 27708893.0,
"step": 15025
},
{
"entropy": 5.5244677543640135,
"epoch": 1.262717916404117,
"grad_norm": 1.4921875,
"learning_rate": 0.0004844925116706688,
"loss": 5.2864,
"mean_token_accuracy": 0.16837399303913117,
"num_tokens": 27717494.0,
"step": 15030
},
{
"entropy": 5.472778987884522,
"epoch": 1.2631379962192817,
"grad_norm": 1.578125,
"learning_rate": 0.00048448158469692866,
"loss": 5.2256,
"mean_token_accuracy": 0.185105562210083,
"num_tokens": 27726487.0,
"step": 15035
},
{
"entropy": 5.6607684135437015,
"epoch": 1.2635580760344465,
"grad_norm": 1.6328125,
"learning_rate": 0.0004844706540123301,
"loss": 5.4316,
"mean_token_accuracy": 0.1625412255525589,
"num_tokens": 27736602.0,
"step": 15040
},
{
"entropy": 5.723721265792847,
"epoch": 1.2639781558496115,
"grad_norm": 1.59375,
"learning_rate": 0.00048445971961706675,
"loss": 5.4587,
"mean_token_accuracy": 0.16471450924873351,
"num_tokens": 27746322.0,
"step": 15045
},
{
"entropy": 5.543186521530151,
"epoch": 1.2643982356647763,
"grad_norm": 1.4921875,
"learning_rate": 0.0004844487815113323,
"loss": 5.3143,
"mean_token_accuracy": 0.17797768414020537,
"num_tokens": 27754941.0,
"step": 15050
},
{
"entropy": 5.565147113800049,
"epoch": 1.2648183154799413,
"grad_norm": 1.3671875,
"learning_rate": 0.0004844378396953206,
"loss": 5.3657,
"mean_token_accuracy": 0.16895974725484847,
"num_tokens": 27763941.0,
"step": 15055
},
{
"entropy": 5.639039325714111,
"epoch": 1.265238395295106,
"grad_norm": 1.3828125,
"learning_rate": 0.00048442689416922536,
"loss": 5.3918,
"mean_token_accuracy": 0.17052255123853682,
"num_tokens": 27773087.0,
"step": 15060
},
{
"entropy": 5.4961700439453125,
"epoch": 1.2656584751102709,
"grad_norm": 1.4609375,
"learning_rate": 0.00048441594493324057,
"loss": 5.2178,
"mean_token_accuracy": 0.18083741068840026,
"num_tokens": 27782648.0,
"step": 15065
},
{
"entropy": 5.520091819763183,
"epoch": 1.2660785549254359,
"grad_norm": 1.953125,
"learning_rate": 0.00048440499198756015,
"loss": 5.4132,
"mean_token_accuracy": 0.1683144211769104,
"num_tokens": 27791567.0,
"step": 15070
},
{
"entropy": 5.607470178604126,
"epoch": 1.2664986347406006,
"grad_norm": 1.5390625,
"learning_rate": 0.00048439403533237816,
"loss": 5.4149,
"mean_token_accuracy": 0.16157330721616744,
"num_tokens": 27801397.0,
"step": 15075
},
{
"entropy": 5.700303411483764,
"epoch": 1.2669187145557657,
"grad_norm": 1.34375,
"learning_rate": 0.0004843830749678886,
"loss": 5.4321,
"mean_token_accuracy": 0.16523478776216508,
"num_tokens": 27810831.0,
"step": 15080
},
{
"entropy": 5.551168394088745,
"epoch": 1.2673387943709304,
"grad_norm": 1.40625,
"learning_rate": 0.0004843721108942856,
"loss": 5.3269,
"mean_token_accuracy": 0.16938031911849977,
"num_tokens": 27819591.0,
"step": 15085
},
{
"entropy": 5.537439775466919,
"epoch": 1.2677588741860952,
"grad_norm": 1.46875,
"learning_rate": 0.0004843611431117636,
"loss": 5.3289,
"mean_token_accuracy": 0.17461262345314027,
"num_tokens": 27828614.0,
"step": 15090
},
{
"entropy": 5.593230390548706,
"epoch": 1.2681789540012602,
"grad_norm": 1.3828125,
"learning_rate": 0.0004843501716205167,
"loss": 5.3679,
"mean_token_accuracy": 0.16919135600328444,
"num_tokens": 27837549.0,
"step": 15095
},
{
"entropy": 5.6492126941680905,
"epoch": 1.2685990338164252,
"grad_norm": 1.515625,
"learning_rate": 0.0004843391964207393,
"loss": 5.3709,
"mean_token_accuracy": 0.1663505345582962,
"num_tokens": 27846678.0,
"step": 15100
},
{
"entropy": 5.684939861297607,
"epoch": 1.26901911363159,
"grad_norm": 1.421875,
"learning_rate": 0.0004843282175126258,
"loss": 5.3909,
"mean_token_accuracy": 0.17155635207891465,
"num_tokens": 27855734.0,
"step": 15105
},
{
"entropy": 5.617116832733155,
"epoch": 1.2694391934467548,
"grad_norm": 1.4296875,
"learning_rate": 0.00048431723489637086,
"loss": 5.3356,
"mean_token_accuracy": 0.17420419603586196,
"num_tokens": 27865111.0,
"step": 15110
},
{
"entropy": 5.567612361907959,
"epoch": 1.2698592732619198,
"grad_norm": 1.796875,
"learning_rate": 0.00048430624857216876,
"loss": 5.3467,
"mean_token_accuracy": 0.17068816274404525,
"num_tokens": 27874495.0,
"step": 15115
},
{
"entropy": 5.5786576747894285,
"epoch": 1.2702793530770846,
"grad_norm": 1.6875,
"learning_rate": 0.0004842952585402143,
"loss": 5.39,
"mean_token_accuracy": 0.1720960780978203,
"num_tokens": 27884531.0,
"step": 15120
},
{
"entropy": 5.556511497497558,
"epoch": 1.2706994328922496,
"grad_norm": 1.8125,
"learning_rate": 0.000484284264800702,
"loss": 5.2633,
"mean_token_accuracy": 0.17738655358552932,
"num_tokens": 27893463.0,
"step": 15125
},
{
"entropy": 5.616838836669922,
"epoch": 1.2711195127074144,
"grad_norm": 1.5859375,
"learning_rate": 0.00048427326735382687,
"loss": 5.3961,
"mean_token_accuracy": 0.16617247462272644,
"num_tokens": 27903015.0,
"step": 15130
},
{
"entropy": 5.605151844024658,
"epoch": 1.2715395925225792,
"grad_norm": 1.3203125,
"learning_rate": 0.0004842622661997834,
"loss": 5.3679,
"mean_token_accuracy": 0.17239933609962463,
"num_tokens": 27912207.0,
"step": 15135
},
{
"entropy": 5.640608310699463,
"epoch": 1.2719596723377442,
"grad_norm": 1.5,
"learning_rate": 0.0004842512613387668,
"loss": 5.3727,
"mean_token_accuracy": 0.16329355537891388,
"num_tokens": 27921566.0,
"step": 15140
},
{
"entropy": 5.552551126480102,
"epoch": 1.272379752152909,
"grad_norm": 1.3203125,
"learning_rate": 0.0004842402527709718,
"loss": 5.3231,
"mean_token_accuracy": 0.1740811884403229,
"num_tokens": 27930633.0,
"step": 15145
},
{
"entropy": 5.6511821269989015,
"epoch": 1.272799831968074,
"grad_norm": 1.4765625,
"learning_rate": 0.0004842292404965934,
"loss": 5.4351,
"mean_token_accuracy": 0.16572162210941316,
"num_tokens": 27939887.0,
"step": 15150
},
{
"entropy": 5.660265827178955,
"epoch": 1.2732199117832388,
"grad_norm": 1.484375,
"learning_rate": 0.0004842182245158268,
"loss": 5.4268,
"mean_token_accuracy": 0.170801804959774,
"num_tokens": 27949090.0,
"step": 15155
},
{
"entropy": 5.526915502548218,
"epoch": 1.2736399915984036,
"grad_norm": 1.46875,
"learning_rate": 0.00048420720482886715,
"loss": 5.2504,
"mean_token_accuracy": 0.17966016232967377,
"num_tokens": 27958141.0,
"step": 15160
},
{
"entropy": 5.50667896270752,
"epoch": 1.2740600714135686,
"grad_norm": 1.5234375,
"learning_rate": 0.0004841961814359095,
"loss": 5.3172,
"mean_token_accuracy": 0.17003074586391448,
"num_tokens": 27967780.0,
"step": 15165
},
{
"entropy": 5.671378660202026,
"epoch": 1.2744801512287336,
"grad_norm": 1.328125,
"learning_rate": 0.00048418515433714917,
"loss": 5.4021,
"mean_token_accuracy": 0.16783110350370406,
"num_tokens": 27976243.0,
"step": 15170
},
{
"entropy": 5.553921747207641,
"epoch": 1.2749002310438984,
"grad_norm": 1.46875,
"learning_rate": 0.0004841741235327817,
"loss": 5.2659,
"mean_token_accuracy": 0.17756874412298201,
"num_tokens": 27985874.0,
"step": 15175
},
{
"entropy": 5.669862508773804,
"epoch": 1.2753203108590632,
"grad_norm": 1.46875,
"learning_rate": 0.00048416308902300215,
"loss": 5.4961,
"mean_token_accuracy": 0.16234332621097564,
"num_tokens": 27995111.0,
"step": 15180
},
{
"entropy": 5.584555387496948,
"epoch": 1.2757403906742282,
"grad_norm": 1.453125,
"learning_rate": 0.0004841520508080063,
"loss": 5.3261,
"mean_token_accuracy": 0.1698221296072006,
"num_tokens": 28003948.0,
"step": 15185
},
{
"entropy": 5.5942200183868405,
"epoch": 1.276160470489393,
"grad_norm": 1.5703125,
"learning_rate": 0.00048414100888798957,
"loss": 5.3384,
"mean_token_accuracy": 0.17130068987607955,
"num_tokens": 28012941.0,
"step": 15190
},
{
"entropy": 5.539005184173584,
"epoch": 1.276580550304558,
"grad_norm": 1.375,
"learning_rate": 0.0004841299632631475,
"loss": 5.3271,
"mean_token_accuracy": 0.1688637003302574,
"num_tokens": 28022195.0,
"step": 15195
},
{
"entropy": 5.554776954650879,
"epoch": 1.2770006301197228,
"grad_norm": 1.4296875,
"learning_rate": 0.0004841189139336759,
"loss": 5.2663,
"mean_token_accuracy": 0.17250315546989442,
"num_tokens": 28031446.0,
"step": 15200
},
{
"entropy": 5.567613554000855,
"epoch": 1.2774207099348875,
"grad_norm": 1.3203125,
"learning_rate": 0.0004841078608997703,
"loss": 5.2854,
"mean_token_accuracy": 0.17581620216369628,
"num_tokens": 28040906.0,
"step": 15205
},
{
"entropy": 5.539995908737183,
"epoch": 1.2778407897500526,
"grad_norm": 1.3671875,
"learning_rate": 0.0004840968041616267,
"loss": 5.2897,
"mean_token_accuracy": 0.176302932202816,
"num_tokens": 28049848.0,
"step": 15210
},
{
"entropy": 5.562288093566894,
"epoch": 1.2782608695652173,
"grad_norm": 1.3984375,
"learning_rate": 0.00048408574371944094,
"loss": 5.3022,
"mean_token_accuracy": 0.17135927081108093,
"num_tokens": 28058276.0,
"step": 15215
},
{
"entropy": 5.608151054382324,
"epoch": 1.2786809493803823,
"grad_norm": 1.375,
"learning_rate": 0.0004840746795734088,
"loss": 5.411,
"mean_token_accuracy": 0.16826933175325393,
"num_tokens": 28068185.0,
"step": 15220
},
{
"entropy": 5.607448434829712,
"epoch": 1.2791010291955471,
"grad_norm": 1.46875,
"learning_rate": 0.0004840636117237264,
"loss": 5.4484,
"mean_token_accuracy": 0.16820820420980453,
"num_tokens": 28077532.0,
"step": 15225
},
{
"entropy": 5.541782855987549,
"epoch": 1.279521109010712,
"grad_norm": 1.4921875,
"learning_rate": 0.0004840525401705897,
"loss": 5.2958,
"mean_token_accuracy": 0.169663642346859,
"num_tokens": 28087593.0,
"step": 15230
},
{
"entropy": 5.612592506408691,
"epoch": 1.279941188825877,
"grad_norm": 1.5234375,
"learning_rate": 0.00048404146491419503,
"loss": 5.2859,
"mean_token_accuracy": 0.17592982202768326,
"num_tokens": 28096256.0,
"step": 15235
},
{
"entropy": 5.635091829299927,
"epoch": 1.2803612686410417,
"grad_norm": 1.515625,
"learning_rate": 0.00048403038595473837,
"loss": 5.3293,
"mean_token_accuracy": 0.1745913729071617,
"num_tokens": 28105048.0,
"step": 15240
},
{
"entropy": 5.560552406311035,
"epoch": 1.2807813484562067,
"grad_norm": 1.5625,
"learning_rate": 0.000484019303292416,
"loss": 5.3895,
"mean_token_accuracy": 0.15896713137626647,
"num_tokens": 28114330.0,
"step": 15245
},
{
"entropy": 5.5199511528015135,
"epoch": 1.2812014282713715,
"grad_norm": 1.359375,
"learning_rate": 0.00048400821692742434,
"loss": 5.3,
"mean_token_accuracy": 0.1790434643626213,
"num_tokens": 28123147.0,
"step": 15250
},
{
"entropy": 5.610697460174561,
"epoch": 1.2816215080865365,
"grad_norm": 1.6484375,
"learning_rate": 0.00048399712685995983,
"loss": 5.4288,
"mean_token_accuracy": 0.1721694827079773,
"num_tokens": 28132477.0,
"step": 15255
},
{
"entropy": 5.60445613861084,
"epoch": 1.2820415879017013,
"grad_norm": 1.4140625,
"learning_rate": 0.00048398603309021877,
"loss": 5.4024,
"mean_token_accuracy": 0.16869810074567795,
"num_tokens": 28141350.0,
"step": 15260
},
{
"entropy": 5.672561359405518,
"epoch": 1.2824616677168663,
"grad_norm": 1.6953125,
"learning_rate": 0.0004839749356183978,
"loss": 5.3686,
"mean_token_accuracy": 0.16765992641448973,
"num_tokens": 28149522.0,
"step": 15265
},
{
"entropy": 5.672017288208008,
"epoch": 1.282881747532031,
"grad_norm": 1.3671875,
"learning_rate": 0.0004839638344446933,
"loss": 5.469,
"mean_token_accuracy": 0.16339180618524551,
"num_tokens": 28159646.0,
"step": 15270
},
{
"entropy": 5.673255681991577,
"epoch": 1.283301827347196,
"grad_norm": 1.3828125,
"learning_rate": 0.0004839527295693023,
"loss": 5.3849,
"mean_token_accuracy": 0.17118140757083894,
"num_tokens": 28168408.0,
"step": 15275
},
{
"entropy": 5.619621467590332,
"epoch": 1.283721907162361,
"grad_norm": 1.4296875,
"learning_rate": 0.0004839416209924211,
"loss": 5.3823,
"mean_token_accuracy": 0.1675962209701538,
"num_tokens": 28177744.0,
"step": 15280
},
{
"entropy": 5.641964483261108,
"epoch": 1.2841419869775257,
"grad_norm": 1.4921875,
"learning_rate": 0.00048393050871424676,
"loss": 5.4314,
"mean_token_accuracy": 0.16861522048711777,
"num_tokens": 28186811.0,
"step": 15285
},
{
"entropy": 5.597791481018066,
"epoch": 1.2845620667926907,
"grad_norm": 1.4921875,
"learning_rate": 0.000483919392734976,
"loss": 5.4127,
"mean_token_accuracy": 0.170680233836174,
"num_tokens": 28197052.0,
"step": 15290
},
{
"entropy": 5.550778722763061,
"epoch": 1.2849821466078555,
"grad_norm": 1.6015625,
"learning_rate": 0.0004839082730548058,
"loss": 5.2639,
"mean_token_accuracy": 0.17900359034538268,
"num_tokens": 28206000.0,
"step": 15295
},
{
"entropy": 5.558641719818115,
"epoch": 1.2854022264230203,
"grad_norm": 1.578125,
"learning_rate": 0.0004838971496739331,
"loss": 5.2513,
"mean_token_accuracy": 0.17151403576135635,
"num_tokens": 28214679.0,
"step": 15300
},
{
"entropy": 5.5049355030059814,
"epoch": 1.2858223062381853,
"grad_norm": 1.4609375,
"learning_rate": 0.000483886022592555,
"loss": 5.3681,
"mean_token_accuracy": 0.16806211918592454,
"num_tokens": 28223890.0,
"step": 15305
},
{
"entropy": 5.550752401351929,
"epoch": 1.28624238605335,
"grad_norm": 1.6484375,
"learning_rate": 0.0004838748918108685,
"loss": 5.3088,
"mean_token_accuracy": 0.17612494379281998,
"num_tokens": 28232422.0,
"step": 15310
},
{
"entropy": 5.6166479110717775,
"epoch": 1.286662465868515,
"grad_norm": 1.4453125,
"learning_rate": 0.00048386375732907083,
"loss": 5.3488,
"mean_token_accuracy": 0.17347524911165238,
"num_tokens": 28242079.0,
"step": 15315
},
{
"entropy": 5.699514389038086,
"epoch": 1.2870825456836799,
"grad_norm": 1.3984375,
"learning_rate": 0.00048385261914735936,
"loss": 5.5226,
"mean_token_accuracy": 0.16434683352708818,
"num_tokens": 28252510.0,
"step": 15320
},
{
"entropy": 5.712462329864502,
"epoch": 1.2875026254988446,
"grad_norm": 1.421875,
"learning_rate": 0.00048384147726593125,
"loss": 5.4495,
"mean_token_accuracy": 0.1677544265985489,
"num_tokens": 28261348.0,
"step": 15325
},
{
"entropy": 5.624198532104492,
"epoch": 1.2879227053140097,
"grad_norm": 1.390625,
"learning_rate": 0.0004838303316849839,
"loss": 5.3546,
"mean_token_accuracy": 0.1670142412185669,
"num_tokens": 28270739.0,
"step": 15330
},
{
"entropy": 5.660454177856446,
"epoch": 1.2883427851291747,
"grad_norm": 1.7890625,
"learning_rate": 0.00048381918240471473,
"loss": 5.4178,
"mean_token_accuracy": 0.1622515268623829,
"num_tokens": 28279370.0,
"step": 15335
},
{
"entropy": 5.643009376525879,
"epoch": 1.2887628649443394,
"grad_norm": 1.7109375,
"learning_rate": 0.00048380802942532124,
"loss": 5.3265,
"mean_token_accuracy": 0.16982399374246598,
"num_tokens": 28287955.0,
"step": 15340
},
{
"entropy": 5.500215625762939,
"epoch": 1.2891829447595042,
"grad_norm": 1.4609375,
"learning_rate": 0.00048379687274700107,
"loss": 5.2845,
"mean_token_accuracy": 0.17823810428380965,
"num_tokens": 28296832.0,
"step": 15345
},
{
"entropy": 5.50995626449585,
"epoch": 1.2896030245746692,
"grad_norm": 1.4375,
"learning_rate": 0.00048378571236995185,
"loss": 5.3099,
"mean_token_accuracy": 0.1684551328420639,
"num_tokens": 28305778.0,
"step": 15350
},
{
"entropy": 5.643387413024902,
"epoch": 1.290023104389834,
"grad_norm": 1.828125,
"learning_rate": 0.00048377454829437124,
"loss": 5.368,
"mean_token_accuracy": 0.16758508384227752,
"num_tokens": 28314615.0,
"step": 15355
},
{
"entropy": 5.672886037826538,
"epoch": 1.290443184204999,
"grad_norm": 1.4609375,
"learning_rate": 0.0004837633805204569,
"loss": 5.4322,
"mean_token_accuracy": 0.16582419574260712,
"num_tokens": 28324478.0,
"step": 15360
},
{
"entropy": 5.655191707611084,
"epoch": 1.2908632640201638,
"grad_norm": 1.4921875,
"learning_rate": 0.0004837522090484069,
"loss": 5.3967,
"mean_token_accuracy": 0.16606489270925523,
"num_tokens": 28333532.0,
"step": 15365
},
{
"entropy": 5.622208118438721,
"epoch": 1.2912833438353286,
"grad_norm": 1.3984375,
"learning_rate": 0.00048374103387841894,
"loss": 5.3511,
"mean_token_accuracy": 0.16328290849924088,
"num_tokens": 28343723.0,
"step": 15370
},
{
"entropy": 5.578137588500977,
"epoch": 1.2917034236504936,
"grad_norm": 2.234375,
"learning_rate": 0.00048372985501069106,
"loss": 5.3352,
"mean_token_accuracy": 0.16576202511787413,
"num_tokens": 28351992.0,
"step": 15375
},
{
"entropy": 5.593153715133667,
"epoch": 1.2921235034656584,
"grad_norm": 1.6796875,
"learning_rate": 0.0004837186724454213,
"loss": 5.324,
"mean_token_accuracy": 0.16840839982032776,
"num_tokens": 28361141.0,
"step": 15380
},
{
"entropy": 5.581699228286743,
"epoch": 1.2925435832808234,
"grad_norm": 1.328125,
"learning_rate": 0.0004837074861828077,
"loss": 5.3111,
"mean_token_accuracy": 0.17236120253801346,
"num_tokens": 28370339.0,
"step": 15385
},
{
"entropy": 5.58486590385437,
"epoch": 1.2929636630959882,
"grad_norm": 1.5,
"learning_rate": 0.0004836962962230485,
"loss": 5.4399,
"mean_token_accuracy": 0.169328410923481,
"num_tokens": 28379242.0,
"step": 15390
},
{
"entropy": 5.58713812828064,
"epoch": 1.293383742911153,
"grad_norm": 1.625,
"learning_rate": 0.0004836851025663418,
"loss": 5.3088,
"mean_token_accuracy": 0.175137297809124,
"num_tokens": 28388864.0,
"step": 15395
},
{
"entropy": 5.670144557952881,
"epoch": 1.293803822726318,
"grad_norm": 1.3515625,
"learning_rate": 0.000483673905212886,
"loss": 5.4142,
"mean_token_accuracy": 0.16931960582733155,
"num_tokens": 28398000.0,
"step": 15400
},
{
"entropy": 5.530681467056274,
"epoch": 1.294223902541483,
"grad_norm": 1.515625,
"learning_rate": 0.0004836627041628794,
"loss": 5.345,
"mean_token_accuracy": 0.17576417773962022,
"num_tokens": 28407652.0,
"step": 15405
},
{
"entropy": 5.6185098648071286,
"epoch": 1.2946439823566478,
"grad_norm": 1.40625,
"learning_rate": 0.0004836514994165205,
"loss": 5.4189,
"mean_token_accuracy": 0.16661812365055084,
"num_tokens": 28417694.0,
"step": 15410
},
{
"entropy": 5.630365991592408,
"epoch": 1.2950640621718126,
"grad_norm": 1.5703125,
"learning_rate": 0.00048364029097400777,
"loss": 5.3612,
"mean_token_accuracy": 0.1731864407658577,
"num_tokens": 28426928.0,
"step": 15415
},
{
"entropy": 5.594233560562134,
"epoch": 1.2954841419869776,
"grad_norm": 1.4453125,
"learning_rate": 0.00048362907883553956,
"loss": 5.3933,
"mean_token_accuracy": 0.16121293902397155,
"num_tokens": 28436176.0,
"step": 15420
},
{
"entropy": 5.631103372573852,
"epoch": 1.2959042218021424,
"grad_norm": 1.453125,
"learning_rate": 0.00048361786300131477,
"loss": 5.4492,
"mean_token_accuracy": 0.1604493111371994,
"num_tokens": 28445277.0,
"step": 15425
},
{
"entropy": 5.649682426452637,
"epoch": 1.2963243016173074,
"grad_norm": 1.328125,
"learning_rate": 0.0004836066434715319,
"loss": 5.3444,
"mean_token_accuracy": 0.17051563113927842,
"num_tokens": 28453959.0,
"step": 15430
},
{
"entropy": 5.618559408187866,
"epoch": 1.2967443814324722,
"grad_norm": 1.65625,
"learning_rate": 0.0004835954202463898,
"loss": 5.4372,
"mean_token_accuracy": 0.16708061397075652,
"num_tokens": 28463780.0,
"step": 15435
},
{
"entropy": 5.537703371047973,
"epoch": 1.297164461247637,
"grad_norm": 1.3359375,
"learning_rate": 0.0004835841933260872,
"loss": 5.2921,
"mean_token_accuracy": 0.16737518906593324,
"num_tokens": 28473299.0,
"step": 15440
},
{
"entropy": 5.578835725784302,
"epoch": 1.297584541062802,
"grad_norm": 1.421875,
"learning_rate": 0.00048357296271082305,
"loss": 5.3261,
"mean_token_accuracy": 0.16800283193588256,
"num_tokens": 28481859.0,
"step": 15445
},
{
"entropy": 5.668525123596192,
"epoch": 1.2980046208779668,
"grad_norm": 1.8046875,
"learning_rate": 0.00048356172840079625,
"loss": 5.3831,
"mean_token_accuracy": 0.17016577422618867,
"num_tokens": 28491034.0,
"step": 15450
},
{
"entropy": 5.627667331695557,
"epoch": 1.2984247006931318,
"grad_norm": 1.375,
"learning_rate": 0.0004835504903962058,
"loss": 5.3176,
"mean_token_accuracy": 0.16899299919605254,
"num_tokens": 28499829.0,
"step": 15455
},
{
"entropy": 5.55805196762085,
"epoch": 1.2988447805082965,
"grad_norm": 1.5625,
"learning_rate": 0.00048353924869725084,
"loss": 5.3039,
"mean_token_accuracy": 0.17624977231025696,
"num_tokens": 28508188.0,
"step": 15460
},
{
"entropy": 5.492491817474365,
"epoch": 1.2992648603234613,
"grad_norm": 1.890625,
"learning_rate": 0.0004835280033041305,
"loss": 5.2152,
"mean_token_accuracy": 0.17513660490512847,
"num_tokens": 28516509.0,
"step": 15465
},
{
"entropy": 5.587926959991455,
"epoch": 1.2996849401386263,
"grad_norm": 1.828125,
"learning_rate": 0.0004835167542170439,
"loss": 5.4409,
"mean_token_accuracy": 0.1680296167731285,
"num_tokens": 28526457.0,
"step": 15470
},
{
"entropy": 5.635869455337525,
"epoch": 1.3001050199537914,
"grad_norm": 1.25,
"learning_rate": 0.0004835055014361904,
"loss": 5.3808,
"mean_token_accuracy": 0.16730038970708846,
"num_tokens": 28536149.0,
"step": 15475
},
{
"entropy": 5.660777997970581,
"epoch": 1.3005250997689561,
"grad_norm": 1.2421875,
"learning_rate": 0.00048349424496176924,
"loss": 5.4182,
"mean_token_accuracy": 0.1671212524175644,
"num_tokens": 28545486.0,
"step": 15480
},
{
"entropy": 5.594893980026245,
"epoch": 1.300945179584121,
"grad_norm": 1.4375,
"learning_rate": 0.00048348298479397996,
"loss": 5.3056,
"mean_token_accuracy": 0.17313460856676102,
"num_tokens": 28554555.0,
"step": 15485
},
{
"entropy": 5.493140792846679,
"epoch": 1.301365259399286,
"grad_norm": 1.5078125,
"learning_rate": 0.00048347172093302196,
"loss": 5.3427,
"mean_token_accuracy": 0.17841643542051316,
"num_tokens": 28563387.0,
"step": 15490
},
{
"entropy": 5.539090824127197,
"epoch": 1.3017853392144507,
"grad_norm": 1.4296875,
"learning_rate": 0.00048346045337909475,
"loss": 5.3437,
"mean_token_accuracy": 0.17286846339702605,
"num_tokens": 28573437.0,
"step": 15495
},
{
"entropy": 5.537428951263427,
"epoch": 1.3022054190296157,
"grad_norm": 1.9921875,
"learning_rate": 0.000483449182132398,
"loss": 5.2769,
"mean_token_accuracy": 0.18045637011528015,
"num_tokens": 28583362.0,
"step": 15500
},
{
"entropy": 5.738650894165039,
"epoch": 1.3026254988447805,
"grad_norm": 1.3515625,
"learning_rate": 0.00048343790719313124,
"loss": 5.47,
"mean_token_accuracy": 0.16674373149871827,
"num_tokens": 28593201.0,
"step": 15505
},
{
"entropy": 5.600464534759522,
"epoch": 1.3030455786599453,
"grad_norm": 1.46875,
"learning_rate": 0.00048342662856149427,
"loss": 5.372,
"mean_token_accuracy": 0.1667174980044365,
"num_tokens": 28602486.0,
"step": 15510
},
{
"entropy": 5.569722890853882,
"epoch": 1.3034656584751103,
"grad_norm": 1.4921875,
"learning_rate": 0.000483415346237687,
"loss": 5.3806,
"mean_token_accuracy": 0.16912435740232468,
"num_tokens": 28611643.0,
"step": 15515
},
{
"entropy": 5.643300008773804,
"epoch": 1.303885738290275,
"grad_norm": 1.7109375,
"learning_rate": 0.0004834040602219091,
"loss": 5.4183,
"mean_token_accuracy": 0.17343226373195647,
"num_tokens": 28620545.0,
"step": 15520
},
{
"entropy": 5.5370697498321535,
"epoch": 1.30430581810544,
"grad_norm": 1.5,
"learning_rate": 0.00048339277051436067,
"loss": 5.3602,
"mean_token_accuracy": 0.16954926997423173,
"num_tokens": 28630024.0,
"step": 15525
},
{
"entropy": 5.71484112739563,
"epoch": 1.304725897920605,
"grad_norm": 1.375,
"learning_rate": 0.0004833814771152415,
"loss": 5.4125,
"mean_token_accuracy": 0.17532897889614105,
"num_tokens": 28638995.0,
"step": 15530
},
{
"entropy": 5.614312267303466,
"epoch": 1.3051459777357697,
"grad_norm": 1.5234375,
"learning_rate": 0.00048337018002475184,
"loss": 5.3679,
"mean_token_accuracy": 0.1731203392148018,
"num_tokens": 28647833.0,
"step": 15535
},
{
"entropy": 5.557574462890625,
"epoch": 1.3055660575509347,
"grad_norm": 1.484375,
"learning_rate": 0.0004833588792430917,
"loss": 5.2711,
"mean_token_accuracy": 0.17764106392860413,
"num_tokens": 28657441.0,
"step": 15540
},
{
"entropy": 5.59211220741272,
"epoch": 1.3059861373660997,
"grad_norm": 1.34375,
"learning_rate": 0.0004833475747704614,
"loss": 5.3743,
"mean_token_accuracy": 0.17282749861478805,
"num_tokens": 28666666.0,
"step": 15545
},
{
"entropy": 5.55608491897583,
"epoch": 1.3064062171812645,
"grad_norm": 1.375,
"learning_rate": 0.000483336266607061,
"loss": 5.3759,
"mean_token_accuracy": 0.16900023818016052,
"num_tokens": 28676770.0,
"step": 15550
},
{
"entropy": 5.590077590942383,
"epoch": 1.3068262969964293,
"grad_norm": 1.2734375,
"learning_rate": 0.00048332495475309097,
"loss": 5.2904,
"mean_token_accuracy": 0.17570936679840088,
"num_tokens": 28685610.0,
"step": 15555
},
{
"entropy": 5.651003456115722,
"epoch": 1.3072463768115943,
"grad_norm": 1.5390625,
"learning_rate": 0.00048331363920875155,
"loss": 5.4007,
"mean_token_accuracy": 0.16941392719745635,
"num_tokens": 28695082.0,
"step": 15560
},
{
"entropy": 5.559187602996826,
"epoch": 1.307666456626759,
"grad_norm": 1.5390625,
"learning_rate": 0.00048330231997424335,
"loss": 5.3162,
"mean_token_accuracy": 0.17043908983469008,
"num_tokens": 28704006.0,
"step": 15565
},
{
"entropy": 5.558832693099975,
"epoch": 1.308086536441924,
"grad_norm": 1.390625,
"learning_rate": 0.0004832909970497668,
"loss": 5.3546,
"mean_token_accuracy": 0.17029603868722915,
"num_tokens": 28713665.0,
"step": 15570
},
{
"entropy": 5.603363800048828,
"epoch": 1.3085066162570889,
"grad_norm": 1.375,
"learning_rate": 0.00048327967043552245,
"loss": 5.3019,
"mean_token_accuracy": 0.17355511337518692,
"num_tokens": 28722920.0,
"step": 15575
},
{
"entropy": 5.581994581222534,
"epoch": 1.3089266960722536,
"grad_norm": 1.3046875,
"learning_rate": 0.00048326834013171107,
"loss": 5.2558,
"mean_token_accuracy": 0.17624608278274537,
"num_tokens": 28731689.0,
"step": 15580
},
{
"entropy": 5.581026935577393,
"epoch": 1.3093467758874187,
"grad_norm": 1.328125,
"learning_rate": 0.0004832570061385332,
"loss": 5.3859,
"mean_token_accuracy": 0.1769873559474945,
"num_tokens": 28741308.0,
"step": 15585
},
{
"entropy": 5.507744073867798,
"epoch": 1.3097668557025834,
"grad_norm": 1.421875,
"learning_rate": 0.0004832456684561898,
"loss": 5.3415,
"mean_token_accuracy": 0.1774263396859169,
"num_tokens": 28750190.0,
"step": 15590
},
{
"entropy": 5.606824207305908,
"epoch": 1.3101869355177485,
"grad_norm": 1.5546875,
"learning_rate": 0.0004832343270848815,
"loss": 5.4057,
"mean_token_accuracy": 0.16937861293554307,
"num_tokens": 28759588.0,
"step": 15595
},
{
"entropy": 5.6285364627838135,
"epoch": 1.3106070153329132,
"grad_norm": 1.375,
"learning_rate": 0.00048322298202480935,
"loss": 5.4098,
"mean_token_accuracy": 0.16976438611745834,
"num_tokens": 28768800.0,
"step": 15600
},
{
"entropy": 5.67966046333313,
"epoch": 1.311027095148078,
"grad_norm": 1.2890625,
"learning_rate": 0.00048321163327617433,
"loss": 5.3608,
"mean_token_accuracy": 0.17305390238761903,
"num_tokens": 28778108.0,
"step": 15605
},
{
"entropy": 5.665872383117676,
"epoch": 1.311447174963243,
"grad_norm": 1.4609375,
"learning_rate": 0.0004832002808391775,
"loss": 5.3298,
"mean_token_accuracy": 0.17185018211603165,
"num_tokens": 28787202.0,
"step": 15610
},
{
"entropy": 5.549618196487427,
"epoch": 1.3118672547784078,
"grad_norm": 1.359375,
"learning_rate": 0.0004831889247140198,
"loss": 5.3757,
"mean_token_accuracy": 0.16928450316190718,
"num_tokens": 28797482.0,
"step": 15615
},
{
"entropy": 5.508302640914917,
"epoch": 1.3122873345935728,
"grad_norm": 1.296875,
"learning_rate": 0.00048317756490090253,
"loss": 5.2947,
"mean_token_accuracy": 0.17709959149360657,
"num_tokens": 28805872.0,
"step": 15620
},
{
"entropy": 5.561249446868897,
"epoch": 1.3127074144087376,
"grad_norm": 1.5078125,
"learning_rate": 0.00048316620140002685,
"loss": 5.4078,
"mean_token_accuracy": 0.1639494240283966,
"num_tokens": 28814836.0,
"step": 15625
},
{
"entropy": 5.740321445465088,
"epoch": 1.3131274942239024,
"grad_norm": 1.3125,
"learning_rate": 0.0004831548342115942,
"loss": 5.4068,
"mean_token_accuracy": 0.16398458182811737,
"num_tokens": 28824727.0,
"step": 15630
},
{
"entropy": 5.746799182891846,
"epoch": 1.3135475740390674,
"grad_norm": 1.4140625,
"learning_rate": 0.00048314346333580576,
"loss": 5.508,
"mean_token_accuracy": 0.16522675901651382,
"num_tokens": 28833848.0,
"step": 15635
},
{
"entropy": 5.567570352554322,
"epoch": 1.3139676538542324,
"grad_norm": 1.421875,
"learning_rate": 0.0004831320887728631,
"loss": 5.2541,
"mean_token_accuracy": 0.17734746038913726,
"num_tokens": 28842198.0,
"step": 15640
},
{
"entropy": 5.571052837371826,
"epoch": 1.3143877336693972,
"grad_norm": 1.375,
"learning_rate": 0.0004831207105229676,
"loss": 5.3474,
"mean_token_accuracy": 0.17115414440631865,
"num_tokens": 28851804.0,
"step": 15645
},
{
"entropy": 5.506551837921142,
"epoch": 1.314807813484562,
"grad_norm": 1.546875,
"learning_rate": 0.00048310932858632087,
"loss": 5.2493,
"mean_token_accuracy": 0.1772620141506195,
"num_tokens": 28860181.0,
"step": 15650
},
{
"entropy": 5.537792301177978,
"epoch": 1.315227893299727,
"grad_norm": 1.3671875,
"learning_rate": 0.00048309794296312467,
"loss": 5.3279,
"mean_token_accuracy": 0.1781880587339401,
"num_tokens": 28869945.0,
"step": 15655
},
{
"entropy": 5.594064044952392,
"epoch": 1.3156479731148918,
"grad_norm": 1.53125,
"learning_rate": 0.00048308655365358053,
"loss": 5.394,
"mean_token_accuracy": 0.17326931655406952,
"num_tokens": 28880343.0,
"step": 15660
},
{
"entropy": 5.746251773834229,
"epoch": 1.3160680529300568,
"grad_norm": 1.546875,
"learning_rate": 0.00048307516065789017,
"loss": 5.4565,
"mean_token_accuracy": 0.162956078350544,
"num_tokens": 28889441.0,
"step": 15665
},
{
"entropy": 5.658310222625732,
"epoch": 1.3164881327452216,
"grad_norm": 1.5390625,
"learning_rate": 0.00048306376397625546,
"loss": 5.4075,
"mean_token_accuracy": 0.1670873075723648,
"num_tokens": 28898154.0,
"step": 15670
},
{
"entropy": 5.5842632293701175,
"epoch": 1.3169082125603864,
"grad_norm": 1.453125,
"learning_rate": 0.00048305236360887834,
"loss": 5.3817,
"mean_token_accuracy": 0.16679264307022096,
"num_tokens": 28908359.0,
"step": 15675
},
{
"entropy": 5.610452651977539,
"epoch": 1.3173282923755514,
"grad_norm": 1.4609375,
"learning_rate": 0.00048304095955596074,
"loss": 5.3879,
"mean_token_accuracy": 0.16719227135181428,
"num_tokens": 28918416.0,
"step": 15680
},
{
"entropy": 5.632193183898925,
"epoch": 1.3177483721907162,
"grad_norm": 1.3984375,
"learning_rate": 0.0004830295518177047,
"loss": 5.3121,
"mean_token_accuracy": 0.17960517555475236,
"num_tokens": 28927412.0,
"step": 15685
},
{
"entropy": 5.565472078323364,
"epoch": 1.3181684520058812,
"grad_norm": 1.734375,
"learning_rate": 0.00048301814039431227,
"loss": 5.3494,
"mean_token_accuracy": 0.17322985231876373,
"num_tokens": 28936106.0,
"step": 15690
},
{
"entropy": 5.595353460311889,
"epoch": 1.318588531821046,
"grad_norm": 1.546875,
"learning_rate": 0.00048300672528598553,
"loss": 5.3836,
"mean_token_accuracy": 0.1725798785686493,
"num_tokens": 28945197.0,
"step": 15695
},
{
"entropy": 5.658270645141601,
"epoch": 1.3190086116362107,
"grad_norm": 1.46875,
"learning_rate": 0.0004829953064929268,
"loss": 5.4077,
"mean_token_accuracy": 0.16884516328573226,
"num_tokens": 28954278.0,
"step": 15700
},
{
"entropy": 5.7062318325042725,
"epoch": 1.3194286914513758,
"grad_norm": 1.484375,
"learning_rate": 0.0004829838840153383,
"loss": 5.4763,
"mean_token_accuracy": 0.16783434450626372,
"num_tokens": 28963101.0,
"step": 15705
},
{
"entropy": 5.5181427001953125,
"epoch": 1.3198487712665408,
"grad_norm": 1.4296875,
"learning_rate": 0.0004829724578534224,
"loss": 5.3688,
"mean_token_accuracy": 0.16623532623052598,
"num_tokens": 28972063.0,
"step": 15710
},
{
"entropy": 5.565000104904175,
"epoch": 1.3202688510817056,
"grad_norm": 1.484375,
"learning_rate": 0.00048296102800738153,
"loss": 5.3124,
"mean_token_accuracy": 0.17217005491256715,
"num_tokens": 28981617.0,
"step": 15715
},
{
"entropy": 5.637431478500366,
"epoch": 1.3206889308968703,
"grad_norm": 1.4140625,
"learning_rate": 0.00048294959447741807,
"loss": 5.2938,
"mean_token_accuracy": 0.1703649029135704,
"num_tokens": 28989442.0,
"step": 15720
},
{
"entropy": 5.6043823719024655,
"epoch": 1.3211090107120353,
"grad_norm": 1.421875,
"learning_rate": 0.00048293815726373467,
"loss": 5.3154,
"mean_token_accuracy": 0.17528259456157685,
"num_tokens": 28999104.0,
"step": 15725
},
{
"entropy": 5.62308897972107,
"epoch": 1.3215290905272001,
"grad_norm": 1.5703125,
"learning_rate": 0.00048292671636653386,
"loss": 5.3788,
"mean_token_accuracy": 0.16871308237314225,
"num_tokens": 29008645.0,
"step": 15730
},
{
"entropy": 5.6037006855010985,
"epoch": 1.3219491703423651,
"grad_norm": 1.421875,
"learning_rate": 0.0004829152717860184,
"loss": 5.3626,
"mean_token_accuracy": 0.16728195548057556,
"num_tokens": 29018655.0,
"step": 15735
},
{
"entropy": 5.645209646224975,
"epoch": 1.32236925015753,
"grad_norm": 1.3984375,
"learning_rate": 0.00048290382352239087,
"loss": 5.3646,
"mean_token_accuracy": 0.17591540813446044,
"num_tokens": 29027109.0,
"step": 15740
},
{
"entropy": 5.538152742385864,
"epoch": 1.3227893299726947,
"grad_norm": 1.4375,
"learning_rate": 0.00048289237157585424,
"loss": 5.1924,
"mean_token_accuracy": 0.18130724281072616,
"num_tokens": 29035535.0,
"step": 15745
},
{
"entropy": 5.526106214523315,
"epoch": 1.3232094097878597,
"grad_norm": 1.4453125,
"learning_rate": 0.0004828809159466112,
"loss": 5.3557,
"mean_token_accuracy": 0.17011864781379699,
"num_tokens": 29044723.0,
"step": 15750
},
{
"entropy": 5.632492637634277,
"epoch": 1.3236294896030245,
"grad_norm": 1.9140625,
"learning_rate": 0.0004828694566348648,
"loss": 5.4998,
"mean_token_accuracy": 0.15992355048656465,
"num_tokens": 29053636.0,
"step": 15755
},
{
"entropy": 5.696485710144043,
"epoch": 1.3240495694181895,
"grad_norm": 1.40625,
"learning_rate": 0.00048285799364081806,
"loss": 5.3859,
"mean_token_accuracy": 0.1657705307006836,
"num_tokens": 29062940.0,
"step": 15760
},
{
"entropy": 5.560177230834961,
"epoch": 1.3244696492333543,
"grad_norm": 1.6171875,
"learning_rate": 0.00048284652696467404,
"loss": 5.306,
"mean_token_accuracy": 0.1746760606765747,
"num_tokens": 29072159.0,
"step": 15765
},
{
"entropy": 5.6406481742858885,
"epoch": 1.324889729048519,
"grad_norm": 1.4296875,
"learning_rate": 0.00048283505660663575,
"loss": 5.3941,
"mean_token_accuracy": 0.16920927315950393,
"num_tokens": 29081544.0,
"step": 15770
},
{
"entropy": 5.587003755569458,
"epoch": 1.325309808863684,
"grad_norm": 1.5390625,
"learning_rate": 0.0004828235825669064,
"loss": 5.3458,
"mean_token_accuracy": 0.1749515488743782,
"num_tokens": 29090710.0,
"step": 15775
},
{
"entropy": 5.595648622512817,
"epoch": 1.325729888678849,
"grad_norm": 1.46875,
"learning_rate": 0.00048281210484568937,
"loss": 5.3544,
"mean_token_accuracy": 0.16642958670854568,
"num_tokens": 29098988.0,
"step": 15780
},
{
"entropy": 5.55333981513977,
"epoch": 1.326149968494014,
"grad_norm": 1.3671875,
"learning_rate": 0.00048280062344318794,
"loss": 5.3909,
"mean_token_accuracy": 0.1612667679786682,
"num_tokens": 29108926.0,
"step": 15785
},
{
"entropy": 5.61096305847168,
"epoch": 1.3265700483091787,
"grad_norm": 1.421875,
"learning_rate": 0.0004827891383596054,
"loss": 5.3185,
"mean_token_accuracy": 0.17129528820514678,
"num_tokens": 29118065.0,
"step": 15790
},
{
"entropy": 5.570857095718384,
"epoch": 1.3269901281243437,
"grad_norm": 1.453125,
"learning_rate": 0.00048277764959514524,
"loss": 5.2713,
"mean_token_accuracy": 0.17513011544942855,
"num_tokens": 29127030.0,
"step": 15795
},
{
"entropy": 5.643740320205689,
"epoch": 1.3274102079395085,
"grad_norm": 1.4765625,
"learning_rate": 0.0004827661571500111,
"loss": 5.4004,
"mean_token_accuracy": 0.17051657736301423,
"num_tokens": 29137200.0,
"step": 15800
},
{
"entropy": 5.591941261291504,
"epoch": 1.3278302877546735,
"grad_norm": 1.5390625,
"learning_rate": 0.00048275466102440644,
"loss": 5.3802,
"mean_token_accuracy": 0.17082896530628205,
"num_tokens": 29147029.0,
"step": 15805
},
{
"entropy": 5.521048879623413,
"epoch": 1.3282503675698383,
"grad_norm": 1.65625,
"learning_rate": 0.00048274316121853494,
"loss": 5.2987,
"mean_token_accuracy": 0.1778016746044159,
"num_tokens": 29155675.0,
"step": 15810
},
{
"entropy": 5.6082827091217045,
"epoch": 1.328670447385003,
"grad_norm": 1.4375,
"learning_rate": 0.00048273165773260023,
"loss": 5.338,
"mean_token_accuracy": 0.1711556628346443,
"num_tokens": 29164730.0,
"step": 15815
},
{
"entropy": 5.640547084808349,
"epoch": 1.329090527200168,
"grad_norm": 1.734375,
"learning_rate": 0.0004827201505668063,
"loss": 5.3516,
"mean_token_accuracy": 0.17501722127199174,
"num_tokens": 29173074.0,
"step": 15820
},
{
"entropy": 5.673982048034668,
"epoch": 1.3295106070153329,
"grad_norm": 1.4375,
"learning_rate": 0.0004827086397213568,
"loss": 5.4646,
"mean_token_accuracy": 0.16482558846473694,
"num_tokens": 29182175.0,
"step": 15825
},
{
"entropy": 5.748240852355957,
"epoch": 1.3299306868304979,
"grad_norm": 1.234375,
"learning_rate": 0.0004826971251964557,
"loss": 5.6569,
"mean_token_accuracy": 0.15942730382084846,
"num_tokens": 29192910.0,
"step": 15830
},
{
"entropy": 5.586177349090576,
"epoch": 1.3303507666456627,
"grad_norm": 1.421875,
"learning_rate": 0.000482685606992307,
"loss": 5.314,
"mean_token_accuracy": 0.16921163648366927,
"num_tokens": 29201969.0,
"step": 15835
},
{
"entropy": 5.68097071647644,
"epoch": 1.3307708464608274,
"grad_norm": 1.4296875,
"learning_rate": 0.00048267408510911463,
"loss": 5.452,
"mean_token_accuracy": 0.16762558221817017,
"num_tokens": 29210475.0,
"step": 15840
},
{
"entropy": 5.5629668712615965,
"epoch": 1.3311909262759924,
"grad_norm": 1.5078125,
"learning_rate": 0.0004826625595470829,
"loss": 5.315,
"mean_token_accuracy": 0.17030383050441741,
"num_tokens": 29222586.0,
"step": 15845
},
{
"entropy": 5.574795961380005,
"epoch": 1.3316110060911575,
"grad_norm": 1.46875,
"learning_rate": 0.00048265103030641575,
"loss": 5.3713,
"mean_token_accuracy": 0.16889488995075225,
"num_tokens": 29231503.0,
"step": 15850
},
{
"entropy": 5.569881916046143,
"epoch": 1.3320310859063222,
"grad_norm": 1.3984375,
"learning_rate": 0.0004826394973873176,
"loss": 5.3421,
"mean_token_accuracy": 0.16882582604885102,
"num_tokens": 29241534.0,
"step": 15855
},
{
"entropy": 5.627525854110718,
"epoch": 1.332451165721487,
"grad_norm": 1.6171875,
"learning_rate": 0.00048262796078999266,
"loss": 5.3445,
"mean_token_accuracy": 0.172283935546875,
"num_tokens": 29250381.0,
"step": 15860
},
{
"entropy": 5.5921648979187015,
"epoch": 1.332871245536652,
"grad_norm": 1.3828125,
"learning_rate": 0.0004826164205146453,
"loss": 5.4099,
"mean_token_accuracy": 0.16868837922811508,
"num_tokens": 29259205.0,
"step": 15865
},
{
"entropy": 5.481484031677246,
"epoch": 1.3332913253518168,
"grad_norm": 1.3828125,
"learning_rate": 0.00048260487656147995,
"loss": 5.3009,
"mean_token_accuracy": 0.17611772418022156,
"num_tokens": 29267723.0,
"step": 15870
},
{
"entropy": 5.535662412643433,
"epoch": 1.3337114051669818,
"grad_norm": 1.296875,
"learning_rate": 0.00048259332893070106,
"loss": 5.3244,
"mean_token_accuracy": 0.17356904000043868,
"num_tokens": 29277102.0,
"step": 15875
},
{
"entropy": 5.6097249507904055,
"epoch": 1.3341314849821466,
"grad_norm": 1.578125,
"learning_rate": 0.0004825817776225133,
"loss": 5.2985,
"mean_token_accuracy": 0.1713997796177864,
"num_tokens": 29286484.0,
"step": 15880
},
{
"entropy": 5.621392154693604,
"epoch": 1.3345515647973114,
"grad_norm": 1.546875,
"learning_rate": 0.00048257022263712123,
"loss": 5.3825,
"mean_token_accuracy": 0.17635536342859268,
"num_tokens": 29296528.0,
"step": 15885
},
{
"entropy": 5.4858238697052,
"epoch": 1.3349716446124764,
"grad_norm": 1.4140625,
"learning_rate": 0.00048255866397472954,
"loss": 5.2566,
"mean_token_accuracy": 0.17637043744325637,
"num_tokens": 29305283.0,
"step": 15890
},
{
"entropy": 5.590370178222656,
"epoch": 1.3353917244276412,
"grad_norm": 1.421875,
"learning_rate": 0.000482547101635543,
"loss": 5.2583,
"mean_token_accuracy": 0.17325850278139115,
"num_tokens": 29315088.0,
"step": 15895
},
{
"entropy": 5.598516130447388,
"epoch": 1.3358118042428062,
"grad_norm": 1.625,
"learning_rate": 0.00048253553561976645,
"loss": 5.2927,
"mean_token_accuracy": 0.17234370112419128,
"num_tokens": 29323793.0,
"step": 15900
},
{
"entropy": 5.470377397537232,
"epoch": 1.336231884057971,
"grad_norm": 1.3671875,
"learning_rate": 0.0004825239659276047,
"loss": 5.346,
"mean_token_accuracy": 0.1682179808616638,
"num_tokens": 29334015.0,
"step": 15905
},
{
"entropy": 5.653326606750488,
"epoch": 1.3366519638731358,
"grad_norm": 1.5,
"learning_rate": 0.0004825123925592628,
"loss": 5.4609,
"mean_token_accuracy": 0.16445996165275573,
"num_tokens": 29343221.0,
"step": 15910
},
{
"entropy": 5.660612726211548,
"epoch": 1.3370720436883008,
"grad_norm": 1.9296875,
"learning_rate": 0.00048250081551494574,
"loss": 5.3251,
"mean_token_accuracy": 0.17200662344694137,
"num_tokens": 29352261.0,
"step": 15915
},
{
"entropy": 5.588099670410156,
"epoch": 1.3374921235034656,
"grad_norm": 1.5078125,
"learning_rate": 0.0004824892347948586,
"loss": 5.415,
"mean_token_accuracy": 0.16620554327964782,
"num_tokens": 29362138.0,
"step": 15920
},
{
"entropy": 5.55838680267334,
"epoch": 1.3379122033186306,
"grad_norm": 1.46875,
"learning_rate": 0.0004824776503992064,
"loss": 5.3127,
"mean_token_accuracy": 0.1756969392299652,
"num_tokens": 29371234.0,
"step": 15925
},
{
"entropy": 5.558799934387207,
"epoch": 1.3383322831337954,
"grad_norm": 1.3828125,
"learning_rate": 0.0004824660623281945,
"loss": 5.3484,
"mean_token_accuracy": 0.17654556334018706,
"num_tokens": 29380371.0,
"step": 15930
},
{
"entropy": 5.669035911560059,
"epoch": 1.3387523629489604,
"grad_norm": 1.4609375,
"learning_rate": 0.00048245447058202815,
"loss": 5.4802,
"mean_token_accuracy": 0.16493066400289536,
"num_tokens": 29389230.0,
"step": 15935
},
{
"entropy": 5.643646621704102,
"epoch": 1.3391724427641252,
"grad_norm": 1.484375,
"learning_rate": 0.0004824428751609126,
"loss": 5.3543,
"mean_token_accuracy": 0.17469435930252075,
"num_tokens": 29398753.0,
"step": 15940
},
{
"entropy": 5.592764806747437,
"epoch": 1.3395925225792902,
"grad_norm": 1.4453125,
"learning_rate": 0.00048243127606505343,
"loss": 5.3183,
"mean_token_accuracy": 0.17847731411457063,
"num_tokens": 29407487.0,
"step": 15945
},
{
"entropy": 5.518846654891968,
"epoch": 1.340012602394455,
"grad_norm": 1.4765625,
"learning_rate": 0.000482419673294656,
"loss": 5.3163,
"mean_token_accuracy": 0.17520241290330887,
"num_tokens": 29416140.0,
"step": 15950
},
{
"entropy": 5.5419957637786865,
"epoch": 1.3404326822096198,
"grad_norm": 1.4921875,
"learning_rate": 0.0004824080668499259,
"loss": 5.3425,
"mean_token_accuracy": 0.1725288987159729,
"num_tokens": 29424763.0,
"step": 15955
},
{
"entropy": 5.6412975788116455,
"epoch": 1.3408527620247848,
"grad_norm": 1.484375,
"learning_rate": 0.00048239645673106855,
"loss": 5.3482,
"mean_token_accuracy": 0.16793448776006697,
"num_tokens": 29434589.0,
"step": 15960
},
{
"entropy": 5.603252029418945,
"epoch": 1.3412728418399495,
"grad_norm": 1.625,
"learning_rate": 0.00048238484293828995,
"loss": 5.3662,
"mean_token_accuracy": 0.16894400417804717,
"num_tokens": 29443549.0,
"step": 15965
},
{
"entropy": 5.602950191497802,
"epoch": 1.3416929216551146,
"grad_norm": 1.5625,
"learning_rate": 0.0004823732254717955,
"loss": 5.3631,
"mean_token_accuracy": 0.17208106964826583,
"num_tokens": 29452457.0,
"step": 15970
},
{
"entropy": 5.555796575546265,
"epoch": 1.3421130014702793,
"grad_norm": 1.3515625,
"learning_rate": 0.0004823616043317912,
"loss": 5.3281,
"mean_token_accuracy": 0.1746997132897377,
"num_tokens": 29461238.0,
"step": 15975
},
{
"entropy": 5.654101085662842,
"epoch": 1.3425330812854441,
"grad_norm": 1.3125,
"learning_rate": 0.00048234997951848284,
"loss": 5.4105,
"mean_token_accuracy": 0.16118277460336686,
"num_tokens": 29471170.0,
"step": 15980
},
{
"entropy": 5.71726016998291,
"epoch": 1.3429531611006091,
"grad_norm": 1.484375,
"learning_rate": 0.0004823383510320764,
"loss": 5.4068,
"mean_token_accuracy": 0.16380561292171478,
"num_tokens": 29481017.0,
"step": 15985
},
{
"entropy": 5.66351900100708,
"epoch": 1.343373240915774,
"grad_norm": 1.6015625,
"learning_rate": 0.00048232671887277786,
"loss": 5.3613,
"mean_token_accuracy": 0.16766881942749023,
"num_tokens": 29489809.0,
"step": 15990
},
{
"entropy": 5.539630031585693,
"epoch": 1.343793320730939,
"grad_norm": 1.4765625,
"learning_rate": 0.00048231508304079313,
"loss": 5.3859,
"mean_token_accuracy": 0.17453958988189697,
"num_tokens": 29499499.0,
"step": 15995
},
{
"entropy": 5.622837018966675,
"epoch": 1.3442134005461037,
"grad_norm": 1.53125,
"learning_rate": 0.00048230344353632855,
"loss": 5.3491,
"mean_token_accuracy": 0.1713936820626259,
"num_tokens": 29508526.0,
"step": 16000
},
{
"entropy": 5.630671453475952,
"epoch": 1.3446334803612685,
"grad_norm": 1.3984375,
"learning_rate": 0.0004822918003595902,
"loss": 5.2768,
"mean_token_accuracy": 0.17665404826402664,
"num_tokens": 29517516.0,
"step": 16005
},
{
"entropy": 5.621455478668213,
"epoch": 1.3450535601764335,
"grad_norm": 1.6328125,
"learning_rate": 0.0004822801535107843,
"loss": 5.3864,
"mean_token_accuracy": 0.1649400234222412,
"num_tokens": 29526949.0,
"step": 16010
},
{
"entropy": 5.550832223892212,
"epoch": 1.3454736399915985,
"grad_norm": 1.3984375,
"learning_rate": 0.0004822685029901173,
"loss": 5.2765,
"mean_token_accuracy": 0.17336531430482865,
"num_tokens": 29536696.0,
"step": 16015
},
{
"entropy": 5.5667977809906,
"epoch": 1.3458937198067633,
"grad_norm": 1.3515625,
"learning_rate": 0.0004822568487977954,
"loss": 5.383,
"mean_token_accuracy": 0.1736892729997635,
"num_tokens": 29545672.0,
"step": 16020
},
{
"entropy": 5.616351890563965,
"epoch": 1.346313799621928,
"grad_norm": 1.5234375,
"learning_rate": 0.00048224519093402517,
"loss": 5.4183,
"mean_token_accuracy": 0.1693510353565216,
"num_tokens": 29554888.0,
"step": 16025
},
{
"entropy": 5.613638782501221,
"epoch": 1.346733879437093,
"grad_norm": 1.4453125,
"learning_rate": 0.00048223352939901317,
"loss": 5.3409,
"mean_token_accuracy": 0.1744115099310875,
"num_tokens": 29564798.0,
"step": 16030
},
{
"entropy": 5.6031999588012695,
"epoch": 1.347153959252258,
"grad_norm": 1.4921875,
"learning_rate": 0.0004822218641929658,
"loss": 5.374,
"mean_token_accuracy": 0.1732271909713745,
"num_tokens": 29574802.0,
"step": 16035
},
{
"entropy": 5.659905481338501,
"epoch": 1.347574039067423,
"grad_norm": 1.4921875,
"learning_rate": 0.0004822101953160899,
"loss": 5.3465,
"mean_token_accuracy": 0.16780985891819,
"num_tokens": 29583056.0,
"step": 16040
},
{
"entropy": 5.5878712177276615,
"epoch": 1.3479941188825877,
"grad_norm": 1.40625,
"learning_rate": 0.000482198522768592,
"loss": 5.3432,
"mean_token_accuracy": 0.16807262152433394,
"num_tokens": 29591935.0,
"step": 16045
},
{
"entropy": 5.515739965438843,
"epoch": 1.3484141986977525,
"grad_norm": 1.8515625,
"learning_rate": 0.00048218684655067907,
"loss": 5.272,
"mean_token_accuracy": 0.17685087025165558,
"num_tokens": 29600812.0,
"step": 16050
},
{
"entropy": 5.622799730300903,
"epoch": 1.3488342785129175,
"grad_norm": 1.4296875,
"learning_rate": 0.0004821751666625577,
"loss": 5.3895,
"mean_token_accuracy": 0.16696966886520387,
"num_tokens": 29610735.0,
"step": 16055
},
{
"entropy": 5.629577350616455,
"epoch": 1.3492543583280823,
"grad_norm": 1.4921875,
"learning_rate": 0.00048216348310443506,
"loss": 5.305,
"mean_token_accuracy": 0.1662694603204727,
"num_tokens": 29620295.0,
"step": 16060
},
{
"entropy": 5.527255725860596,
"epoch": 1.3496744381432473,
"grad_norm": 1.3671875,
"learning_rate": 0.00048215179587651795,
"loss": 5.1988,
"mean_token_accuracy": 0.18249010890722275,
"num_tokens": 29628214.0,
"step": 16065
},
{
"entropy": 5.552061605453491,
"epoch": 1.350094517958412,
"grad_norm": 1.3046875,
"learning_rate": 0.0004821401049790134,
"loss": 5.3343,
"mean_token_accuracy": 0.17743861377239228,
"num_tokens": 29636598.0,
"step": 16070
},
{
"entropy": 5.632044887542724,
"epoch": 1.3505145977735769,
"grad_norm": 1.2890625,
"learning_rate": 0.0004821284104121286,
"loss": 5.3171,
"mean_token_accuracy": 0.17407508194446564,
"num_tokens": 29646052.0,
"step": 16075
},
{
"entropy": 5.57314510345459,
"epoch": 1.3509346775887419,
"grad_norm": 1.3515625,
"learning_rate": 0.00048211671217607066,
"loss": 5.3412,
"mean_token_accuracy": 0.16080363988876342,
"num_tokens": 29655310.0,
"step": 16080
},
{
"entropy": 5.612378406524658,
"epoch": 1.3513547574039069,
"grad_norm": 1.3671875,
"learning_rate": 0.0004821050102710468,
"loss": 5.3478,
"mean_token_accuracy": 0.16932445466518403,
"num_tokens": 29664020.0,
"step": 16085
},
{
"entropy": 5.562947416305542,
"epoch": 1.3517748372190717,
"grad_norm": 1.28125,
"learning_rate": 0.00048209330469726433,
"loss": 5.4038,
"mean_token_accuracy": 0.16574105769395828,
"num_tokens": 29672416.0,
"step": 16090
},
{
"entropy": 5.625362539291382,
"epoch": 1.3521949170342364,
"grad_norm": 1.3125,
"learning_rate": 0.00048208159545493057,
"loss": 5.3024,
"mean_token_accuracy": 0.1789864793419838,
"num_tokens": 29681148.0,
"step": 16095
},
{
"entropy": 5.607744932174683,
"epoch": 1.3526149968494015,
"grad_norm": 1.3359375,
"learning_rate": 0.0004820698825442531,
"loss": 5.2769,
"mean_token_accuracy": 0.18102280795574188,
"num_tokens": 29689089.0,
"step": 16100
},
{
"entropy": 5.589360904693604,
"epoch": 1.3530350766645662,
"grad_norm": 1.4296875,
"learning_rate": 0.00048205816596543914,
"loss": 5.4067,
"mean_token_accuracy": 0.16675178855657577,
"num_tokens": 29697704.0,
"step": 16105
},
{
"entropy": 5.657672452926636,
"epoch": 1.3534551564797312,
"grad_norm": 1.34375,
"learning_rate": 0.00048204644571869646,
"loss": 5.4061,
"mean_token_accuracy": 0.16762982904911042,
"num_tokens": 29706966.0,
"step": 16110
},
{
"entropy": 5.552745246887207,
"epoch": 1.353875236294896,
"grad_norm": 1.390625,
"learning_rate": 0.0004820347218042326,
"loss": 5.2945,
"mean_token_accuracy": 0.16882940977811814,
"num_tokens": 29715817.0,
"step": 16115
},
{
"entropy": 5.604909896850586,
"epoch": 1.3542953161100608,
"grad_norm": 1.3125,
"learning_rate": 0.0004820229942222553,
"loss": 5.3875,
"mean_token_accuracy": 0.1684834063053131,
"num_tokens": 29725500.0,
"step": 16120
},
{
"entropy": 5.613030433654785,
"epoch": 1.3547153959252258,
"grad_norm": 1.46875,
"learning_rate": 0.00048201126297297214,
"loss": 5.3362,
"mean_token_accuracy": 0.16994466930627822,
"num_tokens": 29734774.0,
"step": 16125
},
{
"entropy": 5.57075138092041,
"epoch": 1.3551354757403906,
"grad_norm": 1.2578125,
"learning_rate": 0.0004819995280565911,
"loss": 5.3042,
"mean_token_accuracy": 0.17480856478214263,
"num_tokens": 29744667.0,
"step": 16130
},
{
"entropy": 5.677864265441895,
"epoch": 1.3555555555555556,
"grad_norm": 1.3671875,
"learning_rate": 0.00048198778947332,
"loss": 5.4191,
"mean_token_accuracy": 0.1700369492173195,
"num_tokens": 29753644.0,
"step": 16135
},
{
"entropy": 5.695547485351563,
"epoch": 1.3559756353707204,
"grad_norm": 1.4296875,
"learning_rate": 0.0004819760472233668,
"loss": 5.333,
"mean_token_accuracy": 0.17538298517465592,
"num_tokens": 29762977.0,
"step": 16140
},
{
"entropy": 5.566663360595703,
"epoch": 1.3563957151858852,
"grad_norm": 1.453125,
"learning_rate": 0.00048196430130693956,
"loss": 5.3272,
"mean_token_accuracy": 0.17656562328338624,
"num_tokens": 29772221.0,
"step": 16145
},
{
"entropy": 5.542793655395508,
"epoch": 1.3568157950010502,
"grad_norm": 1.359375,
"learning_rate": 0.00048195255172424627,
"loss": 5.3102,
"mean_token_accuracy": 0.17864325344562532,
"num_tokens": 29781240.0,
"step": 16150
},
{
"entropy": 5.6508362770080565,
"epoch": 1.3572358748162152,
"grad_norm": 1.359375,
"learning_rate": 0.00048194079847549507,
"loss": 5.3015,
"mean_token_accuracy": 0.16976007223129272,
"num_tokens": 29790330.0,
"step": 16155
},
{
"entropy": 5.628068542480468,
"epoch": 1.35765595463138,
"grad_norm": 1.3203125,
"learning_rate": 0.0004819290415608942,
"loss": 5.4186,
"mean_token_accuracy": 0.1703268453478813,
"num_tokens": 29800945.0,
"step": 16160
},
{
"entropy": 5.645391607284546,
"epoch": 1.3580760344465448,
"grad_norm": 1.40625,
"learning_rate": 0.0004819172809806519,
"loss": 5.4733,
"mean_token_accuracy": 0.16720173954963685,
"num_tokens": 29810391.0,
"step": 16165
},
{
"entropy": 5.615267324447632,
"epoch": 1.3584961142617098,
"grad_norm": 1.6015625,
"learning_rate": 0.00048190551673497645,
"loss": 5.335,
"mean_token_accuracy": 0.17374605238437651,
"num_tokens": 29819511.0,
"step": 16170
},
{
"entropy": 5.615129375457764,
"epoch": 1.3589161940768746,
"grad_norm": 1.4609375,
"learning_rate": 0.0004818937488240764,
"loss": 5.3675,
"mean_token_accuracy": 0.1720701590180397,
"num_tokens": 29828313.0,
"step": 16175
},
{
"entropy": 5.549634075164795,
"epoch": 1.3593362738920396,
"grad_norm": 1.3828125,
"learning_rate": 0.00048188197724816014,
"loss": 5.2705,
"mean_token_accuracy": 0.17981562167406082,
"num_tokens": 29837940.0,
"step": 16180
},
{
"entropy": 5.5864886283874515,
"epoch": 1.3597563537072044,
"grad_norm": 1.453125,
"learning_rate": 0.00048187020200743613,
"loss": 5.244,
"mean_token_accuracy": 0.17982228100299835,
"num_tokens": 29846799.0,
"step": 16185
},
{
"entropy": 5.604966831207276,
"epoch": 1.3601764335223692,
"grad_norm": 1.4140625,
"learning_rate": 0.000481858423102113,
"loss": 5.3906,
"mean_token_accuracy": 0.16628821343183517,
"num_tokens": 29856263.0,
"step": 16190
},
{
"entropy": 5.551861810684204,
"epoch": 1.3605965133375342,
"grad_norm": 1.5546875,
"learning_rate": 0.0004818466405323994,
"loss": 5.3115,
"mean_token_accuracy": 0.17324225306510926,
"num_tokens": 29864335.0,
"step": 16195
},
{
"entropy": 5.648256301879883,
"epoch": 1.361016593152699,
"grad_norm": 1.59375,
"learning_rate": 0.00048183485429850417,
"loss": 5.3754,
"mean_token_accuracy": 0.16436336785554886,
"num_tokens": 29873466.0,
"step": 16200
},
{
"entropy": 5.523039293289185,
"epoch": 1.361436672967864,
"grad_norm": 1.328125,
"learning_rate": 0.0004818230644006359,
"loss": 5.3423,
"mean_token_accuracy": 0.17815627604722978,
"num_tokens": 29883051.0,
"step": 16205
},
{
"entropy": 5.5821390628814695,
"epoch": 1.3618567527830288,
"grad_norm": 1.4453125,
"learning_rate": 0.0004818112708390036,
"loss": 5.2884,
"mean_token_accuracy": 0.16962315440177916,
"num_tokens": 29891823.0,
"step": 16210
},
{
"entropy": 5.581261396408081,
"epoch": 1.3622768325981935,
"grad_norm": 1.578125,
"learning_rate": 0.0004817994736138162,
"loss": 5.2927,
"mean_token_accuracy": 0.1753602907061577,
"num_tokens": 29900735.0,
"step": 16215
},
{
"entropy": 5.591730451583862,
"epoch": 1.3626969124133586,
"grad_norm": 1.5546875,
"learning_rate": 0.0004817876727252824,
"loss": 5.3808,
"mean_token_accuracy": 0.17229579836130143,
"num_tokens": 29910345.0,
"step": 16220
},
{
"entropy": 5.5530870914459225,
"epoch": 1.3631169922285233,
"grad_norm": 1.859375,
"learning_rate": 0.00048177586817361166,
"loss": 5.3309,
"mean_token_accuracy": 0.17143950313329698,
"num_tokens": 29919650.0,
"step": 16225
},
{
"entropy": 5.690658140182495,
"epoch": 1.3635370720436883,
"grad_norm": 1.6015625,
"learning_rate": 0.0004817640599590128,
"loss": 5.3736,
"mean_token_accuracy": 0.17006734907627105,
"num_tokens": 29928851.0,
"step": 16230
},
{
"entropy": 5.693109655380249,
"epoch": 1.3639571518588531,
"grad_norm": 1.5,
"learning_rate": 0.00048175224808169506,
"loss": 5.4502,
"mean_token_accuracy": 0.16599725037813187,
"num_tokens": 29939146.0,
"step": 16235
},
{
"entropy": 5.556785535812378,
"epoch": 1.3643772316740181,
"grad_norm": 1.78125,
"learning_rate": 0.00048174043254186775,
"loss": 5.2975,
"mean_token_accuracy": 0.1708006501197815,
"num_tokens": 29947556.0,
"step": 16240
},
{
"entropy": 5.599189519882202,
"epoch": 1.364797311489183,
"grad_norm": 1.328125,
"learning_rate": 0.0004817286133397401,
"loss": 5.3945,
"mean_token_accuracy": 0.16401757150888444,
"num_tokens": 29957319.0,
"step": 16245
},
{
"entropy": 5.641275882720947,
"epoch": 1.365217391304348,
"grad_norm": 1.4453125,
"learning_rate": 0.0004817167904755216,
"loss": 5.3733,
"mean_token_accuracy": 0.1701898068189621,
"num_tokens": 29966697.0,
"step": 16250
},
{
"entropy": 5.638650703430176,
"epoch": 1.3656374711195127,
"grad_norm": 1.59375,
"learning_rate": 0.00048170496394942154,
"loss": 5.3747,
"mean_token_accuracy": 0.16956737339496614,
"num_tokens": 29975103.0,
"step": 16255
},
{
"entropy": 5.499557638168335,
"epoch": 1.3660575509346775,
"grad_norm": 1.4375,
"learning_rate": 0.00048169313376164943,
"loss": 5.2941,
"mean_token_accuracy": 0.17256233394145964,
"num_tokens": 29984865.0,
"step": 16260
},
{
"entropy": 5.568027591705322,
"epoch": 1.3664776307498425,
"grad_norm": 1.3984375,
"learning_rate": 0.00048168129991241497,
"loss": 5.3205,
"mean_token_accuracy": 0.16998280435800553,
"num_tokens": 29994376.0,
"step": 16265
},
{
"entropy": 5.69058051109314,
"epoch": 1.3668977105650073,
"grad_norm": 1.3359375,
"learning_rate": 0.0004816694624019277,
"loss": 5.5187,
"mean_token_accuracy": 0.1662422835826874,
"num_tokens": 30004846.0,
"step": 16270
},
{
"entropy": 5.590044927597046,
"epoch": 1.3673177903801723,
"grad_norm": 1.375,
"learning_rate": 0.00048165762123039723,
"loss": 5.3178,
"mean_token_accuracy": 0.17071129232645035,
"num_tokens": 30014083.0,
"step": 16275
},
{
"entropy": 5.554098129272461,
"epoch": 1.367737870195337,
"grad_norm": 1.25,
"learning_rate": 0.00048164577639803354,
"loss": 5.3038,
"mean_token_accuracy": 0.17434270977973937,
"num_tokens": 30023606.0,
"step": 16280
},
{
"entropy": 5.512778234481812,
"epoch": 1.3681579500105019,
"grad_norm": 2.21875,
"learning_rate": 0.0004816339279050463,
"loss": 5.2985,
"mean_token_accuracy": 0.16695593893527985,
"num_tokens": 30033657.0,
"step": 16285
},
{
"entropy": 5.565143680572509,
"epoch": 1.368578029825667,
"grad_norm": 1.3984375,
"learning_rate": 0.00048162207575164537,
"loss": 5.3552,
"mean_token_accuracy": 0.17300256788730622,
"num_tokens": 30043230.0,
"step": 16290
},
{
"entropy": 5.596675729751587,
"epoch": 1.3689981096408317,
"grad_norm": 1.34375,
"learning_rate": 0.00048161021993804075,
"loss": 5.3818,
"mean_token_accuracy": 0.16435192823410033,
"num_tokens": 30054457.0,
"step": 16295
},
{
"entropy": 5.5473246574401855,
"epoch": 1.3694181894559967,
"grad_norm": 1.53125,
"learning_rate": 0.00048159836046444255,
"loss": 5.2319,
"mean_token_accuracy": 0.17981479614973067,
"num_tokens": 30062912.0,
"step": 16300
},
{
"entropy": 5.594003820419312,
"epoch": 1.3698382692711615,
"grad_norm": 1.5,
"learning_rate": 0.0004815864973310607,
"loss": 5.3733,
"mean_token_accuracy": 0.16775110810995103,
"num_tokens": 30071340.0,
"step": 16305
},
{
"entropy": 5.684501123428345,
"epoch": 1.3702583490863263,
"grad_norm": 1.453125,
"learning_rate": 0.00048157463053810553,
"loss": 5.4493,
"mean_token_accuracy": 0.16390240639448167,
"num_tokens": 30080334.0,
"step": 16310
},
{
"entropy": 5.561945152282715,
"epoch": 1.3706784289014913,
"grad_norm": 1.3671875,
"learning_rate": 0.00048156276008578706,
"loss": 5.3129,
"mean_token_accuracy": 0.17048832327127456,
"num_tokens": 30089391.0,
"step": 16315
},
{
"entropy": 5.521819686889648,
"epoch": 1.3710985087166563,
"grad_norm": 1.5,
"learning_rate": 0.0004815508859743157,
"loss": 5.2878,
"mean_token_accuracy": 0.17092218548059462,
"num_tokens": 30099027.0,
"step": 16320
},
{
"entropy": 5.554465055465698,
"epoch": 1.371518588531821,
"grad_norm": 1.421875,
"learning_rate": 0.0004815390082039017,
"loss": 5.2807,
"mean_token_accuracy": 0.17152634710073472,
"num_tokens": 30108088.0,
"step": 16325
},
{
"entropy": 5.564589929580689,
"epoch": 1.3719386683469859,
"grad_norm": 1.328125,
"learning_rate": 0.00048152712677475556,
"loss": 5.2714,
"mean_token_accuracy": 0.17465380877256392,
"num_tokens": 30117768.0,
"step": 16330
},
{
"entropy": 5.637947416305542,
"epoch": 1.3723587481621509,
"grad_norm": 1.53125,
"learning_rate": 0.00048151524168708773,
"loss": 5.4031,
"mean_token_accuracy": 0.1665814161300659,
"num_tokens": 30126364.0,
"step": 16335
},
{
"entropy": 5.530297708511353,
"epoch": 1.3727788279773157,
"grad_norm": 1.5546875,
"learning_rate": 0.00048150335294110867,
"loss": 5.3372,
"mean_token_accuracy": 0.16926538050174714,
"num_tokens": 30135365.0,
"step": 16340
},
{
"entropy": 5.635163116455078,
"epoch": 1.3731989077924807,
"grad_norm": 1.4921875,
"learning_rate": 0.00048149146053702915,
"loss": 5.4015,
"mean_token_accuracy": 0.1793300449848175,
"num_tokens": 30145542.0,
"step": 16345
},
{
"entropy": 5.653752326965332,
"epoch": 1.3736189876076454,
"grad_norm": 1.4296875,
"learning_rate": 0.0004814795644750597,
"loss": 5.4245,
"mean_token_accuracy": 0.16785451918840408,
"num_tokens": 30154100.0,
"step": 16350
},
{
"entropy": 5.568053197860718,
"epoch": 1.3740390674228102,
"grad_norm": 1.3359375,
"learning_rate": 0.00048146766475541105,
"loss": 5.3135,
"mean_token_accuracy": 0.1715140759944916,
"num_tokens": 30162647.0,
"step": 16355
},
{
"entropy": 5.732910871505737,
"epoch": 1.3744591472379752,
"grad_norm": 1.53125,
"learning_rate": 0.00048145576137829406,
"loss": 5.4498,
"mean_token_accuracy": 0.16248857825994492,
"num_tokens": 30172518.0,
"step": 16360
},
{
"entropy": 5.583808088302613,
"epoch": 1.37487922705314,
"grad_norm": 1.3828125,
"learning_rate": 0.0004814438543439195,
"loss": 5.3845,
"mean_token_accuracy": 0.17373429983854294,
"num_tokens": 30183124.0,
"step": 16365
},
{
"entropy": 5.620030927658081,
"epoch": 1.375299306868305,
"grad_norm": 1.484375,
"learning_rate": 0.0004814319436524984,
"loss": 5.3306,
"mean_token_accuracy": 0.1729157581925392,
"num_tokens": 30191861.0,
"step": 16370
},
{
"entropy": 5.536642169952392,
"epoch": 1.3757193866834698,
"grad_norm": 1.328125,
"learning_rate": 0.00048142002930424174,
"loss": 5.2362,
"mean_token_accuracy": 0.17516896426677703,
"num_tokens": 30200308.0,
"step": 16375
},
{
"entropy": 5.6155558109283445,
"epoch": 1.3761394664986346,
"grad_norm": 1.28125,
"learning_rate": 0.0004814081112993605,
"loss": 5.3384,
"mean_token_accuracy": 0.1747937634587288,
"num_tokens": 30209380.0,
"step": 16380
},
{
"entropy": 5.634831047058105,
"epoch": 1.3765595463137996,
"grad_norm": 1.265625,
"learning_rate": 0.0004813961896380659,
"loss": 5.4439,
"mean_token_accuracy": 0.16628428995609285,
"num_tokens": 30218549.0,
"step": 16385
},
{
"entropy": 5.599160194396973,
"epoch": 1.3769796261289646,
"grad_norm": 1.5234375,
"learning_rate": 0.0004813842643205691,
"loss": 5.3549,
"mean_token_accuracy": 0.1684131532907486,
"num_tokens": 30228119.0,
"step": 16390
},
{
"entropy": 5.634571838378906,
"epoch": 1.3773997059441294,
"grad_norm": 1.546875,
"learning_rate": 0.0004813723353470813,
"loss": 5.3433,
"mean_token_accuracy": 0.16634008586406707,
"num_tokens": 30236765.0,
"step": 16395
},
{
"entropy": 5.611937761306763,
"epoch": 1.3778197857592942,
"grad_norm": 1.2265625,
"learning_rate": 0.0004813604027178139,
"loss": 5.2798,
"mean_token_accuracy": 0.1692429482936859,
"num_tokens": 30246089.0,
"step": 16400
},
{
"entropy": 5.588499641418457,
"epoch": 1.3782398655744592,
"grad_norm": 1.3984375,
"learning_rate": 0.00048134846643297817,
"loss": 5.4012,
"mean_token_accuracy": 0.16748136729002,
"num_tokens": 30255806.0,
"step": 16405
},
{
"entropy": 5.684622478485108,
"epoch": 1.378659945389624,
"grad_norm": 1.3671875,
"learning_rate": 0.0004813365264927856,
"loss": 5.466,
"mean_token_accuracy": 0.160992269217968,
"num_tokens": 30267112.0,
"step": 16410
},
{
"entropy": 5.636025619506836,
"epoch": 1.379080025204789,
"grad_norm": 1.4140625,
"learning_rate": 0.0004813245828974477,
"loss": 5.3219,
"mean_token_accuracy": 0.17303089946508407,
"num_tokens": 30276168.0,
"step": 16415
},
{
"entropy": 5.559030294418335,
"epoch": 1.3795001050199538,
"grad_norm": 1.5625,
"learning_rate": 0.0004813126356471761,
"loss": 5.3653,
"mean_token_accuracy": 0.16919714510440825,
"num_tokens": 30285723.0,
"step": 16420
},
{
"entropy": 5.685474586486817,
"epoch": 1.3799201848351186,
"grad_norm": 1.6171875,
"learning_rate": 0.0004813006847421824,
"loss": 5.4035,
"mean_token_accuracy": 0.17144897878170012,
"num_tokens": 30294790.0,
"step": 16425
},
{
"entropy": 5.600849294662476,
"epoch": 1.3803402646502836,
"grad_norm": 1.3203125,
"learning_rate": 0.0004812887301826783,
"loss": 5.3325,
"mean_token_accuracy": 0.17425192892551422,
"num_tokens": 30303439.0,
"step": 16430
},
{
"entropy": 5.5108733654022215,
"epoch": 1.3807603444654484,
"grad_norm": 1.703125,
"learning_rate": 0.0004812767719688755,
"loss": 5.3223,
"mean_token_accuracy": 0.16848094761371613,
"num_tokens": 30312493.0,
"step": 16435
},
{
"entropy": 5.644097995758057,
"epoch": 1.3811804242806134,
"grad_norm": 1.328125,
"learning_rate": 0.0004812648101009859,
"loss": 5.3547,
"mean_token_accuracy": 0.17637852132320403,
"num_tokens": 30321637.0,
"step": 16440
},
{
"entropy": 5.691900682449341,
"epoch": 1.3816005040957782,
"grad_norm": 1.46875,
"learning_rate": 0.0004812528445792215,
"loss": 5.4764,
"mean_token_accuracy": 0.1603387027978897,
"num_tokens": 30330730.0,
"step": 16445
},
{
"entropy": 5.551471710205078,
"epoch": 1.382020583910943,
"grad_norm": 1.328125,
"learning_rate": 0.00048124087540379407,
"loss": 5.3202,
"mean_token_accuracy": 0.171609827876091,
"num_tokens": 30339568.0,
"step": 16450
},
{
"entropy": 5.560440397262573,
"epoch": 1.382440663726108,
"grad_norm": 1.84375,
"learning_rate": 0.00048122890257491573,
"loss": 5.3833,
"mean_token_accuracy": 0.16492262333631516,
"num_tokens": 30349225.0,
"step": 16455
},
{
"entropy": 5.654400062561035,
"epoch": 1.382860743541273,
"grad_norm": 1.3203125,
"learning_rate": 0.00048121692609279866,
"loss": 5.367,
"mean_token_accuracy": 0.17373183816671373,
"num_tokens": 30358804.0,
"step": 16460
},
{
"entropy": 5.627988862991333,
"epoch": 1.3832808233564378,
"grad_norm": 1.3046875,
"learning_rate": 0.0004812049459576549,
"loss": 5.4346,
"mean_token_accuracy": 0.1719113200902939,
"num_tokens": 30368490.0,
"step": 16465
},
{
"entropy": 5.666933012008667,
"epoch": 1.3837009031716025,
"grad_norm": 1.5234375,
"learning_rate": 0.0004811929621696966,
"loss": 5.3319,
"mean_token_accuracy": 0.1665458783507347,
"num_tokens": 30377117.0,
"step": 16470
},
{
"entropy": 5.531878280639648,
"epoch": 1.3841209829867676,
"grad_norm": 1.2734375,
"learning_rate": 0.00048118097472913627,
"loss": 5.2197,
"mean_token_accuracy": 0.17990321069955825,
"num_tokens": 30385151.0,
"step": 16475
},
{
"entropy": 5.4649818420410154,
"epoch": 1.3845410628019323,
"grad_norm": 1.328125,
"learning_rate": 0.0004811689836361861,
"loss": 5.2596,
"mean_token_accuracy": 0.1737077608704567,
"num_tokens": 30394837.0,
"step": 16480
},
{
"entropy": 5.559386777877807,
"epoch": 1.3849611426170974,
"grad_norm": 1.640625,
"learning_rate": 0.0004811569888910585,
"loss": 5.3214,
"mean_token_accuracy": 0.17485344558954238,
"num_tokens": 30403507.0,
"step": 16485
},
{
"entropy": 5.553847122192383,
"epoch": 1.3853812224322621,
"grad_norm": 1.484375,
"learning_rate": 0.0004811449904939661,
"loss": 5.3167,
"mean_token_accuracy": 0.17440947741270066,
"num_tokens": 30412941.0,
"step": 16490
},
{
"entropy": 5.626542854309082,
"epoch": 1.385801302247427,
"grad_norm": 1.421875,
"learning_rate": 0.00048113298844512127,
"loss": 5.3033,
"mean_token_accuracy": 0.17789137810468675,
"num_tokens": 30421823.0,
"step": 16495
},
{
"entropy": 5.583016681671142,
"epoch": 1.386221382062592,
"grad_norm": 1.265625,
"learning_rate": 0.0004811209827447367,
"loss": 5.4108,
"mean_token_accuracy": 0.16509422063827514,
"num_tokens": 30431901.0,
"step": 16500
},
{
"entropy": 5.559862089157105,
"epoch": 1.3866414618777567,
"grad_norm": 1.5546875,
"learning_rate": 0.00048110897339302504,
"loss": 5.3254,
"mean_token_accuracy": 0.16900486946105958,
"num_tokens": 30442037.0,
"step": 16505
},
{
"entropy": 5.608466672897339,
"epoch": 1.3870615416929217,
"grad_norm": 1.3203125,
"learning_rate": 0.00048109696039019915,
"loss": 5.3071,
"mean_token_accuracy": 0.17851560711860656,
"num_tokens": 30451189.0,
"step": 16510
},
{
"entropy": 5.626796579360962,
"epoch": 1.3874816215080865,
"grad_norm": 1.4375,
"learning_rate": 0.0004810849437364716,
"loss": 5.3873,
"mean_token_accuracy": 0.16870215982198716,
"num_tokens": 30460214.0,
"step": 16515
},
{
"entropy": 5.637179136276245,
"epoch": 1.3879017013232513,
"grad_norm": 1.3359375,
"learning_rate": 0.00048107292343205546,
"loss": 5.4029,
"mean_token_accuracy": 0.16420054137706758,
"num_tokens": 30469936.0,
"step": 16520
},
{
"entropy": 5.597711706161499,
"epoch": 1.3883217811384163,
"grad_norm": 1.3828125,
"learning_rate": 0.0004810608994771636,
"loss": 5.3424,
"mean_token_accuracy": 0.1725441485643387,
"num_tokens": 30479282.0,
"step": 16525
},
{
"entropy": 5.640149974822998,
"epoch": 1.388741860953581,
"grad_norm": 1.296875,
"learning_rate": 0.000481048871872009,
"loss": 5.3899,
"mean_token_accuracy": 0.17059538662433624,
"num_tokens": 30487839.0,
"step": 16530
},
{
"entropy": 5.703711271286011,
"epoch": 1.389161940768746,
"grad_norm": 1.3984375,
"learning_rate": 0.00048103684061680463,
"loss": 5.4192,
"mean_token_accuracy": 0.16955755650997162,
"num_tokens": 30497327.0,
"step": 16535
},
{
"entropy": 5.620444583892822,
"epoch": 1.389582020583911,
"grad_norm": 1.375,
"learning_rate": 0.00048102480571176384,
"loss": 5.3111,
"mean_token_accuracy": 0.1740582123398781,
"num_tokens": 30506996.0,
"step": 16540
},
{
"entropy": 5.5478297710418705,
"epoch": 1.390002100399076,
"grad_norm": 1.40625,
"learning_rate": 0.0004810127671570997,
"loss": 5.253,
"mean_token_accuracy": 0.1782710075378418,
"num_tokens": 30515627.0,
"step": 16545
},
{
"entropy": 5.62630558013916,
"epoch": 1.3904221802142407,
"grad_norm": 1.4140625,
"learning_rate": 0.00048100072495302544,
"loss": 5.3837,
"mean_token_accuracy": 0.1714572563767433,
"num_tokens": 30525858.0,
"step": 16550
},
{
"entropy": 5.5055724620819095,
"epoch": 1.3908422600294057,
"grad_norm": 1.7265625,
"learning_rate": 0.0004809886790997544,
"loss": 5.2677,
"mean_token_accuracy": 0.1812590092420578,
"num_tokens": 30536331.0,
"step": 16555
},
{
"entropy": 5.613178396224976,
"epoch": 1.3912623398445705,
"grad_norm": 1.3125,
"learning_rate": 0.0004809766295975,
"loss": 5.328,
"mean_token_accuracy": 0.1739115834236145,
"num_tokens": 30545329.0,
"step": 16560
},
{
"entropy": 5.592377233505249,
"epoch": 1.3916824196597353,
"grad_norm": 1.53125,
"learning_rate": 0.0004809645764464757,
"loss": 5.2901,
"mean_token_accuracy": 0.16965100020170212,
"num_tokens": 30554357.0,
"step": 16565
},
{
"entropy": 5.58855447769165,
"epoch": 1.3921024994749003,
"grad_norm": 1.515625,
"learning_rate": 0.00048095251964689494,
"loss": 5.4646,
"mean_token_accuracy": 0.16623444408178328,
"num_tokens": 30563548.0,
"step": 16570
},
{
"entropy": 5.559497594833374,
"epoch": 1.392522579290065,
"grad_norm": 1.4140625,
"learning_rate": 0.00048094045919897134,
"loss": 5.3392,
"mean_token_accuracy": 0.17339515388011933,
"num_tokens": 30572844.0,
"step": 16575
},
{
"entropy": 5.588882350921631,
"epoch": 1.39294265910523,
"grad_norm": 1.3515625,
"learning_rate": 0.0004809283951029185,
"loss": 5.259,
"mean_token_accuracy": 0.17575060725212097,
"num_tokens": 30580930.0,
"step": 16580
},
{
"entropy": 5.580895280838012,
"epoch": 1.3933627389203949,
"grad_norm": 1.3515625,
"learning_rate": 0.0004809163273589503,
"loss": 5.2652,
"mean_token_accuracy": 0.17910676598548889,
"num_tokens": 30589917.0,
"step": 16585
},
{
"entropy": 5.534520387649536,
"epoch": 1.3937828187355596,
"grad_norm": 1.234375,
"learning_rate": 0.00048090425596728035,
"loss": 5.3769,
"mean_token_accuracy": 0.16942780315876008,
"num_tokens": 30599282.0,
"step": 16590
},
{
"entropy": 5.594612646102905,
"epoch": 1.3942028985507247,
"grad_norm": 1.3515625,
"learning_rate": 0.00048089218092812254,
"loss": 5.3515,
"mean_token_accuracy": 0.16896623820066453,
"num_tokens": 30608244.0,
"step": 16595
},
{
"entropy": 5.6988067626953125,
"epoch": 1.3946229783658894,
"grad_norm": 1.4296875,
"learning_rate": 0.00048088010224169064,
"loss": 5.4811,
"mean_token_accuracy": 0.1701745718717575,
"num_tokens": 30617340.0,
"step": 16600
},
{
"entropy": 5.697456169128418,
"epoch": 1.3950430581810545,
"grad_norm": 1.4453125,
"learning_rate": 0.00048086801990819886,
"loss": 5.3959,
"mean_token_accuracy": 0.16495682895183564,
"num_tokens": 30626244.0,
"step": 16605
},
{
"entropy": 5.584010171890259,
"epoch": 1.3954631379962192,
"grad_norm": 1.4921875,
"learning_rate": 0.00048085593392786113,
"loss": 5.3784,
"mean_token_accuracy": 0.17467126101255417,
"num_tokens": 30635279.0,
"step": 16610
},
{
"entropy": 5.656182336807251,
"epoch": 1.395883217811384,
"grad_norm": 1.3203125,
"learning_rate": 0.0004808438443008915,
"loss": 5.5187,
"mean_token_accuracy": 0.1635328322649002,
"num_tokens": 30645790.0,
"step": 16615
},
{
"entropy": 5.559104681015015,
"epoch": 1.396303297626549,
"grad_norm": 1.46875,
"learning_rate": 0.0004808317510275041,
"loss": 5.3441,
"mean_token_accuracy": 0.16912484169006348,
"num_tokens": 30654497.0,
"step": 16620
},
{
"entropy": 5.603745460510254,
"epoch": 1.396723377441714,
"grad_norm": 1.25,
"learning_rate": 0.0004808196541079133,
"loss": 5.4183,
"mean_token_accuracy": 0.16523861289024352,
"num_tokens": 30663760.0,
"step": 16625
},
{
"entropy": 5.612056541442871,
"epoch": 1.3971434572568788,
"grad_norm": 1.4453125,
"learning_rate": 0.00048080755354233326,
"loss": 5.4158,
"mean_token_accuracy": 0.16850531101226807,
"num_tokens": 30674263.0,
"step": 16630
},
{
"entropy": 5.634685134887695,
"epoch": 1.3975635370720436,
"grad_norm": 1.46875,
"learning_rate": 0.0004807954493309784,
"loss": 5.2985,
"mean_token_accuracy": 0.16984269618988038,
"num_tokens": 30683501.0,
"step": 16635
},
{
"entropy": 5.582397985458374,
"epoch": 1.3979836168872086,
"grad_norm": 1.4921875,
"learning_rate": 0.00048078334147406314,
"loss": 5.2775,
"mean_token_accuracy": 0.18322759866714478,
"num_tokens": 30691917.0,
"step": 16640
},
{
"entropy": 5.563442945480347,
"epoch": 1.3984036967023734,
"grad_norm": 1.2734375,
"learning_rate": 0.00048077122997180197,
"loss": 5.3732,
"mean_token_accuracy": 0.1712018147110939,
"num_tokens": 30701753.0,
"step": 16645
},
{
"entropy": 5.523807716369629,
"epoch": 1.3988237765175384,
"grad_norm": 1.3359375,
"learning_rate": 0.0004807591148244093,
"loss": 5.3397,
"mean_token_accuracy": 0.172254741191864,
"num_tokens": 30710878.0,
"step": 16650
},
{
"entropy": 5.521091270446777,
"epoch": 1.3992438563327032,
"grad_norm": 1.2265625,
"learning_rate": 0.0004807469960321,
"loss": 5.2342,
"mean_token_accuracy": 0.1791976734995842,
"num_tokens": 30719372.0,
"step": 16655
},
{
"entropy": 5.561942672729492,
"epoch": 1.399663936147868,
"grad_norm": 1.328125,
"learning_rate": 0.00048073487359508854,
"loss": 5.3985,
"mean_token_accuracy": 0.16648093461990357,
"num_tokens": 30728529.0,
"step": 16660
},
{
"entropy": 5.659975481033325,
"epoch": 1.400084015963033,
"grad_norm": 1.390625,
"learning_rate": 0.00048072274751358976,
"loss": 5.3383,
"mean_token_accuracy": 0.17329138070344924,
"num_tokens": 30737704.0,
"step": 16665
},
{
"entropy": 5.5603667259216305,
"epoch": 1.4005040957781978,
"grad_norm": 1.390625,
"learning_rate": 0.00048071061778781843,
"loss": 5.3155,
"mean_token_accuracy": 0.16734694093465804,
"num_tokens": 30747836.0,
"step": 16670
},
{
"entropy": 5.526829862594605,
"epoch": 1.4009241755933628,
"grad_norm": 1.3671875,
"learning_rate": 0.0004806984844179894,
"loss": 5.3789,
"mean_token_accuracy": 0.1673613667488098,
"num_tokens": 30757881.0,
"step": 16675
},
{
"entropy": 5.60459885597229,
"epoch": 1.4013442554085276,
"grad_norm": 1.3671875,
"learning_rate": 0.00048068634740431774,
"loss": 5.3838,
"mean_token_accuracy": 0.16605682522058487,
"num_tokens": 30767592.0,
"step": 16680
},
{
"entropy": 5.589730167388916,
"epoch": 1.4017643352236924,
"grad_norm": 1.4921875,
"learning_rate": 0.0004806742067470182,
"loss": 5.3626,
"mean_token_accuracy": 0.17218761295080184,
"num_tokens": 30776633.0,
"step": 16685
},
{
"entropy": 5.658456707000733,
"epoch": 1.4021844150388574,
"grad_norm": 1.3515625,
"learning_rate": 0.00048066206244630613,
"loss": 5.2974,
"mean_token_accuracy": 0.17236762195825578,
"num_tokens": 30785195.0,
"step": 16690
},
{
"entropy": 5.5110338687896725,
"epoch": 1.4026044948540224,
"grad_norm": 1.5234375,
"learning_rate": 0.00048064991450239643,
"loss": 5.3155,
"mean_token_accuracy": 0.17006028145551683,
"num_tokens": 30794397.0,
"step": 16695
},
{
"entropy": 5.690442705154419,
"epoch": 1.4030245746691872,
"grad_norm": 1.34375,
"learning_rate": 0.00048063776291550444,
"loss": 5.4542,
"mean_token_accuracy": 0.16505993902683258,
"num_tokens": 30803312.0,
"step": 16700
},
{
"entropy": 5.671812105178833,
"epoch": 1.403444654484352,
"grad_norm": 1.359375,
"learning_rate": 0.00048062560768584537,
"loss": 5.3696,
"mean_token_accuracy": 0.17488641738891603,
"num_tokens": 30812519.0,
"step": 16705
},
{
"entropy": 5.541071128845215,
"epoch": 1.403864734299517,
"grad_norm": 1.40625,
"learning_rate": 0.00048061344881363444,
"loss": 5.3093,
"mean_token_accuracy": 0.17705396711826324,
"num_tokens": 30821558.0,
"step": 16710
},
{
"entropy": 5.611396789550781,
"epoch": 1.4042848141146818,
"grad_norm": 1.2421875,
"learning_rate": 0.0004806012862990873,
"loss": 5.3363,
"mean_token_accuracy": 0.1747825935482979,
"num_tokens": 30831521.0,
"step": 16715
},
{
"entropy": 5.608307790756226,
"epoch": 1.4047048939298468,
"grad_norm": 1.3125,
"learning_rate": 0.00048058912014241914,
"loss": 5.3266,
"mean_token_accuracy": 0.16965138018131257,
"num_tokens": 30841191.0,
"step": 16720
},
{
"entropy": 5.619459724426269,
"epoch": 1.4051249737450116,
"grad_norm": 1.3984375,
"learning_rate": 0.0004805769503438456,
"loss": 5.439,
"mean_token_accuracy": 0.17043500691652297,
"num_tokens": 30850556.0,
"step": 16725
},
{
"entropy": 5.599527263641358,
"epoch": 1.4055450535601763,
"grad_norm": 1.375,
"learning_rate": 0.00048056477690358227,
"loss": 5.3227,
"mean_token_accuracy": 0.17971756011247636,
"num_tokens": 30859410.0,
"step": 16730
},
{
"entropy": 5.679393196105957,
"epoch": 1.4059651333753413,
"grad_norm": 1.3671875,
"learning_rate": 0.0004805525998218447,
"loss": 5.3511,
"mean_token_accuracy": 0.16887875497341157,
"num_tokens": 30868048.0,
"step": 16735
},
{
"entropy": 5.608558559417725,
"epoch": 1.4063852131905061,
"grad_norm": 1.2734375,
"learning_rate": 0.00048054041909884873,
"loss": 5.3663,
"mean_token_accuracy": 0.1683382585644722,
"num_tokens": 30876785.0,
"step": 16740
},
{
"entropy": 5.655353260040283,
"epoch": 1.4068052930056711,
"grad_norm": 1.3984375,
"learning_rate": 0.00048052823473481007,
"loss": 5.4278,
"mean_token_accuracy": 0.16634053140878677,
"num_tokens": 30886158.0,
"step": 16745
},
{
"entropy": 5.644631242752075,
"epoch": 1.407225372820836,
"grad_norm": 1.375,
"learning_rate": 0.00048051604672994446,
"loss": 5.2942,
"mean_token_accuracy": 0.17192937582731246,
"num_tokens": 30895283.0,
"step": 16750
},
{
"entropy": 5.58252534866333,
"epoch": 1.4076454526360007,
"grad_norm": 1.375,
"learning_rate": 0.00048050385508446804,
"loss": 5.3403,
"mean_token_accuracy": 0.17440515607595444,
"num_tokens": 30905514.0,
"step": 16755
},
{
"entropy": 5.596921396255493,
"epoch": 1.4080655324511657,
"grad_norm": 1.296875,
"learning_rate": 0.00048049165979859655,
"loss": 5.2436,
"mean_token_accuracy": 0.1878000468015671,
"num_tokens": 30914794.0,
"step": 16760
},
{
"entropy": 5.580918025970459,
"epoch": 1.4084856122663307,
"grad_norm": 1.375,
"learning_rate": 0.00048047946087254615,
"loss": 5.2736,
"mean_token_accuracy": 0.17217740267515183,
"num_tokens": 30923823.0,
"step": 16765
},
{
"entropy": 5.567533493041992,
"epoch": 1.4089056920814955,
"grad_norm": 1.6015625,
"learning_rate": 0.00048046725830653295,
"loss": 5.3996,
"mean_token_accuracy": 0.1653307020664215,
"num_tokens": 30932738.0,
"step": 16770
},
{
"entropy": 5.601448440551758,
"epoch": 1.4093257718966603,
"grad_norm": 1.3984375,
"learning_rate": 0.00048045505210077304,
"loss": 5.4055,
"mean_token_accuracy": 0.16473395228385926,
"num_tokens": 30942302.0,
"step": 16775
},
{
"entropy": 5.64259181022644,
"epoch": 1.4097458517118253,
"grad_norm": 1.40625,
"learning_rate": 0.0004804428422554826,
"loss": 5.3062,
"mean_token_accuracy": 0.17126897871494293,
"num_tokens": 30951662.0,
"step": 16780
},
{
"entropy": 5.56359715461731,
"epoch": 1.41016593152699,
"grad_norm": 1.2421875,
"learning_rate": 0.0004804306287708782,
"loss": 5.3425,
"mean_token_accuracy": 0.17381667643785476,
"num_tokens": 30960475.0,
"step": 16785
},
{
"entropy": 5.5127181053161625,
"epoch": 1.410586011342155,
"grad_norm": 1.3671875,
"learning_rate": 0.00048041841164717574,
"loss": 5.1639,
"mean_token_accuracy": 0.1837026745080948,
"num_tokens": 30969075.0,
"step": 16790
},
{
"entropy": 5.53020658493042,
"epoch": 1.41100609115732,
"grad_norm": 1.3828125,
"learning_rate": 0.0004804061908845921,
"loss": 5.2567,
"mean_token_accuracy": 0.1792847916483879,
"num_tokens": 30978030.0,
"step": 16795
},
{
"entropy": 5.5579996585845945,
"epoch": 1.4114261709724847,
"grad_norm": 1.3359375,
"learning_rate": 0.00048039396648334346,
"loss": 5.2236,
"mean_token_accuracy": 0.17854771614074708,
"num_tokens": 30985639.0,
"step": 16800
},
{
"entropy": 5.6335865497589115,
"epoch": 1.4118462507876497,
"grad_norm": 1.234375,
"learning_rate": 0.0004803817384436465,
"loss": 5.3762,
"mean_token_accuracy": 0.16809797286987305,
"num_tokens": 30994811.0,
"step": 16805
},
{
"entropy": 5.600879812240601,
"epoch": 1.4122663306028145,
"grad_norm": 1.421875,
"learning_rate": 0.0004803695067657178,
"loss": 5.3247,
"mean_token_accuracy": 0.17350828349590303,
"num_tokens": 31003813.0,
"step": 16810
},
{
"entropy": 5.53850474357605,
"epoch": 1.4126864104179795,
"grad_norm": 1.421875,
"learning_rate": 0.000480357271449774,
"loss": 5.291,
"mean_token_accuracy": 0.18001099973917006,
"num_tokens": 31012488.0,
"step": 16815
},
{
"entropy": 5.565526103973388,
"epoch": 1.4131064902331443,
"grad_norm": 1.484375,
"learning_rate": 0.0004803450324960318,
"loss": 5.3116,
"mean_token_accuracy": 0.17441705465316773,
"num_tokens": 31021089.0,
"step": 16820
},
{
"entropy": 5.5931884288787845,
"epoch": 1.413526570048309,
"grad_norm": 1.359375,
"learning_rate": 0.00048033278990470825,
"loss": 5.3189,
"mean_token_accuracy": 0.17343529164791108,
"num_tokens": 31029903.0,
"step": 16825
},
{
"entropy": 5.592469024658203,
"epoch": 1.413946649863474,
"grad_norm": 1.2734375,
"learning_rate": 0.00048032054367601996,
"loss": 5.3165,
"mean_token_accuracy": 0.172076952457428,
"num_tokens": 31039207.0,
"step": 16830
},
{
"entropy": 5.541013956069946,
"epoch": 1.414366729678639,
"grad_norm": 1.28125,
"learning_rate": 0.00048030829381018396,
"loss": 5.3679,
"mean_token_accuracy": 0.16457488387823105,
"num_tokens": 31048190.0,
"step": 16835
},
{
"entropy": 5.593619680404663,
"epoch": 1.4147868094938039,
"grad_norm": 1.3125,
"learning_rate": 0.0004802960403074173,
"loss": 5.4119,
"mean_token_accuracy": 0.1732983648777008,
"num_tokens": 31058769.0,
"step": 16840
},
{
"entropy": 5.600053405761718,
"epoch": 1.4152068893089687,
"grad_norm": 1.40625,
"learning_rate": 0.00048028378316793705,
"loss": 5.3903,
"mean_token_accuracy": 0.16838869005441665,
"num_tokens": 31066830.0,
"step": 16845
},
{
"entropy": 5.6990594387054445,
"epoch": 1.4156269691241337,
"grad_norm": 1.2265625,
"learning_rate": 0.0004802715223919602,
"loss": 5.432,
"mean_token_accuracy": 0.1703927457332611,
"num_tokens": 31077205.0,
"step": 16850
},
{
"entropy": 5.649413204193115,
"epoch": 1.4160470489392984,
"grad_norm": 1.3125,
"learning_rate": 0.00048025925797970403,
"loss": 5.357,
"mean_token_accuracy": 0.17568007558584214,
"num_tokens": 31087327.0,
"step": 16855
},
{
"entropy": 5.495763635635376,
"epoch": 1.4164671287544635,
"grad_norm": 1.34375,
"learning_rate": 0.00048024698993138587,
"loss": 5.3054,
"mean_token_accuracy": 0.17436772882938384,
"num_tokens": 31096501.0,
"step": 16860
},
{
"entropy": 5.6352144241333,
"epoch": 1.4168872085696282,
"grad_norm": 1.2421875,
"learning_rate": 0.00048023471824722294,
"loss": 5.4804,
"mean_token_accuracy": 0.16082673370838166,
"num_tokens": 31105949.0,
"step": 16865
},
{
"entropy": 5.682409048080444,
"epoch": 1.417307288384793,
"grad_norm": 1.4375,
"learning_rate": 0.00048022244292743256,
"loss": 5.3824,
"mean_token_accuracy": 0.1642298310995102,
"num_tokens": 31115482.0,
"step": 16870
},
{
"entropy": 5.622606229782105,
"epoch": 1.417727368199958,
"grad_norm": 1.59375,
"learning_rate": 0.00048021016397223234,
"loss": 5.3241,
"mean_token_accuracy": 0.17723027169704436,
"num_tokens": 31124758.0,
"step": 16875
},
{
"entropy": 5.528745889663696,
"epoch": 1.4181474480151228,
"grad_norm": 1.2890625,
"learning_rate": 0.00048019788138183977,
"loss": 5.2143,
"mean_token_accuracy": 0.18183207660913467,
"num_tokens": 31134114.0,
"step": 16880
},
{
"entropy": 5.507150077819825,
"epoch": 1.4185675278302878,
"grad_norm": 1.3671875,
"learning_rate": 0.00048018559515647244,
"loss": 5.2678,
"mean_token_accuracy": 0.17222765535116197,
"num_tokens": 31142667.0,
"step": 16885
},
{
"entropy": 5.556279134750366,
"epoch": 1.4189876076454526,
"grad_norm": 1.2265625,
"learning_rate": 0.00048017330529634785,
"loss": 5.3669,
"mean_token_accuracy": 0.1647636979818344,
"num_tokens": 31152105.0,
"step": 16890
},
{
"entropy": 5.6083588123321535,
"epoch": 1.4194076874606174,
"grad_norm": 1.296875,
"learning_rate": 0.00048016101180168376,
"loss": 5.3352,
"mean_token_accuracy": 0.17789842784404755,
"num_tokens": 31160277.0,
"step": 16895
},
{
"entropy": 5.702673673629761,
"epoch": 1.4198277672757824,
"grad_norm": 1.46875,
"learning_rate": 0.00048014871467269804,
"loss": 5.5348,
"mean_token_accuracy": 0.15722108483314515,
"num_tokens": 31170677.0,
"step": 16900
},
{
"entropy": 5.59203953742981,
"epoch": 1.4202478470909472,
"grad_norm": 1.390625,
"learning_rate": 0.00048013641390960856,
"loss": 5.3194,
"mean_token_accuracy": 0.16935041695833206,
"num_tokens": 31179298.0,
"step": 16905
},
{
"entropy": 5.612672281265259,
"epoch": 1.4206679269061122,
"grad_norm": 1.6328125,
"learning_rate": 0.0004801241095126331,
"loss": 5.3425,
"mean_token_accuracy": 0.16731651723384858,
"num_tokens": 31188547.0,
"step": 16910
},
{
"entropy": 5.543025302886963,
"epoch": 1.421088006721277,
"grad_norm": 1.359375,
"learning_rate": 0.0004801118014819896,
"loss": 5.3283,
"mean_token_accuracy": 0.1734934628009796,
"num_tokens": 31197680.0,
"step": 16915
},
{
"entropy": 5.553297996520996,
"epoch": 1.421508086536442,
"grad_norm": 1.3203125,
"learning_rate": 0.0004800994898178962,
"loss": 5.2894,
"mean_token_accuracy": 0.17648072838783263,
"num_tokens": 31206351.0,
"step": 16920
},
{
"entropy": 5.6083721160888675,
"epoch": 1.4219281663516068,
"grad_norm": 1.484375,
"learning_rate": 0.0004800871745205708,
"loss": 5.4956,
"mean_token_accuracy": 0.1636019006371498,
"num_tokens": 31216478.0,
"step": 16925
},
{
"entropy": 5.699634027481079,
"epoch": 1.4223482461667718,
"grad_norm": 1.25,
"learning_rate": 0.00048007485559023195,
"loss": 5.4333,
"mean_token_accuracy": 0.1666202038526535,
"num_tokens": 31225920.0,
"step": 16930
},
{
"entropy": 5.571429109573364,
"epoch": 1.4227683259819366,
"grad_norm": 1.609375,
"learning_rate": 0.0004800625330270975,
"loss": 5.3111,
"mean_token_accuracy": 0.17457341700792312,
"num_tokens": 31235061.0,
"step": 16935
},
{
"entropy": 5.520137023925781,
"epoch": 1.4231884057971014,
"grad_norm": 1.2578125,
"learning_rate": 0.0004800502068313859,
"loss": 5.2904,
"mean_token_accuracy": 0.17647054940462112,
"num_tokens": 31243448.0,
"step": 16940
},
{
"entropy": 5.617452144622803,
"epoch": 1.4236084856122664,
"grad_norm": 1.3671875,
"learning_rate": 0.0004800378770033154,
"loss": 5.4098,
"mean_token_accuracy": 0.17099433839321138,
"num_tokens": 31252569.0,
"step": 16945
},
{
"entropy": 5.5876306056976315,
"epoch": 1.4240285654274312,
"grad_norm": 1.3984375,
"learning_rate": 0.0004800255435431046,
"loss": 5.2803,
"mean_token_accuracy": 0.17398663237690926,
"num_tokens": 31261905.0,
"step": 16950
},
{
"entropy": 5.488924217224121,
"epoch": 1.4244486452425962,
"grad_norm": 1.328125,
"learning_rate": 0.00048001320645097177,
"loss": 5.2568,
"mean_token_accuracy": 0.17906904369592666,
"num_tokens": 31271203.0,
"step": 16955
},
{
"entropy": 5.51849217414856,
"epoch": 1.424868725057761,
"grad_norm": 1.671875,
"learning_rate": 0.00048000086572713566,
"loss": 5.2776,
"mean_token_accuracy": 0.18182590752840042,
"num_tokens": 31279812.0,
"step": 16960
},
{
"entropy": 5.593318033218384,
"epoch": 1.4252888048729258,
"grad_norm": 1.375,
"learning_rate": 0.0004799885213718147,
"loss": 5.3103,
"mean_token_accuracy": 0.1696845307946205,
"num_tokens": 31289615.0,
"step": 16965
},
{
"entropy": 5.5744716167449955,
"epoch": 1.4257088846880908,
"grad_norm": 1.4375,
"learning_rate": 0.00047997617338522763,
"loss": 5.2527,
"mean_token_accuracy": 0.18043006360530853,
"num_tokens": 31298947.0,
"step": 16970
},
{
"entropy": 5.524801540374756,
"epoch": 1.4261289645032555,
"grad_norm": 1.4921875,
"learning_rate": 0.00047996382176759324,
"loss": 5.2413,
"mean_token_accuracy": 0.17649227678775786,
"num_tokens": 31307465.0,
"step": 16975
},
{
"entropy": 5.524610996246338,
"epoch": 1.4265490443184206,
"grad_norm": 1.359375,
"learning_rate": 0.0004799514665191303,
"loss": 5.3515,
"mean_token_accuracy": 0.17230029702186583,
"num_tokens": 31317682.0,
"step": 16980
},
{
"entropy": 5.602631425857544,
"epoch": 1.4269691241335853,
"grad_norm": 1.484375,
"learning_rate": 0.0004799391076400576,
"loss": 5.3558,
"mean_token_accuracy": 0.16894156187772752,
"num_tokens": 31326113.0,
"step": 16985
},
{
"entropy": 5.650381135940552,
"epoch": 1.4273892039487501,
"grad_norm": 1.3203125,
"learning_rate": 0.00047992674513059415,
"loss": 5.4024,
"mean_token_accuracy": 0.17357337921857835,
"num_tokens": 31335263.0,
"step": 16990
},
{
"entropy": 5.589767742156982,
"epoch": 1.4278092837639151,
"grad_norm": 1.34375,
"learning_rate": 0.00047991437899095896,
"loss": 5.3421,
"mean_token_accuracy": 0.17734005600214003,
"num_tokens": 31344503.0,
"step": 16995
},
{
"entropy": 5.551265811920166,
"epoch": 1.4282293635790801,
"grad_norm": 1.46875,
"learning_rate": 0.00047990200922137105,
"loss": 5.3727,
"mean_token_accuracy": 0.1755639910697937,
"num_tokens": 31354530.0,
"step": 17000
},
{
"entropy": 5.5601198196411135,
"epoch": 1.428649443394245,
"grad_norm": 1.3125,
"learning_rate": 0.0004798896358220496,
"loss": 5.2157,
"mean_token_accuracy": 0.1786619856953621,
"num_tokens": 31362761.0,
"step": 17005
},
{
"entropy": 5.539676380157471,
"epoch": 1.4290695232094097,
"grad_norm": 1.375,
"learning_rate": 0.0004798772587932137,
"loss": 5.2508,
"mean_token_accuracy": 0.17362561970949172,
"num_tokens": 31372933.0,
"step": 17010
},
{
"entropy": 5.60802321434021,
"epoch": 1.4294896030245747,
"grad_norm": 1.3515625,
"learning_rate": 0.0004798648781350826,
"loss": 5.4445,
"mean_token_accuracy": 0.17000383734703065,
"num_tokens": 31382651.0,
"step": 17015
},
{
"entropy": 5.591900968551636,
"epoch": 1.4299096828397395,
"grad_norm": 1.3125,
"learning_rate": 0.0004798524938478758,
"loss": 5.379,
"mean_token_accuracy": 0.16931596547365188,
"num_tokens": 31392272.0,
"step": 17020
},
{
"entropy": 5.641653537750244,
"epoch": 1.4303297626549045,
"grad_norm": 1.3671875,
"learning_rate": 0.0004798401059318124,
"loss": 5.2814,
"mean_token_accuracy": 0.17356303334236145,
"num_tokens": 31400684.0,
"step": 17025
},
{
"entropy": 5.596721267700195,
"epoch": 1.4307498424700693,
"grad_norm": 1.328125,
"learning_rate": 0.0004798277143871122,
"loss": 5.29,
"mean_token_accuracy": 0.177234947681427,
"num_tokens": 31409082.0,
"step": 17030
},
{
"entropy": 5.524614763259888,
"epoch": 1.431169922285234,
"grad_norm": 1.3984375,
"learning_rate": 0.0004798153192139944,
"loss": 5.246,
"mean_token_accuracy": 0.17682832330465317,
"num_tokens": 31417415.0,
"step": 17035
},
{
"entropy": 5.54094295501709,
"epoch": 1.431590002100399,
"grad_norm": 1.4375,
"learning_rate": 0.0004798029204126786,
"loss": 5.397,
"mean_token_accuracy": 0.17507326304912568,
"num_tokens": 31427510.0,
"step": 17040
},
{
"entropy": 5.530172109603882,
"epoch": 1.432010081915564,
"grad_norm": 1.34375,
"learning_rate": 0.0004797905179833847,
"loss": 5.2554,
"mean_token_accuracy": 0.17984679490327835,
"num_tokens": 31436187.0,
"step": 17045
},
{
"entropy": 5.5713074684143065,
"epoch": 1.432430161730729,
"grad_norm": 1.6875,
"learning_rate": 0.0004797781119263321,
"loss": 5.2702,
"mean_token_accuracy": 0.1719392091035843,
"num_tokens": 31445179.0,
"step": 17050
},
{
"entropy": 5.614700126647949,
"epoch": 1.4328502415458937,
"grad_norm": 1.9609375,
"learning_rate": 0.0004797657022417408,
"loss": 5.3486,
"mean_token_accuracy": 0.17392427623271942,
"num_tokens": 31454434.0,
"step": 17055
},
{
"entropy": 5.602687883377075,
"epoch": 1.4332703213610585,
"grad_norm": 1.5234375,
"learning_rate": 0.00047975328892983045,
"loss": 5.3577,
"mean_token_accuracy": 0.17273057103157044,
"num_tokens": 31464202.0,
"step": 17060
},
{
"entropy": 5.4981677532196045,
"epoch": 1.4336904011762235,
"grad_norm": 1.1875,
"learning_rate": 0.00047974087199082095,
"loss": 5.2295,
"mean_token_accuracy": 0.18076304644346236,
"num_tokens": 31473158.0,
"step": 17065
},
{
"entropy": 5.532413482666016,
"epoch": 1.4341104809913885,
"grad_norm": 1.5078125,
"learning_rate": 0.00047972845142493244,
"loss": 5.2834,
"mean_token_accuracy": 0.1661238968372345,
"num_tokens": 31482643.0,
"step": 17070
},
{
"entropy": 5.555637979507447,
"epoch": 1.4345305608065533,
"grad_norm": 1.5,
"learning_rate": 0.0004797160272323848,
"loss": 5.3333,
"mean_token_accuracy": 0.1765944853425026,
"num_tokens": 31492080.0,
"step": 17075
},
{
"entropy": 5.570637989044189,
"epoch": 1.434950640621718,
"grad_norm": 1.4296875,
"learning_rate": 0.00047970359941339815,
"loss": 5.2965,
"mean_token_accuracy": 0.17814753949642181,
"num_tokens": 31501990.0,
"step": 17080
},
{
"entropy": 5.5639323711395265,
"epoch": 1.435370720436883,
"grad_norm": 1.453125,
"learning_rate": 0.0004796911679681926,
"loss": 5.3597,
"mean_token_accuracy": 0.1694718822836876,
"num_tokens": 31510548.0,
"step": 17085
},
{
"entropy": 5.560849666595459,
"epoch": 1.4357908002520479,
"grad_norm": 1.8671875,
"learning_rate": 0.00047967873289698847,
"loss": 5.3226,
"mean_token_accuracy": 0.17743553966283798,
"num_tokens": 31518695.0,
"step": 17090
},
{
"entropy": 5.68918776512146,
"epoch": 1.4362108800672129,
"grad_norm": 1.3984375,
"learning_rate": 0.00047966629420000595,
"loss": 5.449,
"mean_token_accuracy": 0.16994978934526445,
"num_tokens": 31528021.0,
"step": 17095
},
{
"entropy": 5.6427077293396,
"epoch": 1.4366309598823777,
"grad_norm": 1.4375,
"learning_rate": 0.0004796538518774654,
"loss": 5.4214,
"mean_token_accuracy": 0.16749105975031853,
"num_tokens": 31537786.0,
"step": 17100
},
{
"entropy": 5.558966732025146,
"epoch": 1.4370510396975424,
"grad_norm": 1.4609375,
"learning_rate": 0.00047964140592958725,
"loss": 5.3613,
"mean_token_accuracy": 0.17375758588314055,
"num_tokens": 31548006.0,
"step": 17105
},
{
"entropy": 5.585396099090576,
"epoch": 1.4374711195127075,
"grad_norm": 1.4609375,
"learning_rate": 0.000479628956356592,
"loss": 5.3221,
"mean_token_accuracy": 0.17522389590740203,
"num_tokens": 31557042.0,
"step": 17110
},
{
"entropy": 5.622907304763794,
"epoch": 1.4378911993278722,
"grad_norm": 1.3671875,
"learning_rate": 0.0004796165031587001,
"loss": 5.3528,
"mean_token_accuracy": 0.17105019092559814,
"num_tokens": 31566661.0,
"step": 17115
},
{
"entropy": 5.579087591171264,
"epoch": 1.4383112791430372,
"grad_norm": 1.578125,
"learning_rate": 0.0004796040463361323,
"loss": 5.3196,
"mean_token_accuracy": 0.18242323398590088,
"num_tokens": 31575724.0,
"step": 17120
},
{
"entropy": 5.562480306625366,
"epoch": 1.438731358958202,
"grad_norm": 1.296875,
"learning_rate": 0.0004795915858891091,
"loss": 5.4078,
"mean_token_accuracy": 0.1762837290763855,
"num_tokens": 31585068.0,
"step": 17125
},
{
"entropy": 5.649813795089722,
"epoch": 1.4391514387733668,
"grad_norm": 1.421875,
"learning_rate": 0.0004795791218178514,
"loss": 5.4194,
"mean_token_accuracy": 0.1680687412619591,
"num_tokens": 31594629.0,
"step": 17130
},
{
"entropy": 5.570974636077881,
"epoch": 1.4395715185885318,
"grad_norm": 1.28125,
"learning_rate": 0.00047956665412257984,
"loss": 5.3064,
"mean_token_accuracy": 0.1763842895627022,
"num_tokens": 31603469.0,
"step": 17135
},
{
"entropy": 5.522866535186767,
"epoch": 1.4399915984036968,
"grad_norm": 1.5,
"learning_rate": 0.00047955418280351526,
"loss": 5.2527,
"mean_token_accuracy": 0.18022239059209824,
"num_tokens": 31611674.0,
"step": 17140
},
{
"entropy": 5.656683111190796,
"epoch": 1.4404116782188616,
"grad_norm": 1.4296875,
"learning_rate": 0.0004795417078608788,
"loss": 5.51,
"mean_token_accuracy": 0.16124989837408066,
"num_tokens": 31621863.0,
"step": 17145
},
{
"entropy": 5.654839992523193,
"epoch": 1.4408317580340264,
"grad_norm": 2.125,
"learning_rate": 0.00047952922929489126,
"loss": 5.3603,
"mean_token_accuracy": 0.1715451255440712,
"num_tokens": 31630968.0,
"step": 17150
},
{
"entropy": 5.597797584533692,
"epoch": 1.4412518378491914,
"grad_norm": 1.515625,
"learning_rate": 0.00047951674710577366,
"loss": 5.3719,
"mean_token_accuracy": 0.16875551342964173,
"num_tokens": 31640643.0,
"step": 17155
},
{
"entropy": 5.479277181625366,
"epoch": 1.4416719176643562,
"grad_norm": 1.328125,
"learning_rate": 0.00047950426129374723,
"loss": 5.2677,
"mean_token_accuracy": 0.17747672945261,
"num_tokens": 31648941.0,
"step": 17160
},
{
"entropy": 5.566006469726562,
"epoch": 1.4420919974795212,
"grad_norm": 1.578125,
"learning_rate": 0.00047949177185903314,
"loss": 5.3679,
"mean_token_accuracy": 0.17760609239339828,
"num_tokens": 31658019.0,
"step": 17165
},
{
"entropy": 5.624629878997803,
"epoch": 1.442512077294686,
"grad_norm": 1.546875,
"learning_rate": 0.0004794792788018526,
"loss": 5.4006,
"mean_token_accuracy": 0.16569752991199493,
"num_tokens": 31668050.0,
"step": 17170
},
{
"entropy": 5.5617005825042725,
"epoch": 1.4429321571098508,
"grad_norm": 1.46875,
"learning_rate": 0.000479466782122427,
"loss": 5.2539,
"mean_token_accuracy": 0.1734926789999008,
"num_tokens": 31676727.0,
"step": 17175
},
{
"entropy": 5.585551071166992,
"epoch": 1.4433522369250158,
"grad_norm": 1.515625,
"learning_rate": 0.00047945428182097756,
"loss": 5.3677,
"mean_token_accuracy": 0.1650926575064659,
"num_tokens": 31686205.0,
"step": 17180
},
{
"entropy": 5.59891562461853,
"epoch": 1.4437723167401806,
"grad_norm": 1.5859375,
"learning_rate": 0.00047944177789772583,
"loss": 5.3875,
"mean_token_accuracy": 0.17198501527309418,
"num_tokens": 31695521.0,
"step": 17185
},
{
"entropy": 5.716433334350586,
"epoch": 1.4441923965553456,
"grad_norm": 1.5859375,
"learning_rate": 0.0004794292703528932,
"loss": 5.4088,
"mean_token_accuracy": 0.16242260485887527,
"num_tokens": 31706606.0,
"step": 17190
},
{
"entropy": 5.666668224334717,
"epoch": 1.4446124763705104,
"grad_norm": 1.453125,
"learning_rate": 0.00047941675918670133,
"loss": 5.5028,
"mean_token_accuracy": 0.16124990060925484,
"num_tokens": 31716881.0,
"step": 17195
},
{
"entropy": 5.580557203292846,
"epoch": 1.4450325561856752,
"grad_norm": 1.3359375,
"learning_rate": 0.0004794042443993719,
"loss": 5.2901,
"mean_token_accuracy": 0.17272173017263412,
"num_tokens": 31725878.0,
"step": 17200
},
{
"entropy": 5.565096426010132,
"epoch": 1.4454526360008402,
"grad_norm": 1.96875,
"learning_rate": 0.0004793917259911265,
"loss": 5.363,
"mean_token_accuracy": 0.16763572096824647,
"num_tokens": 31735033.0,
"step": 17205
},
{
"entropy": 5.499946022033692,
"epoch": 1.445872715816005,
"grad_norm": 1.59375,
"learning_rate": 0.0004793792039621869,
"loss": 5.3361,
"mean_token_accuracy": 0.17203260213136673,
"num_tokens": 31744887.0,
"step": 17210
},
{
"entropy": 5.646187257766724,
"epoch": 1.44629279563117,
"grad_norm": 1.4453125,
"learning_rate": 0.00047936667831277504,
"loss": 5.386,
"mean_token_accuracy": 0.16433528065681458,
"num_tokens": 31754137.0,
"step": 17215
},
{
"entropy": 5.587683486938476,
"epoch": 1.4467128754463348,
"grad_norm": 1.5859375,
"learning_rate": 0.0004793541490431126,
"loss": 5.2131,
"mean_token_accuracy": 0.18168816417455674,
"num_tokens": 31763394.0,
"step": 17220
},
{
"entropy": 5.573675155639648,
"epoch": 1.4471329552614998,
"grad_norm": 1.3046875,
"learning_rate": 0.0004793416161534216,
"loss": 5.3375,
"mean_token_accuracy": 0.17382805198431014,
"num_tokens": 31771905.0,
"step": 17225
},
{
"entropy": 5.453760242462158,
"epoch": 1.4475530350766646,
"grad_norm": 1.4453125,
"learning_rate": 0.00047932907964392423,
"loss": 5.1726,
"mean_token_accuracy": 0.18354609608650208,
"num_tokens": 31780788.0,
"step": 17230
},
{
"entropy": 5.613137531280517,
"epoch": 1.4479731148918296,
"grad_norm": 1.515625,
"learning_rate": 0.00047931653951484234,
"loss": 5.3567,
"mean_token_accuracy": 0.16674324870109558,
"num_tokens": 31790198.0,
"step": 17235
},
{
"entropy": 5.627959203720093,
"epoch": 1.4483931947069943,
"grad_norm": 1.390625,
"learning_rate": 0.00047930399576639815,
"loss": 5.3447,
"mean_token_accuracy": 0.17671910673379898,
"num_tokens": 31799396.0,
"step": 17240
},
{
"entropy": 5.510777425765991,
"epoch": 1.4488132745221591,
"grad_norm": 2.265625,
"learning_rate": 0.00047929144839881386,
"loss": 5.2101,
"mean_token_accuracy": 0.18441505879163742,
"num_tokens": 31807680.0,
"step": 17245
},
{
"entropy": 5.601726055145264,
"epoch": 1.4492333543373241,
"grad_norm": 1.40625,
"learning_rate": 0.00047927889741231186,
"loss": 5.3439,
"mean_token_accuracy": 0.16829505264759065,
"num_tokens": 31817406.0,
"step": 17250
},
{
"entropy": 5.58220157623291,
"epoch": 1.449653434152489,
"grad_norm": 1.34375,
"learning_rate": 0.00047926634280711435,
"loss": 5.3231,
"mean_token_accuracy": 0.17198817133903505,
"num_tokens": 31826518.0,
"step": 17255
},
{
"entropy": 5.5924890518188475,
"epoch": 1.450073513967654,
"grad_norm": 1.296875,
"learning_rate": 0.0004792537845834437,
"loss": 5.4021,
"mean_token_accuracy": 0.17086707800626755,
"num_tokens": 31835538.0,
"step": 17260
},
{
"entropy": 5.567937278747559,
"epoch": 1.4504935937828187,
"grad_norm": 1.421875,
"learning_rate": 0.0004792412227415224,
"loss": 5.2608,
"mean_token_accuracy": 0.17742300629615784,
"num_tokens": 31844899.0,
"step": 17265
},
{
"entropy": 5.56048469543457,
"epoch": 1.4509136735979835,
"grad_norm": 2.625,
"learning_rate": 0.00047922865728157314,
"loss": 5.3269,
"mean_token_accuracy": 0.1774661958217621,
"num_tokens": 31854322.0,
"step": 17270
},
{
"entropy": 5.517394256591797,
"epoch": 1.4513337534131485,
"grad_norm": 1.484375,
"learning_rate": 0.0004792160882038183,
"loss": 5.2875,
"mean_token_accuracy": 0.17161102145910262,
"num_tokens": 31863657.0,
"step": 17275
},
{
"entropy": 5.570685482025146,
"epoch": 1.4517538332283133,
"grad_norm": 1.4375,
"learning_rate": 0.0004792035155084806,
"loss": 5.2727,
"mean_token_accuracy": 0.1753618985414505,
"num_tokens": 31873468.0,
"step": 17280
},
{
"entropy": 5.583717775344849,
"epoch": 1.4521739130434783,
"grad_norm": 1.7734375,
"learning_rate": 0.00047919093919578283,
"loss": 5.3989,
"mean_token_accuracy": 0.17585224360227586,
"num_tokens": 31882391.0,
"step": 17285
},
{
"entropy": 5.578426027297974,
"epoch": 1.452593992858643,
"grad_norm": 1.3671875,
"learning_rate": 0.0004791783592659476,
"loss": 5.3812,
"mean_token_accuracy": 0.1719109058380127,
"num_tokens": 31891370.0,
"step": 17290
},
{
"entropy": 5.5499907493591305,
"epoch": 1.4530140726738079,
"grad_norm": 1.2890625,
"learning_rate": 0.000479165775719198,
"loss": 5.3184,
"mean_token_accuracy": 0.17406646758317948,
"num_tokens": 31900688.0,
"step": 17295
},
{
"entropy": 5.578524923324585,
"epoch": 1.453434152488973,
"grad_norm": 1.9375,
"learning_rate": 0.00047915318855575674,
"loss": 5.3385,
"mean_token_accuracy": 0.1787629634141922,
"num_tokens": 31909359.0,
"step": 17300
},
{
"entropy": 5.532979679107666,
"epoch": 1.453854232304138,
"grad_norm": 1.3046875,
"learning_rate": 0.00047914059777584686,
"loss": 5.2988,
"mean_token_accuracy": 0.1753024697303772,
"num_tokens": 31918529.0,
"step": 17305
},
{
"entropy": 5.511482238769531,
"epoch": 1.4542743121193027,
"grad_norm": 1.46875,
"learning_rate": 0.00047912800337969144,
"loss": 5.3878,
"mean_token_accuracy": 0.1672118455171585,
"num_tokens": 31928310.0,
"step": 17310
},
{
"entropy": 5.572142028808594,
"epoch": 1.4546943919344675,
"grad_norm": 1.3203125,
"learning_rate": 0.00047911540536751355,
"loss": 5.2876,
"mean_token_accuracy": 0.17637560814619063,
"num_tokens": 31937077.0,
"step": 17315
},
{
"entropy": 5.632218599319458,
"epoch": 1.4551144717496325,
"grad_norm": 1.78125,
"learning_rate": 0.0004791028037395363,
"loss": 5.3557,
"mean_token_accuracy": 0.16721779406070708,
"num_tokens": 31946023.0,
"step": 17320
},
{
"entropy": 5.482977199554443,
"epoch": 1.4555345515647973,
"grad_norm": 1.34375,
"learning_rate": 0.00047909019849598305,
"loss": 5.2105,
"mean_token_accuracy": 0.1816324010491371,
"num_tokens": 31954741.0,
"step": 17325
},
{
"entropy": 5.571130895614624,
"epoch": 1.4559546313799623,
"grad_norm": 1.5546875,
"learning_rate": 0.00047907758963707696,
"loss": 5.3044,
"mean_token_accuracy": 0.17254888415336608,
"num_tokens": 31963516.0,
"step": 17330
},
{
"entropy": 5.609737253189087,
"epoch": 1.456374711195127,
"grad_norm": 1.421875,
"learning_rate": 0.00047906497716304153,
"loss": 5.3418,
"mean_token_accuracy": 0.17257929295301438,
"num_tokens": 31971917.0,
"step": 17335
},
{
"entropy": 5.606870508193969,
"epoch": 1.4567947910102919,
"grad_norm": 1.5546875,
"learning_rate": 0.0004790523610741001,
"loss": 5.3889,
"mean_token_accuracy": 0.16833689957857131,
"num_tokens": 31980718.0,
"step": 17340
},
{
"entropy": 5.654848718643189,
"epoch": 1.4572148708254569,
"grad_norm": 1.359375,
"learning_rate": 0.00047903974137047614,
"loss": 5.3165,
"mean_token_accuracy": 0.17566545754671098,
"num_tokens": 31988664.0,
"step": 17345
},
{
"entropy": 5.6314348697662355,
"epoch": 1.4576349506406217,
"grad_norm": 1.3125,
"learning_rate": 0.00047902711805239325,
"loss": 5.3832,
"mean_token_accuracy": 0.17269489765167237,
"num_tokens": 31998415.0,
"step": 17350
},
{
"entropy": 5.66870903968811,
"epoch": 1.4580550304557867,
"grad_norm": 1.3046875,
"learning_rate": 0.00047901449112007494,
"loss": 5.4086,
"mean_token_accuracy": 0.17308784127235413,
"num_tokens": 32007915.0,
"step": 17355
},
{
"entropy": 5.559226942062378,
"epoch": 1.4584751102709514,
"grad_norm": 1.3359375,
"learning_rate": 0.00047900186057374514,
"loss": 5.3298,
"mean_token_accuracy": 0.171367247402668,
"num_tokens": 32016582.0,
"step": 17360
},
{
"entropy": 5.540211009979248,
"epoch": 1.4588951900861162,
"grad_norm": 1.5,
"learning_rate": 0.00047898922641362724,
"loss": 5.3263,
"mean_token_accuracy": 0.17277695536613463,
"num_tokens": 32026008.0,
"step": 17365
},
{
"entropy": 5.618186712265015,
"epoch": 1.4593152699012812,
"grad_norm": 1.375,
"learning_rate": 0.0004789765886399453,
"loss": 5.371,
"mean_token_accuracy": 0.17320572584867477,
"num_tokens": 32034554.0,
"step": 17370
},
{
"entropy": 5.606414651870727,
"epoch": 1.4597353497164463,
"grad_norm": 1.453125,
"learning_rate": 0.00047896394725292313,
"loss": 5.3601,
"mean_token_accuracy": 0.18139052242040635,
"num_tokens": 32044003.0,
"step": 17375
},
{
"entropy": 5.559694385528564,
"epoch": 1.460155429531611,
"grad_norm": 1.453125,
"learning_rate": 0.00047895130225278473,
"loss": 5.3459,
"mean_token_accuracy": 0.16826673597097397,
"num_tokens": 32053753.0,
"step": 17380
},
{
"entropy": 5.615738868713379,
"epoch": 1.4605755093467758,
"grad_norm": 1.2421875,
"learning_rate": 0.0004789386536397539,
"loss": 5.3494,
"mean_token_accuracy": 0.17024147063493728,
"num_tokens": 32062459.0,
"step": 17385
},
{
"entropy": 5.673247003555298,
"epoch": 1.4609955891619408,
"grad_norm": 1.453125,
"learning_rate": 0.0004789260014140549,
"loss": 5.4389,
"mean_token_accuracy": 0.1728850543498993,
"num_tokens": 32072544.0,
"step": 17390
},
{
"entropy": 5.612435674667358,
"epoch": 1.4614156689771056,
"grad_norm": 1.390625,
"learning_rate": 0.00047891334557591177,
"loss": 5.368,
"mean_token_accuracy": 0.16520496159791948,
"num_tokens": 32082015.0,
"step": 17395
},
{
"entropy": 5.561788177490234,
"epoch": 1.4618357487922706,
"grad_norm": 1.6328125,
"learning_rate": 0.0004789006861255488,
"loss": 5.296,
"mean_token_accuracy": 0.17139685451984404,
"num_tokens": 32091622.0,
"step": 17400
},
{
"entropy": 5.674663257598877,
"epoch": 1.4622558286074354,
"grad_norm": 1.6328125,
"learning_rate": 0.0004788880230631901,
"loss": 5.4544,
"mean_token_accuracy": 0.16580108255147935,
"num_tokens": 32102716.0,
"step": 17405
},
{
"entropy": 5.581311655044556,
"epoch": 1.4626759084226002,
"grad_norm": 1.3515625,
"learning_rate": 0.00047887535638906005,
"loss": 5.2253,
"mean_token_accuracy": 0.1814083620905876,
"num_tokens": 32111051.0,
"step": 17410
},
{
"entropy": 5.46392936706543,
"epoch": 1.4630959882377652,
"grad_norm": 1.3515625,
"learning_rate": 0.000478862686103383,
"loss": 5.2368,
"mean_token_accuracy": 0.18095454573631287,
"num_tokens": 32119781.0,
"step": 17415
},
{
"entropy": 5.646488809585572,
"epoch": 1.46351606805293,
"grad_norm": 1.3359375,
"learning_rate": 0.00047885001220638354,
"loss": 5.3475,
"mean_token_accuracy": 0.17790547609329224,
"num_tokens": 32128849.0,
"step": 17420
},
{
"entropy": 5.655535888671875,
"epoch": 1.463936147868095,
"grad_norm": 1.3671875,
"learning_rate": 0.00047883733469828604,
"loss": 5.3882,
"mean_token_accuracy": 0.17596450448036194,
"num_tokens": 32138046.0,
"step": 17425
},
{
"entropy": 5.676393985748291,
"epoch": 1.4643562276832598,
"grad_norm": 1.2265625,
"learning_rate": 0.00047882465357931516,
"loss": 5.4357,
"mean_token_accuracy": 0.1657523274421692,
"num_tokens": 32147994.0,
"step": 17430
},
{
"entropy": 5.6804546356201175,
"epoch": 1.4647763074984246,
"grad_norm": 1.53125,
"learning_rate": 0.0004788119688496954,
"loss": 5.3865,
"mean_token_accuracy": 0.16991026997566222,
"num_tokens": 32156835.0,
"step": 17435
},
{
"entropy": 5.625530672073364,
"epoch": 1.4651963873135896,
"grad_norm": 1.4375,
"learning_rate": 0.0004787992805096516,
"loss": 5.3039,
"mean_token_accuracy": 0.1783510684967041,
"num_tokens": 32166751.0,
"step": 17440
},
{
"entropy": 5.559576320648193,
"epoch": 1.4656164671287546,
"grad_norm": 1.46875,
"learning_rate": 0.00047878658855940855,
"loss": 5.4232,
"mean_token_accuracy": 0.16614902019500732,
"num_tokens": 32175705.0,
"step": 17445
},
{
"entropy": 5.6670890808105465,
"epoch": 1.4660365469439194,
"grad_norm": 1.21875,
"learning_rate": 0.0004787738929991909,
"loss": 5.4799,
"mean_token_accuracy": 0.16261570304632186,
"num_tokens": 32185404.0,
"step": 17450
},
{
"entropy": 5.608896732330322,
"epoch": 1.4664566267590842,
"grad_norm": 1.4609375,
"learning_rate": 0.00047876119382922374,
"loss": 5.3277,
"mean_token_accuracy": 0.1755466192960739,
"num_tokens": 32194054.0,
"step": 17455
},
{
"entropy": 5.5917315006256105,
"epoch": 1.4668767065742492,
"grad_norm": 1.3359375,
"learning_rate": 0.00047874849104973194,
"loss": 5.4179,
"mean_token_accuracy": 0.16044087558984757,
"num_tokens": 32204080.0,
"step": 17460
},
{
"entropy": 5.590706062316895,
"epoch": 1.467296786389414,
"grad_norm": 1.3203125,
"learning_rate": 0.00047873578466094054,
"loss": 5.3132,
"mean_token_accuracy": 0.16462807953357697,
"num_tokens": 32213279.0,
"step": 17465
},
{
"entropy": 5.594799661636353,
"epoch": 1.467716866204579,
"grad_norm": 1.640625,
"learning_rate": 0.0004787230746630746,
"loss": 5.3271,
"mean_token_accuracy": 0.17292114496231079,
"num_tokens": 32221668.0,
"step": 17470
},
{
"entropy": 5.588113641738891,
"epoch": 1.4681369460197438,
"grad_norm": 1.53125,
"learning_rate": 0.0004787103610563593,
"loss": 5.2533,
"mean_token_accuracy": 0.1766665682196617,
"num_tokens": 32229683.0,
"step": 17475
},
{
"entropy": 5.556916284561157,
"epoch": 1.4685570258349085,
"grad_norm": 1.265625,
"learning_rate": 0.00047869764384101993,
"loss": 5.322,
"mean_token_accuracy": 0.17673111110925674,
"num_tokens": 32238948.0,
"step": 17480
},
{
"entropy": 5.615086889266967,
"epoch": 1.4689771056500736,
"grad_norm": 1.375,
"learning_rate": 0.00047868492301728164,
"loss": 5.3675,
"mean_token_accuracy": 0.16278913021087646,
"num_tokens": 32248079.0,
"step": 17485
},
{
"entropy": 5.569729900360107,
"epoch": 1.4693971854652383,
"grad_norm": 1.328125,
"learning_rate": 0.00047867219858536975,
"loss": 5.1875,
"mean_token_accuracy": 0.1838084951043129,
"num_tokens": 32256413.0,
"step": 17490
},
{
"entropy": 5.5846789360046385,
"epoch": 1.4698172652804034,
"grad_norm": 1.2734375,
"learning_rate": 0.0004786594705455098,
"loss": 5.3498,
"mean_token_accuracy": 0.17128240764141084,
"num_tokens": 32265954.0,
"step": 17495
},
{
"entropy": 5.512920093536377,
"epoch": 1.4702373450955681,
"grad_norm": 1.3828125,
"learning_rate": 0.0004786467388979272,
"loss": 5.2443,
"mean_token_accuracy": 0.1766051933169365,
"num_tokens": 32273817.0,
"step": 17500
},
{
"entropy": 5.508085632324219,
"epoch": 1.470657424910733,
"grad_norm": 1.2890625,
"learning_rate": 0.00047863400364284744,
"loss": 5.3126,
"mean_token_accuracy": 0.1748736783862114,
"num_tokens": 32283025.0,
"step": 17505
},
{
"entropy": 5.591394853591919,
"epoch": 1.471077504725898,
"grad_norm": 1.46875,
"learning_rate": 0.00047862126478049623,
"loss": 5.3064,
"mean_token_accuracy": 0.17286635637283326,
"num_tokens": 32292321.0,
"step": 17510
},
{
"entropy": 5.680115556716919,
"epoch": 1.4714975845410627,
"grad_norm": 1.4453125,
"learning_rate": 0.00047860852231109915,
"loss": 5.4133,
"mean_token_accuracy": 0.15822359770536423,
"num_tokens": 32302203.0,
"step": 17515
},
{
"entropy": 5.502277565002442,
"epoch": 1.4719176643562277,
"grad_norm": 1.3671875,
"learning_rate": 0.0004785957762348819,
"loss": 5.228,
"mean_token_accuracy": 0.17663549780845642,
"num_tokens": 32310893.0,
"step": 17520
},
{
"entropy": 5.516014051437378,
"epoch": 1.4723377441713925,
"grad_norm": 1.40625,
"learning_rate": 0.0004785830265520703,
"loss": 5.3045,
"mean_token_accuracy": 0.1776268407702446,
"num_tokens": 32320320.0,
"step": 17525
},
{
"entropy": 5.504133701324463,
"epoch": 1.4727578239865575,
"grad_norm": 1.2890625,
"learning_rate": 0.00047857027326289023,
"loss": 5.2009,
"mean_token_accuracy": 0.18166478127241134,
"num_tokens": 32329196.0,
"step": 17530
},
{
"entropy": 5.569661331176758,
"epoch": 1.4731779038017223,
"grad_norm": 1.4140625,
"learning_rate": 0.00047855751636756763,
"loss": 5.3374,
"mean_token_accuracy": 0.168901327252388,
"num_tokens": 32338529.0,
"step": 17535
},
{
"entropy": 5.5948240756988525,
"epoch": 1.4735979836168873,
"grad_norm": 1.421875,
"learning_rate": 0.0004785447558663284,
"loss": 5.3305,
"mean_token_accuracy": 0.1811675399541855,
"num_tokens": 32347114.0,
"step": 17540
},
{
"entropy": 5.6855837345123295,
"epoch": 1.474018063432052,
"grad_norm": 1.40625,
"learning_rate": 0.00047853199175939865,
"loss": 5.4984,
"mean_token_accuracy": 0.16563470512628556,
"num_tokens": 32356765.0,
"step": 17545
},
{
"entropy": 5.7249833106994625,
"epoch": 1.474438143247217,
"grad_norm": 1.3515625,
"learning_rate": 0.0004785192240470045,
"loss": 5.4522,
"mean_token_accuracy": 0.16961250603199005,
"num_tokens": 32366175.0,
"step": 17550
},
{
"entropy": 5.584809017181397,
"epoch": 1.474858223062382,
"grad_norm": 1.3125,
"learning_rate": 0.000478506452729372,
"loss": 5.2327,
"mean_token_accuracy": 0.17507179230451583,
"num_tokens": 32375063.0,
"step": 17555
},
{
"entropy": 5.542085790634156,
"epoch": 1.4752783028775467,
"grad_norm": 1.390625,
"learning_rate": 0.00047849367780672755,
"loss": 5.3106,
"mean_token_accuracy": 0.1758947491645813,
"num_tokens": 32384596.0,
"step": 17560
},
{
"entropy": 5.541361141204834,
"epoch": 1.4756983826927117,
"grad_norm": 1.390625,
"learning_rate": 0.0004784808992792974,
"loss": 5.2908,
"mean_token_accuracy": 0.1738338887691498,
"num_tokens": 32393489.0,
"step": 17565
},
{
"entropy": 5.610920715332031,
"epoch": 1.4761184625078765,
"grad_norm": 1.5859375,
"learning_rate": 0.0004784681171473079,
"loss": 5.2706,
"mean_token_accuracy": 0.17383526414632797,
"num_tokens": 32402192.0,
"step": 17570
},
{
"entropy": 5.621479034423828,
"epoch": 1.4765385423230413,
"grad_norm": 1.4453125,
"learning_rate": 0.00047845533141098543,
"loss": 5.3575,
"mean_token_accuracy": 0.17180275470018386,
"num_tokens": 32411317.0,
"step": 17575
},
{
"entropy": 5.649381971359253,
"epoch": 1.4769586221382063,
"grad_norm": 1.296875,
"learning_rate": 0.0004784425420705565,
"loss": 5.4123,
"mean_token_accuracy": 0.16808382868766786,
"num_tokens": 32420308.0,
"step": 17580
},
{
"entropy": 5.546915102005005,
"epoch": 1.477378701953371,
"grad_norm": 1.2421875,
"learning_rate": 0.0004784297491262477,
"loss": 5.3316,
"mean_token_accuracy": 0.17753085345029831,
"num_tokens": 32429532.0,
"step": 17585
},
{
"entropy": 5.606923818588257,
"epoch": 1.477798781768536,
"grad_norm": 1.3984375,
"learning_rate": 0.0004784169525782858,
"loss": 5.3244,
"mean_token_accuracy": 0.17007494419813157,
"num_tokens": 32439382.0,
"step": 17590
},
{
"entropy": 5.62837553024292,
"epoch": 1.4782188615837009,
"grad_norm": 1.421875,
"learning_rate": 0.0004784041524268971,
"loss": 5.3184,
"mean_token_accuracy": 0.17702646404504777,
"num_tokens": 32447893.0,
"step": 17595
},
{
"entropy": 5.575413751602173,
"epoch": 1.4786389413988656,
"grad_norm": 1.5,
"learning_rate": 0.00047839134867230874,
"loss": 5.3106,
"mean_token_accuracy": 0.1776092231273651,
"num_tokens": 32457770.0,
"step": 17600
},
{
"entropy": 5.623736572265625,
"epoch": 1.4790590212140307,
"grad_norm": 1.2734375,
"learning_rate": 0.00047837854131474726,
"loss": 5.4288,
"mean_token_accuracy": 0.16864898055791855,
"num_tokens": 32467247.0,
"step": 17605
},
{
"entropy": 5.693157386779785,
"epoch": 1.4794791010291957,
"grad_norm": 1.234375,
"learning_rate": 0.00047836573035443976,
"loss": 5.395,
"mean_token_accuracy": 0.17299325466156007,
"num_tokens": 32477453.0,
"step": 17610
},
{
"entropy": 5.666988325119019,
"epoch": 1.4798991808443605,
"grad_norm": 1.3984375,
"learning_rate": 0.00047835291579161293,
"loss": 5.3599,
"mean_token_accuracy": 0.17759935855865477,
"num_tokens": 32486278.0,
"step": 17615
},
{
"entropy": 5.5154444694519045,
"epoch": 1.4803192606595252,
"grad_norm": 1.328125,
"learning_rate": 0.0004783400976264941,
"loss": 5.2828,
"mean_token_accuracy": 0.17707307487726212,
"num_tokens": 32495523.0,
"step": 17620
},
{
"entropy": 5.610222578048706,
"epoch": 1.4807393404746902,
"grad_norm": 1.4765625,
"learning_rate": 0.00047832727585930997,
"loss": 5.3326,
"mean_token_accuracy": 0.1761850118637085,
"num_tokens": 32504952.0,
"step": 17625
},
{
"entropy": 5.6164140701293945,
"epoch": 1.481159420289855,
"grad_norm": 1.3203125,
"learning_rate": 0.0004783144504902879,
"loss": 5.2995,
"mean_token_accuracy": 0.17257804721593856,
"num_tokens": 32515620.0,
"step": 17630
},
{
"entropy": 5.519783544540405,
"epoch": 1.48157950010502,
"grad_norm": 1.3671875,
"learning_rate": 0.000478301621519655,
"loss": 5.2699,
"mean_token_accuracy": 0.17979115545749663,
"num_tokens": 32524549.0,
"step": 17635
},
{
"entropy": 5.592423009872436,
"epoch": 1.4819995799201848,
"grad_norm": 1.3828125,
"learning_rate": 0.0004782887889476386,
"loss": 5.1671,
"mean_token_accuracy": 0.18689583390951156,
"num_tokens": 32533043.0,
"step": 17640
},
{
"entropy": 5.556523084640503,
"epoch": 1.4824196597353496,
"grad_norm": 1.375,
"learning_rate": 0.000478275952774466,
"loss": 5.2839,
"mean_token_accuracy": 0.17498526573181153,
"num_tokens": 32541679.0,
"step": 17645
},
{
"entropy": 5.572411346435547,
"epoch": 1.4828397395505146,
"grad_norm": 1.5390625,
"learning_rate": 0.0004782631130003646,
"loss": 5.4106,
"mean_token_accuracy": 0.17719341367483138,
"num_tokens": 32550922.0,
"step": 17650
},
{
"entropy": 5.638541507720947,
"epoch": 1.4832598193656794,
"grad_norm": 1.3359375,
"learning_rate": 0.0004782502696255617,
"loss": 5.4062,
"mean_token_accuracy": 0.16908139437437059,
"num_tokens": 32560063.0,
"step": 17655
},
{
"entropy": 5.568115520477295,
"epoch": 1.4836798991808444,
"grad_norm": 1.5,
"learning_rate": 0.00047823742265028495,
"loss": 5.2762,
"mean_token_accuracy": 0.17622478306293488,
"num_tokens": 32569476.0,
"step": 17660
},
{
"entropy": 5.589584493637085,
"epoch": 1.4840999789960092,
"grad_norm": 1.40625,
"learning_rate": 0.000478224572074762,
"loss": 5.3391,
"mean_token_accuracy": 0.182720348238945,
"num_tokens": 32578552.0,
"step": 17665
},
{
"entropy": 5.573598194122314,
"epoch": 1.484520058811174,
"grad_norm": 1.34375,
"learning_rate": 0.0004782117178992203,
"loss": 5.3287,
"mean_token_accuracy": 0.1749765008687973,
"num_tokens": 32589074.0,
"step": 17670
},
{
"entropy": 5.560750675201416,
"epoch": 1.484940138626339,
"grad_norm": 1.3515625,
"learning_rate": 0.0004781988601238878,
"loss": 5.3624,
"mean_token_accuracy": 0.17228161841630935,
"num_tokens": 32599288.0,
"step": 17675
},
{
"entropy": 5.672045660018921,
"epoch": 1.485360218441504,
"grad_norm": 1.453125,
"learning_rate": 0.000478185998748992,
"loss": 5.4202,
"mean_token_accuracy": 0.16393853574991227,
"num_tokens": 32609430.0,
"step": 17680
},
{
"entropy": 5.550922489166259,
"epoch": 1.4857802982566688,
"grad_norm": 1.3359375,
"learning_rate": 0.00047817313377476083,
"loss": 5.2528,
"mean_token_accuracy": 0.18080978393554686,
"num_tokens": 32617763.0,
"step": 17685
},
{
"entropy": 5.5186584949493405,
"epoch": 1.4862003780718336,
"grad_norm": 1.3125,
"learning_rate": 0.00047816026520142234,
"loss": 5.3408,
"mean_token_accuracy": 0.16939881592988967,
"num_tokens": 32627465.0,
"step": 17690
},
{
"entropy": 5.69835057258606,
"epoch": 1.4866204578869986,
"grad_norm": 1.328125,
"learning_rate": 0.0004781473930292043,
"loss": 5.2585,
"mean_token_accuracy": 0.1850713849067688,
"num_tokens": 32635984.0,
"step": 17695
},
{
"entropy": 5.479670619964599,
"epoch": 1.4870405377021634,
"grad_norm": 1.2578125,
"learning_rate": 0.0004781345172583348,
"loss": 5.1823,
"mean_token_accuracy": 0.17850004732608796,
"num_tokens": 32644346.0,
"step": 17700
},
{
"entropy": 5.511671018600464,
"epoch": 1.4874606175173284,
"grad_norm": 1.4453125,
"learning_rate": 0.00047812163788904196,
"loss": 5.3267,
"mean_token_accuracy": 0.16714714467525482,
"num_tokens": 32654118.0,
"step": 17705
},
{
"entropy": 5.629597139358521,
"epoch": 1.4878806973324932,
"grad_norm": 1.4375,
"learning_rate": 0.00047810875492155386,
"loss": 5.3538,
"mean_token_accuracy": 0.17599407136440276,
"num_tokens": 32664258.0,
"step": 17710
},
{
"entropy": 5.601988077163696,
"epoch": 1.488300777147658,
"grad_norm": 1.34375,
"learning_rate": 0.0004780958683560987,
"loss": 5.3824,
"mean_token_accuracy": 0.1649293139576912,
"num_tokens": 32673672.0,
"step": 17715
},
{
"entropy": 5.629183435440064,
"epoch": 1.488720856962823,
"grad_norm": 1.3671875,
"learning_rate": 0.0004780829781929049,
"loss": 5.362,
"mean_token_accuracy": 0.16451467871665953,
"num_tokens": 32682901.0,
"step": 17720
},
{
"entropy": 5.675383186340332,
"epoch": 1.4891409367779878,
"grad_norm": 1.3828125,
"learning_rate": 0.0004780700844322007,
"loss": 5.3003,
"mean_token_accuracy": 0.17862350344657899,
"num_tokens": 32691384.0,
"step": 17725
},
{
"entropy": 5.5285297393798825,
"epoch": 1.4895610165931528,
"grad_norm": 1.484375,
"learning_rate": 0.00047805718707421446,
"loss": 5.3506,
"mean_token_accuracy": 0.17417652904987335,
"num_tokens": 32700758.0,
"step": 17730
},
{
"entropy": 5.651323699951172,
"epoch": 1.4899810964083176,
"grad_norm": 1.3984375,
"learning_rate": 0.00047804428611917475,
"loss": 5.4435,
"mean_token_accuracy": 0.16965910345315932,
"num_tokens": 32709676.0,
"step": 17735
},
{
"entropy": 5.6955992698669435,
"epoch": 1.4904011762234823,
"grad_norm": 1.3984375,
"learning_rate": 0.00047803138156731,
"loss": 5.3606,
"mean_token_accuracy": 0.17058410942554475,
"num_tokens": 32718102.0,
"step": 17740
},
{
"entropy": 5.664826488494873,
"epoch": 1.4908212560386473,
"grad_norm": 1.3125,
"learning_rate": 0.00047801847341884897,
"loss": 5.3355,
"mean_token_accuracy": 0.17453012019395828,
"num_tokens": 32727356.0,
"step": 17745
},
{
"entropy": 5.516296768188477,
"epoch": 1.4912413358538124,
"grad_norm": 1.609375,
"learning_rate": 0.0004780055616740202,
"loss": 5.328,
"mean_token_accuracy": 0.16924781501293182,
"num_tokens": 32736605.0,
"step": 17750
},
{
"entropy": 5.539705562591553,
"epoch": 1.4916614156689771,
"grad_norm": 1.2890625,
"learning_rate": 0.0004779926463330524,
"loss": 5.2686,
"mean_token_accuracy": 0.17446471750736237,
"num_tokens": 32745573.0,
"step": 17755
},
{
"entropy": 5.56036343574524,
"epoch": 1.492081495484142,
"grad_norm": 1.3359375,
"learning_rate": 0.0004779797273961744,
"loss": 5.3171,
"mean_token_accuracy": 0.17623323798179627,
"num_tokens": 32755695.0,
"step": 17760
},
{
"entropy": 5.55844259262085,
"epoch": 1.492501575299307,
"grad_norm": 1.296875,
"learning_rate": 0.0004779668048636151,
"loss": 5.236,
"mean_token_accuracy": 0.18182460218667984,
"num_tokens": 32763570.0,
"step": 17765
},
{
"entropy": 5.587655019760132,
"epoch": 1.4929216551144717,
"grad_norm": 1.4453125,
"learning_rate": 0.00047795387873560336,
"loss": 5.3514,
"mean_token_accuracy": 0.17356598526239395,
"num_tokens": 32772006.0,
"step": 17770
},
{
"entropy": 5.615496683120727,
"epoch": 1.4933417349296367,
"grad_norm": 1.4375,
"learning_rate": 0.0004779409490123681,
"loss": 5.3013,
"mean_token_accuracy": 0.16848473399877548,
"num_tokens": 32781080.0,
"step": 17775
},
{
"entropy": 5.5203471183776855,
"epoch": 1.4937618147448015,
"grad_norm": 1.4375,
"learning_rate": 0.0004779280156941384,
"loss": 5.2557,
"mean_token_accuracy": 0.17351365685462952,
"num_tokens": 32789880.0,
"step": 17780
},
{
"entropy": 5.60774712562561,
"epoch": 1.4941818945599663,
"grad_norm": 1.546875,
"learning_rate": 0.00047791507878114354,
"loss": 5.3076,
"mean_token_accuracy": 0.17505631297826768,
"num_tokens": 32799222.0,
"step": 17785
},
{
"entropy": 5.54469404220581,
"epoch": 1.4946019743751313,
"grad_norm": 1.3671875,
"learning_rate": 0.0004779021382736124,
"loss": 5.2838,
"mean_token_accuracy": 0.16949357837438583,
"num_tokens": 32808945.0,
"step": 17790
},
{
"entropy": 5.490546083450317,
"epoch": 1.495022054190296,
"grad_norm": 1.2890625,
"learning_rate": 0.0004778891941717745,
"loss": 5.2152,
"mean_token_accuracy": 0.18479133397340775,
"num_tokens": 32818386.0,
"step": 17795
},
{
"entropy": 5.489358425140381,
"epoch": 1.495442134005461,
"grad_norm": 1.2578125,
"learning_rate": 0.0004778762464758589,
"loss": 5.3003,
"mean_token_accuracy": 0.1693611517548561,
"num_tokens": 32828364.0,
"step": 17800
},
{
"entropy": 5.688781499862671,
"epoch": 1.495862213820626,
"grad_norm": 1.234375,
"learning_rate": 0.00047786329518609505,
"loss": 5.4202,
"mean_token_accuracy": 0.16817854940891266,
"num_tokens": 32837399.0,
"step": 17805
},
{
"entropy": 5.5961981296539305,
"epoch": 1.4962822936357907,
"grad_norm": 1.359375,
"learning_rate": 0.00047785034030271243,
"loss": 5.2669,
"mean_token_accuracy": 0.18185012489557267,
"num_tokens": 32846111.0,
"step": 17810
},
{
"entropy": 5.54898853302002,
"epoch": 1.4967023734509557,
"grad_norm": 1.3203125,
"learning_rate": 0.0004778373818259404,
"loss": 5.1613,
"mean_token_accuracy": 0.18888902068138122,
"num_tokens": 32855839.0,
"step": 17815
},
{
"entropy": 5.603731250762939,
"epoch": 1.4971224532661207,
"grad_norm": 1.3984375,
"learning_rate": 0.00047782441975600866,
"loss": 5.4449,
"mean_token_accuracy": 0.16674946397542953,
"num_tokens": 32865946.0,
"step": 17820
},
{
"entropy": 5.635933542251587,
"epoch": 1.4975425330812855,
"grad_norm": 1.3515625,
"learning_rate": 0.0004778114540931468,
"loss": 5.4147,
"mean_token_accuracy": 0.1694378599524498,
"num_tokens": 32875310.0,
"step": 17825
},
{
"entropy": 5.599696683883667,
"epoch": 1.4979626128964503,
"grad_norm": 2.046875,
"learning_rate": 0.00047779848483758445,
"loss": 5.3463,
"mean_token_accuracy": 0.17409632056951524,
"num_tokens": 32885315.0,
"step": 17830
},
{
"entropy": 5.599123477935791,
"epoch": 1.4983826927116153,
"grad_norm": 1.3203125,
"learning_rate": 0.00047778551198955133,
"loss": 5.324,
"mean_token_accuracy": 0.17629729509353637,
"num_tokens": 32894055.0,
"step": 17835
},
{
"entropy": 5.591061735153199,
"epoch": 1.49880277252678,
"grad_norm": 1.3046875,
"learning_rate": 0.0004777725355492773,
"loss": 5.3287,
"mean_token_accuracy": 0.17851689904928209,
"num_tokens": 32903030.0,
"step": 17840
},
{
"entropy": 5.6326555728912355,
"epoch": 1.499222852341945,
"grad_norm": 1.3046875,
"learning_rate": 0.0004777595555169922,
"loss": 5.2707,
"mean_token_accuracy": 0.1805201530456543,
"num_tokens": 32911562.0,
"step": 17845
},
{
"entropy": 5.617887783050537,
"epoch": 1.4996429321571099,
"grad_norm": 1.3671875,
"learning_rate": 0.000477746571892926,
"loss": 5.3723,
"mean_token_accuracy": 0.16956857591867447,
"num_tokens": 32920376.0,
"step": 17850
},
{
"entropy": 5.568303728103638,
"epoch": 1.5000630119722747,
"grad_norm": 1.4296875,
"learning_rate": 0.0004777335846773087,
"loss": 5.3087,
"mean_token_accuracy": 0.17171664237976075,
"num_tokens": 32929374.0,
"step": 17855
},
{
"entropy": 5.4869523525238035,
"epoch": 1.5004830917874397,
"grad_norm": 1.5703125,
"learning_rate": 0.00047772059387037025,
"loss": 5.2627,
"mean_token_accuracy": 0.1698673829436302,
"num_tokens": 32938695.0,
"step": 17860
},
{
"entropy": 5.626132535934448,
"epoch": 1.5009031716026044,
"grad_norm": 1.40625,
"learning_rate": 0.0004777075994723409,
"loss": 5.3369,
"mean_token_accuracy": 0.1788775607943535,
"num_tokens": 32947725.0,
"step": 17865
},
{
"entropy": 5.617758178710938,
"epoch": 1.5013232514177695,
"grad_norm": 1.3984375,
"learning_rate": 0.00047769460148345085,
"loss": 5.3365,
"mean_token_accuracy": 0.1705639123916626,
"num_tokens": 32957017.0,
"step": 17870
},
{
"entropy": 5.570928621292114,
"epoch": 1.5017433312329342,
"grad_norm": 1.28125,
"learning_rate": 0.0004776815999039303,
"loss": 5.3005,
"mean_token_accuracy": 0.173713581264019,
"num_tokens": 32965944.0,
"step": 17875
},
{
"entropy": 5.556409454345703,
"epoch": 1.502163411048099,
"grad_norm": 1.4921875,
"learning_rate": 0.0004776685947340096,
"loss": 5.2923,
"mean_token_accuracy": 0.17578593790531158,
"num_tokens": 32975368.0,
"step": 17880
},
{
"entropy": 5.583327388763427,
"epoch": 1.502583490863264,
"grad_norm": 1.3125,
"learning_rate": 0.0004776555859739191,
"loss": 5.367,
"mean_token_accuracy": 0.17373339235782623,
"num_tokens": 32984603.0,
"step": 17885
},
{
"entropy": 5.589329671859741,
"epoch": 1.503003570678429,
"grad_norm": 1.6953125,
"learning_rate": 0.00047764257362388913,
"loss": 5.3244,
"mean_token_accuracy": 0.1739801973104477,
"num_tokens": 32993621.0,
"step": 17890
},
{
"entropy": 5.530405759811401,
"epoch": 1.5034236504935938,
"grad_norm": 1.3203125,
"learning_rate": 0.0004776295576841504,
"loss": 5.3229,
"mean_token_accuracy": 0.17710662484169007,
"num_tokens": 33002637.0,
"step": 17895
},
{
"entropy": 5.586118364334107,
"epoch": 1.5038437303087586,
"grad_norm": 1.3125,
"learning_rate": 0.00047761653815493337,
"loss": 5.265,
"mean_token_accuracy": 0.18133355528116227,
"num_tokens": 33011964.0,
"step": 17900
},
{
"entropy": 5.611306428909302,
"epoch": 1.5042638101239234,
"grad_norm": 1.421875,
"learning_rate": 0.00047760351503646877,
"loss": 5.3281,
"mean_token_accuracy": 0.17200585156679155,
"num_tokens": 33020626.0,
"step": 17905
},
{
"entropy": 5.597643947601318,
"epoch": 1.5046838899390884,
"grad_norm": 1.6328125,
"learning_rate": 0.0004775904883289871,
"loss": 5.2988,
"mean_token_accuracy": 0.1718204289674759,
"num_tokens": 33029212.0,
"step": 17910
},
{
"entropy": 5.594880437850952,
"epoch": 1.5051039697542534,
"grad_norm": 1.6328125,
"learning_rate": 0.00047757745803271936,
"loss": 5.351,
"mean_token_accuracy": 0.17042253464460372,
"num_tokens": 33038893.0,
"step": 17915
},
{
"entropy": 5.579921007156372,
"epoch": 1.5055240495694182,
"grad_norm": 1.484375,
"learning_rate": 0.0004775644241478962,
"loss": 5.3341,
"mean_token_accuracy": 0.16926731467247008,
"num_tokens": 33048058.0,
"step": 17920
},
{
"entropy": 5.522654056549072,
"epoch": 1.505944129384583,
"grad_norm": 1.3046875,
"learning_rate": 0.00047755138667474864,
"loss": 5.2415,
"mean_token_accuracy": 0.18073787540197372,
"num_tokens": 33057106.0,
"step": 17925
},
{
"entropy": 5.564896965026856,
"epoch": 1.506364209199748,
"grad_norm": 1.34375,
"learning_rate": 0.0004775383456135075,
"loss": 5.3883,
"mean_token_accuracy": 0.17372349947690963,
"num_tokens": 33066400.0,
"step": 17930
},
{
"entropy": 5.596910762786865,
"epoch": 1.5067842890149128,
"grad_norm": 1.390625,
"learning_rate": 0.0004775253009644038,
"loss": 5.2598,
"mean_token_accuracy": 0.18395816534757614,
"num_tokens": 33075357.0,
"step": 17935
},
{
"entropy": 5.702732753753662,
"epoch": 1.5072043688300778,
"grad_norm": 1.1953125,
"learning_rate": 0.00047751225272766885,
"loss": 5.3546,
"mean_token_accuracy": 0.16766141802072526,
"num_tokens": 33085707.0,
"step": 17940
},
{
"entropy": 5.694062423706055,
"epoch": 1.5076244486452426,
"grad_norm": 1.421875,
"learning_rate": 0.0004774992009035335,
"loss": 5.4634,
"mean_token_accuracy": 0.1673523962497711,
"num_tokens": 33095825.0,
"step": 17945
},
{
"entropy": 5.521922779083252,
"epoch": 1.5080445284604074,
"grad_norm": 1.40625,
"learning_rate": 0.0004774861454922291,
"loss": 5.2645,
"mean_token_accuracy": 0.17879536151885986,
"num_tokens": 33105130.0,
"step": 17950
},
{
"entropy": 5.503930473327637,
"epoch": 1.5084646082755724,
"grad_norm": 1.390625,
"learning_rate": 0.0004774730864939869,
"loss": 5.2876,
"mean_token_accuracy": 0.17215581685304643,
"num_tokens": 33113226.0,
"step": 17955
},
{
"entropy": 5.615638256072998,
"epoch": 1.5088846880907374,
"grad_norm": 1.34375,
"learning_rate": 0.00047746002390903824,
"loss": 5.2788,
"mean_token_accuracy": 0.18200136572122574,
"num_tokens": 33120824.0,
"step": 17960
},
{
"entropy": 5.663376998901367,
"epoch": 1.5093047679059022,
"grad_norm": 1.5,
"learning_rate": 0.0004774469577376145,
"loss": 5.2736,
"mean_token_accuracy": 0.18282802850008012,
"num_tokens": 33129503.0,
"step": 17965
},
{
"entropy": 5.484948587417603,
"epoch": 1.509724847721067,
"grad_norm": 1.265625,
"learning_rate": 0.00047743388797994715,
"loss": 5.201,
"mean_token_accuracy": 0.17696729451417922,
"num_tokens": 33138838.0,
"step": 17970
},
{
"entropy": 5.527910041809082,
"epoch": 1.5101449275362318,
"grad_norm": 1.59375,
"learning_rate": 0.00047742081463626767,
"loss": 5.306,
"mean_token_accuracy": 0.17541001588106156,
"num_tokens": 33148142.0,
"step": 17975
},
{
"entropy": 5.5546680927276615,
"epoch": 1.5105650073513968,
"grad_norm": 1.3203125,
"learning_rate": 0.0004774077377068078,
"loss": 5.3059,
"mean_token_accuracy": 0.17930330634117125,
"num_tokens": 33156750.0,
"step": 17980
},
{
"entropy": 5.674122428894043,
"epoch": 1.5109850871665618,
"grad_norm": 1.3671875,
"learning_rate": 0.000477394657191799,
"loss": 5.4445,
"mean_token_accuracy": 0.1651099517941475,
"num_tokens": 33166511.0,
"step": 17985
},
{
"entropy": 5.607362747192383,
"epoch": 1.5114051669817266,
"grad_norm": 1.3671875,
"learning_rate": 0.00047738157309147307,
"loss": 5.3824,
"mean_token_accuracy": 0.17614495307207106,
"num_tokens": 33175812.0,
"step": 17990
},
{
"entropy": 5.549844884872437,
"epoch": 1.5118252467968913,
"grad_norm": 1.3828125,
"learning_rate": 0.00047736848540606174,
"loss": 5.2574,
"mean_token_accuracy": 0.17114411890506745,
"num_tokens": 33185201.0,
"step": 17995
},
{
"entropy": 5.573437643051148,
"epoch": 1.5122453266120561,
"grad_norm": 1.421875,
"learning_rate": 0.000477355394135797,
"loss": 5.24,
"mean_token_accuracy": 0.17542240619659424,
"num_tokens": 33195151.0,
"step": 18000
},
{
"epoch": 1.5122453266120561,
"eval_entropy": 5.420882165215047,
"eval_loss": 5.399367332458496,
"eval_mean_token_accuracy": 0.17883830919488547,
"eval_num_tokens": 33195151.0,
"eval_runtime": 27.4199,
"eval_samples_per_second": 1362.732,
"eval_steps_per_second": 170.351,
"step": 18000
},
{
"entropy": 5.586634397506714,
"epoch": 1.5126654064272211,
"grad_norm": 1.3984375,
"learning_rate": 0.0004773422992809106,
"loss": 5.2963,
"mean_token_accuracy": 0.17793478816747665,
"num_tokens": 33204800.0,
"step": 18005
},
{
"entropy": 5.56807918548584,
"epoch": 1.5130854862423861,
"grad_norm": 1.5078125,
"learning_rate": 0.0004773292008416346,
"loss": 5.3461,
"mean_token_accuracy": 0.17049676328897476,
"num_tokens": 33214529.0,
"step": 18010
},
{
"entropy": 5.631432580947876,
"epoch": 1.513505566057551,
"grad_norm": 1.375,
"learning_rate": 0.00047731609881820095,
"loss": 5.3432,
"mean_token_accuracy": 0.17332853376865387,
"num_tokens": 33224522.0,
"step": 18015
},
{
"entropy": 5.6420072555542,
"epoch": 1.5139256458727157,
"grad_norm": 1.390625,
"learning_rate": 0.00047730299321084173,
"loss": 5.3414,
"mean_token_accuracy": 0.17412778586149216,
"num_tokens": 33233220.0,
"step": 18020
},
{
"entropy": 5.573407888412476,
"epoch": 1.5143457256878807,
"grad_norm": 1.296875,
"learning_rate": 0.00047728988401978916,
"loss": 5.2663,
"mean_token_accuracy": 0.1760746493935585,
"num_tokens": 33242277.0,
"step": 18025
},
{
"entropy": 5.616208076477051,
"epoch": 1.5147658055030457,
"grad_norm": 1.390625,
"learning_rate": 0.0004772767712452756,
"loss": 5.3201,
"mean_token_accuracy": 0.18292029649019242,
"num_tokens": 33251113.0,
"step": 18030
},
{
"entropy": 5.556571435928345,
"epoch": 1.5151858853182105,
"grad_norm": 1.40625,
"learning_rate": 0.00047726365488753305,
"loss": 5.4646,
"mean_token_accuracy": 0.16428973972797395,
"num_tokens": 33261055.0,
"step": 18035
},
{
"entropy": 5.657004547119141,
"epoch": 1.5156059651333753,
"grad_norm": 1.375,
"learning_rate": 0.00047725053494679403,
"loss": 5.4257,
"mean_token_accuracy": 0.16714733242988586,
"num_tokens": 33270981.0,
"step": 18040
},
{
"entropy": 5.726076412200928,
"epoch": 1.51602604494854,
"grad_norm": 1.4765625,
"learning_rate": 0.00047723741142329104,
"loss": 5.375,
"mean_token_accuracy": 0.17077612578868867,
"num_tokens": 33279516.0,
"step": 18045
},
{
"entropy": 5.542011642456055,
"epoch": 1.516446124763705,
"grad_norm": 1.2109375,
"learning_rate": 0.00047722428431725637,
"loss": 5.2833,
"mean_token_accuracy": 0.18069555163383483,
"num_tokens": 33288300.0,
"step": 18050
},
{
"entropy": 5.530180358886719,
"epoch": 1.5168662045788701,
"grad_norm": 1.328125,
"learning_rate": 0.0004772111536289226,
"loss": 5.3407,
"mean_token_accuracy": 0.16956976056098938,
"num_tokens": 33299059.0,
"step": 18055
},
{
"entropy": 5.57734694480896,
"epoch": 1.517286284394035,
"grad_norm": 1.3515625,
"learning_rate": 0.00047719801935852235,
"loss": 5.3832,
"mean_token_accuracy": 0.16791318953037263,
"num_tokens": 33308879.0,
"step": 18060
},
{
"entropy": 5.699840641021728,
"epoch": 1.5177063642091997,
"grad_norm": 1.4921875,
"learning_rate": 0.0004771848815062883,
"loss": 5.4577,
"mean_token_accuracy": 0.16262564510107042,
"num_tokens": 33318615.0,
"step": 18065
},
{
"entropy": 5.701881361007691,
"epoch": 1.5181264440243645,
"grad_norm": 1.3359375,
"learning_rate": 0.0004771717400724532,
"loss": 5.4824,
"mean_token_accuracy": 0.16449054181575776,
"num_tokens": 33328748.0,
"step": 18070
},
{
"entropy": 5.64151611328125,
"epoch": 1.5185465238395295,
"grad_norm": 1.265625,
"learning_rate": 0.0004771585950572499,
"loss": 5.3066,
"mean_token_accuracy": 0.1706370383501053,
"num_tokens": 33338350.0,
"step": 18075
},
{
"entropy": 5.569923305511475,
"epoch": 1.5189666036546945,
"grad_norm": 1.1796875,
"learning_rate": 0.0004771454464609111,
"loss": 5.3196,
"mean_token_accuracy": 0.1730243444442749,
"num_tokens": 33348202.0,
"step": 18080
},
{
"entropy": 5.500693082809448,
"epoch": 1.5193866834698593,
"grad_norm": 1.53125,
"learning_rate": 0.0004771322942836699,
"loss": 5.3257,
"mean_token_accuracy": 0.17528198510408402,
"num_tokens": 33356996.0,
"step": 18085
},
{
"entropy": 5.691603899002075,
"epoch": 1.519806763285024,
"grad_norm": 1.3671875,
"learning_rate": 0.0004771191385257592,
"loss": 5.4404,
"mean_token_accuracy": 0.16499459147453308,
"num_tokens": 33366173.0,
"step": 18090
},
{
"entropy": 5.655971717834473,
"epoch": 1.520226843100189,
"grad_norm": 1.375,
"learning_rate": 0.0004771059791874119,
"loss": 5.335,
"mean_token_accuracy": 0.16834377944469453,
"num_tokens": 33375921.0,
"step": 18095
},
{
"entropy": 5.509462976455689,
"epoch": 1.520646922915354,
"grad_norm": 1.421875,
"learning_rate": 0.0004770928162688613,
"loss": 5.3233,
"mean_token_accuracy": 0.17299026995897293,
"num_tokens": 33385538.0,
"step": 18100
},
{
"entropy": 5.541356992721558,
"epoch": 1.5210670027305189,
"grad_norm": 1.203125,
"learning_rate": 0.00047707964977034055,
"loss": 5.2456,
"mean_token_accuracy": 0.1807330995798111,
"num_tokens": 33393728.0,
"step": 18105
},
{
"entropy": 5.667530918121338,
"epoch": 1.5214870825456837,
"grad_norm": 1.3046875,
"learning_rate": 0.0004770664796920828,
"loss": 5.3286,
"mean_token_accuracy": 0.17077636122703552,
"num_tokens": 33402540.0,
"step": 18110
},
{
"entropy": 5.490131521224976,
"epoch": 1.5219071623608484,
"grad_norm": 1.6171875,
"learning_rate": 0.0004770533060343215,
"loss": 5.3043,
"mean_token_accuracy": 0.17259591370820998,
"num_tokens": 33411706.0,
"step": 18115
},
{
"entropy": 5.521937036514283,
"epoch": 1.5223272421760135,
"grad_norm": 1.296875,
"learning_rate": 0.0004770401287972899,
"loss": 5.2664,
"mean_token_accuracy": 0.1781530499458313,
"num_tokens": 33420604.0,
"step": 18120
},
{
"entropy": 5.532708930969238,
"epoch": 1.5227473219911785,
"grad_norm": 1.1953125,
"learning_rate": 0.00047702694798122143,
"loss": 5.2264,
"mean_token_accuracy": 0.18006049543619157,
"num_tokens": 33429558.0,
"step": 18125
},
{
"entropy": 5.753080224990844,
"epoch": 1.5231674018063432,
"grad_norm": 1.265625,
"learning_rate": 0.00047701376358634957,
"loss": 5.4545,
"mean_token_accuracy": 0.1655917227268219,
"num_tokens": 33439620.0,
"step": 18130
},
{
"entropy": 5.61541485786438,
"epoch": 1.523587481621508,
"grad_norm": 1.453125,
"learning_rate": 0.00047700057561290797,
"loss": 5.4068,
"mean_token_accuracy": 0.16134429723024368,
"num_tokens": 33449067.0,
"step": 18135
},
{
"entropy": 5.503342485427856,
"epoch": 1.5240075614366728,
"grad_norm": 1.453125,
"learning_rate": 0.0004769873840611302,
"loss": 5.2821,
"mean_token_accuracy": 0.1764286294579506,
"num_tokens": 33458089.0,
"step": 18140
},
{
"entropy": 5.607730436325073,
"epoch": 1.5244276412518378,
"grad_norm": 1.453125,
"learning_rate": 0.0004769741889312499,
"loss": 5.4277,
"mean_token_accuracy": 0.17113181203603745,
"num_tokens": 33466883.0,
"step": 18145
},
{
"entropy": 5.618331575393677,
"epoch": 1.5248477210670028,
"grad_norm": 1.5703125,
"learning_rate": 0.00047696099022350087,
"loss": 5.4271,
"mean_token_accuracy": 0.17118463665246964,
"num_tokens": 33476649.0,
"step": 18150
},
{
"entropy": 5.692851448059082,
"epoch": 1.5252678008821676,
"grad_norm": 1.4453125,
"learning_rate": 0.00047694778793811685,
"loss": 5.4196,
"mean_token_accuracy": 0.16614175736904144,
"num_tokens": 33486274.0,
"step": 18155
},
{
"entropy": 5.601528739929199,
"epoch": 1.5256878806973324,
"grad_norm": 1.640625,
"learning_rate": 0.00047693458207533177,
"loss": 5.2884,
"mean_token_accuracy": 0.17039375752210617,
"num_tokens": 33494950.0,
"step": 18160
},
{
"entropy": 5.574803876876831,
"epoch": 1.5261079605124974,
"grad_norm": 1.4296875,
"learning_rate": 0.0004769213726353795,
"loss": 5.3216,
"mean_token_accuracy": 0.1731376364827156,
"num_tokens": 33503545.0,
"step": 18165
},
{
"entropy": 5.57457480430603,
"epoch": 1.5265280403276622,
"grad_norm": 1.5,
"learning_rate": 0.00047690815961849416,
"loss": 5.3575,
"mean_token_accuracy": 0.17385486662387847,
"num_tokens": 33512871.0,
"step": 18170
},
{
"entropy": 5.5856538772583,
"epoch": 1.5269481201428272,
"grad_norm": 1.4453125,
"learning_rate": 0.0004768949430249097,
"loss": 5.2891,
"mean_token_accuracy": 0.1725542053580284,
"num_tokens": 33521933.0,
"step": 18175
},
{
"entropy": 5.559426784515381,
"epoch": 1.527368199957992,
"grad_norm": 1.3828125,
"learning_rate": 0.0004768817228548603,
"loss": 5.268,
"mean_token_accuracy": 0.1707003191113472,
"num_tokens": 33531370.0,
"step": 18180
},
{
"entropy": 5.616719961166382,
"epoch": 1.5277882797731568,
"grad_norm": 1.375,
"learning_rate": 0.0004768684991085802,
"loss": 5.3545,
"mean_token_accuracy": 0.173495814204216,
"num_tokens": 33540310.0,
"step": 18185
},
{
"entropy": 5.6103486061096195,
"epoch": 1.5282083595883218,
"grad_norm": 1.421875,
"learning_rate": 0.00047685527178630347,
"loss": 5.3681,
"mean_token_accuracy": 0.16762521266937255,
"num_tokens": 33549943.0,
"step": 18190
},
{
"entropy": 5.646720027923584,
"epoch": 1.5286284394034868,
"grad_norm": 1.328125,
"learning_rate": 0.0004768420408882646,
"loss": 5.4264,
"mean_token_accuracy": 0.16873300820589066,
"num_tokens": 33560167.0,
"step": 18195
},
{
"entropy": 5.669474267959595,
"epoch": 1.5290485192186516,
"grad_norm": 1.53125,
"learning_rate": 0.00047682880641469787,
"loss": 5.3234,
"mean_token_accuracy": 0.17365003377199173,
"num_tokens": 33569604.0,
"step": 18200
},
{
"entropy": 5.606691360473633,
"epoch": 1.5294685990338164,
"grad_norm": 1.4765625,
"learning_rate": 0.0004768155683658378,
"loss": 5.3184,
"mean_token_accuracy": 0.17364266365766526,
"num_tokens": 33578400.0,
"step": 18205
},
{
"entropy": 5.547252130508423,
"epoch": 1.5298886788489812,
"grad_norm": 1.3671875,
"learning_rate": 0.0004768023267419188,
"loss": 5.2908,
"mean_token_accuracy": 0.17417265027761458,
"num_tokens": 33587527.0,
"step": 18210
},
{
"entropy": 5.549459171295166,
"epoch": 1.5303087586641462,
"grad_norm": 1.375,
"learning_rate": 0.0004767890815431756,
"loss": 5.2271,
"mean_token_accuracy": 0.1846227303147316,
"num_tokens": 33596026.0,
"step": 18215
},
{
"entropy": 5.595511770248413,
"epoch": 1.5307288384793112,
"grad_norm": 1.34375,
"learning_rate": 0.00047677583276984264,
"loss": 5.3043,
"mean_token_accuracy": 0.17908285707235336,
"num_tokens": 33605906.0,
"step": 18220
},
{
"entropy": 5.576016998291015,
"epoch": 1.531148918294476,
"grad_norm": 1.3203125,
"learning_rate": 0.0004767625804221548,
"loss": 5.2753,
"mean_token_accuracy": 0.1736286923289299,
"num_tokens": 33615758.0,
"step": 18225
},
{
"entropy": 5.545658111572266,
"epoch": 1.5315689981096408,
"grad_norm": 1.40625,
"learning_rate": 0.0004767493245003466,
"loss": 5.344,
"mean_token_accuracy": 0.18771408200263978,
"num_tokens": 33625486.0,
"step": 18230
},
{
"entropy": 5.576317882537841,
"epoch": 1.5319890779248058,
"grad_norm": 1.3359375,
"learning_rate": 0.00047673606500465315,
"loss": 5.2737,
"mean_token_accuracy": 0.18102063238620758,
"num_tokens": 33633954.0,
"step": 18235
},
{
"entropy": 5.600673866271973,
"epoch": 1.5324091577399706,
"grad_norm": 1.9921875,
"learning_rate": 0.000476722801935309,
"loss": 5.3624,
"mean_token_accuracy": 0.17380766123533248,
"num_tokens": 33642478.0,
"step": 18240
},
{
"entropy": 5.4986964702606205,
"epoch": 1.5328292375551356,
"grad_norm": 1.4296875,
"learning_rate": 0.0004767095352925495,
"loss": 5.2843,
"mean_token_accuracy": 0.18213609904050826,
"num_tokens": 33650785.0,
"step": 18245
},
{
"entropy": 5.544048881530761,
"epoch": 1.5332493173703003,
"grad_norm": 1.5390625,
"learning_rate": 0.0004766962650766093,
"loss": 5.2401,
"mean_token_accuracy": 0.18110573291778564,
"num_tokens": 33659677.0,
"step": 18250
},
{
"entropy": 5.651211929321289,
"epoch": 1.5336693971854651,
"grad_norm": 1.4609375,
"learning_rate": 0.00047668299128772365,
"loss": 5.4181,
"mean_token_accuracy": 0.16567772328853608,
"num_tokens": 33669493.0,
"step": 18255
},
{
"entropy": 5.650081586837769,
"epoch": 1.5340894770006301,
"grad_norm": 1.2890625,
"learning_rate": 0.0004766697139261277,
"loss": 5.4001,
"mean_token_accuracy": 0.17230152040719987,
"num_tokens": 33678446.0,
"step": 18260
},
{
"entropy": 5.608279085159301,
"epoch": 1.5345095568157952,
"grad_norm": 1.375,
"learning_rate": 0.0004766564329920566,
"loss": 5.246,
"mean_token_accuracy": 0.18318932354450226,
"num_tokens": 33687647.0,
"step": 18265
},
{
"entropy": 5.5770190238952635,
"epoch": 1.53492963663096,
"grad_norm": 1.921875,
"learning_rate": 0.0004766431484857456,
"loss": 5.342,
"mean_token_accuracy": 0.1709815412759781,
"num_tokens": 33697395.0,
"step": 18270
},
{
"entropy": 5.521883773803711,
"epoch": 1.5353497164461247,
"grad_norm": 1.2734375,
"learning_rate": 0.00047662986040743004,
"loss": 5.3378,
"mean_token_accuracy": 0.17965355515480042,
"num_tokens": 33706779.0,
"step": 18275
},
{
"entropy": 5.573388719558716,
"epoch": 1.5357697962612895,
"grad_norm": 1.46875,
"learning_rate": 0.0004766165687573454,
"loss": 5.3217,
"mean_token_accuracy": 0.17336629778146745,
"num_tokens": 33714828.0,
"step": 18280
},
{
"entropy": 5.66737322807312,
"epoch": 1.5361898760764545,
"grad_norm": 1.2734375,
"learning_rate": 0.000476603273535727,
"loss": 5.3234,
"mean_token_accuracy": 0.17424203604459762,
"num_tokens": 33724730.0,
"step": 18285
},
{
"entropy": 5.643036127090454,
"epoch": 1.5366099558916195,
"grad_norm": 1.5078125,
"learning_rate": 0.0004765899747428104,
"loss": 5.3793,
"mean_token_accuracy": 0.17208393216133117,
"num_tokens": 33734374.0,
"step": 18290
},
{
"entropy": 5.637545013427735,
"epoch": 1.5370300357067843,
"grad_norm": 1.421875,
"learning_rate": 0.00047657667237883125,
"loss": 5.367,
"mean_token_accuracy": 0.17792066782712937,
"num_tokens": 33743395.0,
"step": 18295
},
{
"entropy": 5.623939180374146,
"epoch": 1.537450115521949,
"grad_norm": 1.3671875,
"learning_rate": 0.00047656336644402513,
"loss": 5.4171,
"mean_token_accuracy": 0.1672622188925743,
"num_tokens": 33752526.0,
"step": 18300
},
{
"entropy": 5.607234048843384,
"epoch": 1.5378701953371139,
"grad_norm": 1.640625,
"learning_rate": 0.0004765500569386278,
"loss": 5.3636,
"mean_token_accuracy": 0.18062976002693176,
"num_tokens": 33761310.0,
"step": 18305
},
{
"entropy": 5.569280052185059,
"epoch": 1.538290275152279,
"grad_norm": 1.3515625,
"learning_rate": 0.000476536743862875,
"loss": 5.2889,
"mean_token_accuracy": 0.1751941427588463,
"num_tokens": 33770870.0,
"step": 18310
},
{
"entropy": 5.504629468917846,
"epoch": 1.538710354967444,
"grad_norm": 1.3828125,
"learning_rate": 0.00047652342721700246,
"loss": 5.2176,
"mean_token_accuracy": 0.1756708025932312,
"num_tokens": 33779648.0,
"step": 18315
},
{
"entropy": 5.60121750831604,
"epoch": 1.5391304347826087,
"grad_norm": 1.375,
"learning_rate": 0.0004765101070012462,
"loss": 5.4085,
"mean_token_accuracy": 0.16546077877283097,
"num_tokens": 33789172.0,
"step": 18320
},
{
"entropy": 5.679625797271728,
"epoch": 1.5395505145977735,
"grad_norm": 1.484375,
"learning_rate": 0.00047649678321584214,
"loss": 5.392,
"mean_token_accuracy": 0.16740069687366485,
"num_tokens": 33798069.0,
"step": 18325
},
{
"entropy": 5.640754508972168,
"epoch": 1.5399705944129385,
"grad_norm": 1.4765625,
"learning_rate": 0.00047648345586102643,
"loss": 5.336,
"mean_token_accuracy": 0.17600129991769792,
"num_tokens": 33806214.0,
"step": 18330
},
{
"entropy": 5.59784722328186,
"epoch": 1.5403906742281035,
"grad_norm": 1.4765625,
"learning_rate": 0.000476470124937035,
"loss": 5.3451,
"mean_token_accuracy": 0.17743887156248092,
"num_tokens": 33815365.0,
"step": 18335
},
{
"entropy": 5.620279121398926,
"epoch": 1.5408107540432683,
"grad_norm": 1.546875,
"learning_rate": 0.000476456790444104,
"loss": 5.2657,
"mean_token_accuracy": 0.18388193994760513,
"num_tokens": 33825204.0,
"step": 18340
},
{
"entropy": 5.615415573120117,
"epoch": 1.541230833858433,
"grad_norm": 1.484375,
"learning_rate": 0.0004764434523824697,
"loss": 5.3927,
"mean_token_accuracy": 0.17317160218954086,
"num_tokens": 33834439.0,
"step": 18345
},
{
"entropy": 5.585323619842529,
"epoch": 1.5416509136735979,
"grad_norm": 1.4296875,
"learning_rate": 0.00047643011075236845,
"loss": 5.353,
"mean_token_accuracy": 0.171696674823761,
"num_tokens": 33843959.0,
"step": 18350
},
{
"entropy": 5.693304347991943,
"epoch": 1.5420709934887629,
"grad_norm": 1.3515625,
"learning_rate": 0.00047641676555403646,
"loss": 5.4078,
"mean_token_accuracy": 0.16951505839824677,
"num_tokens": 33853234.0,
"step": 18355
},
{
"entropy": 5.6106541633605955,
"epoch": 1.5424910733039279,
"grad_norm": 1.6015625,
"learning_rate": 0.0004764034167877102,
"loss": 5.3165,
"mean_token_accuracy": 0.1737983211874962,
"num_tokens": 33861755.0,
"step": 18360
},
{
"entropy": 5.593862056732178,
"epoch": 1.5429111531190927,
"grad_norm": 1.4375,
"learning_rate": 0.00047639006445362607,
"loss": 5.4074,
"mean_token_accuracy": 0.17588503062725067,
"num_tokens": 33870956.0,
"step": 18365
},
{
"entropy": 5.504476165771484,
"epoch": 1.5433312329342574,
"grad_norm": 1.3515625,
"learning_rate": 0.0004763767085520207,
"loss": 5.2616,
"mean_token_accuracy": 0.1767754912376404,
"num_tokens": 33880568.0,
"step": 18370
},
{
"entropy": 5.670652580261231,
"epoch": 1.5437513127494222,
"grad_norm": 1.4140625,
"learning_rate": 0.0004763633490831306,
"loss": 5.4574,
"mean_token_accuracy": 0.16699085980653763,
"num_tokens": 33890145.0,
"step": 18375
},
{
"entropy": 5.565153551101685,
"epoch": 1.5441713925645872,
"grad_norm": 1.6015625,
"learning_rate": 0.0004763499860471925,
"loss": 5.3088,
"mean_token_accuracy": 0.17394726872444152,
"num_tokens": 33899155.0,
"step": 18380
},
{
"entropy": 5.599471759796143,
"epoch": 1.5445914723797523,
"grad_norm": 1.390625,
"learning_rate": 0.000476336619444443,
"loss": 5.3449,
"mean_token_accuracy": 0.16752467006444932,
"num_tokens": 33909410.0,
"step": 18385
},
{
"entropy": 5.594975757598877,
"epoch": 1.545011552194917,
"grad_norm": 1.328125,
"learning_rate": 0.000476323249275119,
"loss": 5.2232,
"mean_token_accuracy": 0.1820020020008087,
"num_tokens": 33918451.0,
"step": 18390
},
{
"entropy": 5.563750982284546,
"epoch": 1.5454316320100818,
"grad_norm": 1.4140625,
"learning_rate": 0.0004763098755394573,
"loss": 5.2662,
"mean_token_accuracy": 0.1729125678539276,
"num_tokens": 33928317.0,
"step": 18395
},
{
"entropy": 5.560149192810059,
"epoch": 1.5458517118252468,
"grad_norm": 1.359375,
"learning_rate": 0.0004762964982376949,
"loss": 5.4287,
"mean_token_accuracy": 0.17457393258810044,
"num_tokens": 33938010.0,
"step": 18400
},
{
"entropy": 5.597962379455566,
"epoch": 1.5462717916404118,
"grad_norm": 1.375,
"learning_rate": 0.00047628311737006856,
"loss": 5.2532,
"mean_token_accuracy": 0.1729839026927948,
"num_tokens": 33946964.0,
"step": 18405
},
{
"entropy": 5.578508758544922,
"epoch": 1.5466918714555766,
"grad_norm": 1.4609375,
"learning_rate": 0.00047626973293681555,
"loss": 5.2648,
"mean_token_accuracy": 0.1738666921854019,
"num_tokens": 33956026.0,
"step": 18410
},
{
"entropy": 5.514623785018921,
"epoch": 1.5471119512707414,
"grad_norm": 1.28125,
"learning_rate": 0.0004762563449381728,
"loss": 5.3227,
"mean_token_accuracy": 0.16739058643579482,
"num_tokens": 33965787.0,
"step": 18415
},
{
"entropy": 5.593003463745117,
"epoch": 1.5475320310859062,
"grad_norm": 1.3671875,
"learning_rate": 0.00047624295337437753,
"loss": 5.3348,
"mean_token_accuracy": 0.1739223748445511,
"num_tokens": 33974178.0,
"step": 18420
},
{
"entropy": 5.562832403182983,
"epoch": 1.5479521109010712,
"grad_norm": 1.328125,
"learning_rate": 0.0004762295582456669,
"loss": 5.2217,
"mean_token_accuracy": 0.17637139260768891,
"num_tokens": 33983652.0,
"step": 18425
},
{
"entropy": 5.62539267539978,
"epoch": 1.5483721907162362,
"grad_norm": 1.28125,
"learning_rate": 0.00047621615955227835,
"loss": 5.283,
"mean_token_accuracy": 0.18211861401796342,
"num_tokens": 33991938.0,
"step": 18430
},
{
"entropy": 5.49025821685791,
"epoch": 1.548792270531401,
"grad_norm": 1.3046875,
"learning_rate": 0.0004762027572944491,
"loss": 5.2782,
"mean_token_accuracy": 0.17370593100786208,
"num_tokens": 33999918.0,
"step": 18435
},
{
"entropy": 5.4819506168365475,
"epoch": 1.5492123503465658,
"grad_norm": 1.3671875,
"learning_rate": 0.00047618935147241667,
"loss": 5.2957,
"mean_token_accuracy": 0.18002650886774063,
"num_tokens": 34008416.0,
"step": 18440
},
{
"entropy": 5.612959527969361,
"epoch": 1.5496324301617306,
"grad_norm": 1.3515625,
"learning_rate": 0.0004761759420864184,
"loss": 5.365,
"mean_token_accuracy": 0.16775781512260438,
"num_tokens": 34017616.0,
"step": 18445
},
{
"entropy": 5.588992166519165,
"epoch": 1.5500525099768956,
"grad_norm": 1.3359375,
"learning_rate": 0.000476162529136692,
"loss": 5.301,
"mean_token_accuracy": 0.1780085414648056,
"num_tokens": 34026064.0,
"step": 18450
},
{
"entropy": 5.547345972061157,
"epoch": 1.5504725897920606,
"grad_norm": 1.4921875,
"learning_rate": 0.0004761491126234749,
"loss": 5.2024,
"mean_token_accuracy": 0.1776260942220688,
"num_tokens": 34035378.0,
"step": 18455
},
{
"entropy": 5.568131017684936,
"epoch": 1.5508926696072254,
"grad_norm": 1.3828125,
"learning_rate": 0.0004761356925470049,
"loss": 5.2649,
"mean_token_accuracy": 0.1744866043329239,
"num_tokens": 34044600.0,
"step": 18460
},
{
"entropy": 5.572581338882446,
"epoch": 1.5513127494223902,
"grad_norm": 1.390625,
"learning_rate": 0.00047612226890751956,
"loss": 5.337,
"mean_token_accuracy": 0.17249666899442673,
"num_tokens": 34054680.0,
"step": 18465
},
{
"entropy": 5.541142511367798,
"epoch": 1.5517328292375552,
"grad_norm": 1.3828125,
"learning_rate": 0.00047610884170525697,
"loss": 5.2711,
"mean_token_accuracy": 0.18112896382808685,
"num_tokens": 34063034.0,
"step": 18470
},
{
"entropy": 5.518846035003662,
"epoch": 1.55215290905272,
"grad_norm": 1.375,
"learning_rate": 0.0004760954109404547,
"loss": 5.2738,
"mean_token_accuracy": 0.18065272718667985,
"num_tokens": 34072122.0,
"step": 18475
},
{
"entropy": 5.6067907333374025,
"epoch": 1.552572988867885,
"grad_norm": 1.5546875,
"learning_rate": 0.0004760819766133508,
"loss": 5.287,
"mean_token_accuracy": 0.1691929116845131,
"num_tokens": 34081493.0,
"step": 18480
},
{
"entropy": 5.62939944267273,
"epoch": 1.5529930686830498,
"grad_norm": 1.5625,
"learning_rate": 0.00047606853872418317,
"loss": 5.3516,
"mean_token_accuracy": 0.16516932398080825,
"num_tokens": 34090872.0,
"step": 18485
},
{
"entropy": 5.4948036670684814,
"epoch": 1.5534131484982145,
"grad_norm": 1.21875,
"learning_rate": 0.0004760550972731899,
"loss": 5.2149,
"mean_token_accuracy": 0.1798781931400299,
"num_tokens": 34100729.0,
"step": 18490
},
{
"entropy": 5.4923333644866945,
"epoch": 1.5538332283133796,
"grad_norm": 1.46875,
"learning_rate": 0.0004760416522606092,
"loss": 5.2014,
"mean_token_accuracy": 0.18570833951234816,
"num_tokens": 34109492.0,
"step": 18495
},
{
"entropy": 5.525872039794922,
"epoch": 1.5542533081285446,
"grad_norm": 1.3828125,
"learning_rate": 0.0004760282036866791,
"loss": 5.3274,
"mean_token_accuracy": 0.1768384724855423,
"num_tokens": 34119529.0,
"step": 18500
},
{
"entropy": 5.663821506500244,
"epoch": 1.5546733879437094,
"grad_norm": 1.3984375,
"learning_rate": 0.0004760147515516379,
"loss": 5.348,
"mean_token_accuracy": 0.1726509377360344,
"num_tokens": 34128261.0,
"step": 18505
},
{
"entropy": 5.559554386138916,
"epoch": 1.5550934677588741,
"grad_norm": 1.375,
"learning_rate": 0.00047600129585572386,
"loss": 5.353,
"mean_token_accuracy": 0.17648673951625823,
"num_tokens": 34136916.0,
"step": 18510
},
{
"entropy": 5.5959492206573485,
"epoch": 1.555513547574039,
"grad_norm": 1.3125,
"learning_rate": 0.0004759878365991754,
"loss": 5.2611,
"mean_token_accuracy": 0.17831981778144837,
"num_tokens": 34146400.0,
"step": 18515
},
{
"entropy": 5.561361169815063,
"epoch": 1.555933627389204,
"grad_norm": 1.5703125,
"learning_rate": 0.0004759743737822309,
"loss": 5.3194,
"mean_token_accuracy": 0.1799671843647957,
"num_tokens": 34155611.0,
"step": 18520
},
{
"entropy": 5.5850330829620365,
"epoch": 1.556353707204369,
"grad_norm": 1.5234375,
"learning_rate": 0.00047596090740512884,
"loss": 5.3449,
"mean_token_accuracy": 0.17395980805158615,
"num_tokens": 34165301.0,
"step": 18525
},
{
"entropy": 5.642131185531616,
"epoch": 1.5567737870195337,
"grad_norm": 1.46875,
"learning_rate": 0.00047594743746810786,
"loss": 5.3091,
"mean_token_accuracy": 0.172430819272995,
"num_tokens": 34174655.0,
"step": 18530
},
{
"entropy": 5.695926475524902,
"epoch": 1.5571938668346985,
"grad_norm": 1.3046875,
"learning_rate": 0.00047593396397140644,
"loss": 5.4679,
"mean_token_accuracy": 0.16660561114549638,
"num_tokens": 34184293.0,
"step": 18535
},
{
"entropy": 5.604151725769043,
"epoch": 1.5576139466498635,
"grad_norm": 1.34375,
"learning_rate": 0.0004759204869152632,
"loss": 5.3602,
"mean_token_accuracy": 0.1705666095018387,
"num_tokens": 34193025.0,
"step": 18540
},
{
"entropy": 5.546497774124146,
"epoch": 1.5580340264650283,
"grad_norm": 1.3203125,
"learning_rate": 0.0004759070062999171,
"loss": 5.2662,
"mean_token_accuracy": 0.17659903615713118,
"num_tokens": 34201082.0,
"step": 18545
},
{
"entropy": 5.632139539718628,
"epoch": 1.5584541062801933,
"grad_norm": 1.3125,
"learning_rate": 0.0004758935221256069,
"loss": 5.4082,
"mean_token_accuracy": 0.16844913065433503,
"num_tokens": 34211210.0,
"step": 18550
},
{
"entropy": 5.6329648971557615,
"epoch": 1.558874186095358,
"grad_norm": 1.3046875,
"learning_rate": 0.00047588003439257134,
"loss": 5.3354,
"mean_token_accuracy": 0.17121753990650176,
"num_tokens": 34220309.0,
"step": 18555
},
{
"entropy": 5.622624158859253,
"epoch": 1.559294265910523,
"grad_norm": 1.296875,
"learning_rate": 0.00047586654310104946,
"loss": 5.3367,
"mean_token_accuracy": 0.1659384474158287,
"num_tokens": 34229532.0,
"step": 18560
},
{
"entropy": 5.664423656463623,
"epoch": 1.559714345725688,
"grad_norm": 1.4609375,
"learning_rate": 0.0004758530482512801,
"loss": 5.5497,
"mean_token_accuracy": 0.16148972809314727,
"num_tokens": 34239543.0,
"step": 18565
},
{
"entropy": 5.634637689590454,
"epoch": 1.560134425540853,
"grad_norm": 1.2578125,
"learning_rate": 0.0004758395498435024,
"loss": 5.3505,
"mean_token_accuracy": 0.17231488972902298,
"num_tokens": 34248654.0,
"step": 18570
},
{
"entropy": 5.59573802947998,
"epoch": 1.5605545053560177,
"grad_norm": 1.4296875,
"learning_rate": 0.00047582604787795555,
"loss": 5.3453,
"mean_token_accuracy": 0.16665179580450057,
"num_tokens": 34258757.0,
"step": 18575
},
{
"entropy": 5.604764938354492,
"epoch": 1.5609745851711825,
"grad_norm": 1.625,
"learning_rate": 0.0004758125423548787,
"loss": 5.3374,
"mean_token_accuracy": 0.17045502811670304,
"num_tokens": 34268253.0,
"step": 18580
},
{
"entropy": 5.6785985946655275,
"epoch": 1.5613946649863473,
"grad_norm": 1.8828125,
"learning_rate": 0.00047579903327451097,
"loss": 5.4108,
"mean_token_accuracy": 0.16939797401428222,
"num_tokens": 34277361.0,
"step": 18585
},
{
"entropy": 5.549003744125367,
"epoch": 1.5618147448015123,
"grad_norm": 1.2734375,
"learning_rate": 0.0004757855206370919,
"loss": 5.2855,
"mean_token_accuracy": 0.1721700608730316,
"num_tokens": 34285923.0,
"step": 18590
},
{
"entropy": 5.485878419876099,
"epoch": 1.5622348246166773,
"grad_norm": 1.46875,
"learning_rate": 0.00047577200444286064,
"loss": 5.2762,
"mean_token_accuracy": 0.17415012866258622,
"num_tokens": 34296300.0,
"step": 18595
},
{
"entropy": 5.696203279495239,
"epoch": 1.562654904431842,
"grad_norm": 1.2890625,
"learning_rate": 0.0004757584846920567,
"loss": 5.3281,
"mean_token_accuracy": 0.17202821522951126,
"num_tokens": 34305757.0,
"step": 18600
},
{
"entropy": 5.663634872436523,
"epoch": 1.5630749842470069,
"grad_norm": 1.3671875,
"learning_rate": 0.0004757449613849196,
"loss": 5.362,
"mean_token_accuracy": 0.16650892049074173,
"num_tokens": 34314714.0,
"step": 18605
},
{
"entropy": 5.615960884094238,
"epoch": 1.5634950640621716,
"grad_norm": 1.40625,
"learning_rate": 0.00047573143452168883,
"loss": 5.3831,
"mean_token_accuracy": 0.17684735208749772,
"num_tokens": 34323501.0,
"step": 18610
},
{
"entropy": 5.6220183849334715,
"epoch": 1.5639151438773367,
"grad_norm": 1.4453125,
"learning_rate": 0.00047571790410260405,
"loss": 5.3256,
"mean_token_accuracy": 0.17838662564754487,
"num_tokens": 34331752.0,
"step": 18615
},
{
"entropy": 5.6933129787445065,
"epoch": 1.5643352236925017,
"grad_norm": 1.234375,
"learning_rate": 0.000475704370127905,
"loss": 5.3725,
"mean_token_accuracy": 0.16566499918699265,
"num_tokens": 34341479.0,
"step": 18620
},
{
"entropy": 5.5972644805908205,
"epoch": 1.5647553035076665,
"grad_norm": 1.4921875,
"learning_rate": 0.0004756908325978314,
"loss": 5.3849,
"mean_token_accuracy": 0.17097063660621642,
"num_tokens": 34350991.0,
"step": 18625
},
{
"entropy": 5.599239778518677,
"epoch": 1.5651753833228312,
"grad_norm": 1.28125,
"learning_rate": 0.00047567729151262305,
"loss": 5.2867,
"mean_token_accuracy": 0.1815695881843567,
"num_tokens": 34360089.0,
"step": 18630
},
{
"entropy": 5.609292316436767,
"epoch": 1.5655954631379962,
"grad_norm": 1.3515625,
"learning_rate": 0.0004756637468725198,
"loss": 5.2688,
"mean_token_accuracy": 0.17064931839704514,
"num_tokens": 34370352.0,
"step": 18635
},
{
"entropy": 5.571317672729492,
"epoch": 1.5660155429531613,
"grad_norm": 1.3671875,
"learning_rate": 0.0004756501986777616,
"loss": 5.2612,
"mean_token_accuracy": 0.17301129549741745,
"num_tokens": 34378958.0,
"step": 18640
},
{
"entropy": 5.540798377990723,
"epoch": 1.566435622768326,
"grad_norm": 1.4140625,
"learning_rate": 0.00047563664692858843,
"loss": 5.2255,
"mean_token_accuracy": 0.1825499266386032,
"num_tokens": 34387723.0,
"step": 18645
},
{
"entropy": 5.579714918136597,
"epoch": 1.5668557025834908,
"grad_norm": 1.359375,
"learning_rate": 0.0004756230916252404,
"loss": 5.3369,
"mean_token_accuracy": 0.1789557605981827,
"num_tokens": 34397089.0,
"step": 18650
},
{
"entropy": 5.692389965057373,
"epoch": 1.5672757823986556,
"grad_norm": 1.390625,
"learning_rate": 0.00047560953276795756,
"loss": 5.3607,
"mean_token_accuracy": 0.17229301780462264,
"num_tokens": 34406278.0,
"step": 18655
},
{
"entropy": 5.606803941726684,
"epoch": 1.5676958622138206,
"grad_norm": 1.484375,
"learning_rate": 0.00047559597035698014,
"loss": 5.3271,
"mean_token_accuracy": 0.17409724295139312,
"num_tokens": 34415404.0,
"step": 18660
},
{
"entropy": 5.57696123123169,
"epoch": 1.5681159420289856,
"grad_norm": 1.4296875,
"learning_rate": 0.0004755824043925485,
"loss": 5.3642,
"mean_token_accuracy": 0.17574930042028428,
"num_tokens": 34425036.0,
"step": 18665
},
{
"entropy": 5.630114126205444,
"epoch": 1.5685360218441504,
"grad_norm": 2.328125,
"learning_rate": 0.0004755688348749027,
"loss": 5.2988,
"mean_token_accuracy": 0.1728532612323761,
"num_tokens": 34434246.0,
"step": 18670
},
{
"entropy": 5.559924459457397,
"epoch": 1.5689561016593152,
"grad_norm": 1.5,
"learning_rate": 0.0004755552618042834,
"loss": 5.2989,
"mean_token_accuracy": 0.17132496684789658,
"num_tokens": 34444189.0,
"step": 18675
},
{
"entropy": 5.58116307258606,
"epoch": 1.56937618147448,
"grad_norm": 1.421875,
"learning_rate": 0.0004755416851809308,
"loss": 5.2859,
"mean_token_accuracy": 0.17897624522447586,
"num_tokens": 34453727.0,
"step": 18680
},
{
"entropy": 5.503720998764038,
"epoch": 1.569796261289645,
"grad_norm": 1.4296875,
"learning_rate": 0.0004755281050050856,
"loss": 5.2814,
"mean_token_accuracy": 0.17434412688016893,
"num_tokens": 34462835.0,
"step": 18685
},
{
"entropy": 5.573409175872802,
"epoch": 1.57021634110481,
"grad_norm": 1.25,
"learning_rate": 0.0004755145212769882,
"loss": 5.3447,
"mean_token_accuracy": 0.17159089893102647,
"num_tokens": 34471642.0,
"step": 18690
},
{
"entropy": 5.614776182174682,
"epoch": 1.5706364209199748,
"grad_norm": 1.46875,
"learning_rate": 0.00047550093399687936,
"loss": 5.315,
"mean_token_accuracy": 0.1728378102183342,
"num_tokens": 34480468.0,
"step": 18695
},
{
"entropy": 5.687031221389771,
"epoch": 1.5710565007351396,
"grad_norm": 1.3203125,
"learning_rate": 0.0004754873431649997,
"loss": 5.3355,
"mean_token_accuracy": 0.17302245199680327,
"num_tokens": 34490299.0,
"step": 18700
},
{
"entropy": 5.629738712310791,
"epoch": 1.5714765805503046,
"grad_norm": 1.3125,
"learning_rate": 0.00047547374878159003,
"loss": 5.3642,
"mean_token_accuracy": 0.16929103881120683,
"num_tokens": 34498831.0,
"step": 18705
},
{
"entropy": 5.581797409057617,
"epoch": 1.5718966603654696,
"grad_norm": 1.5625,
"learning_rate": 0.0004754601508468911,
"loss": 5.3323,
"mean_token_accuracy": 0.17347660511732102,
"num_tokens": 34508048.0,
"step": 18710
},
{
"entropy": 5.559632110595703,
"epoch": 1.5723167401806344,
"grad_norm": 1.421875,
"learning_rate": 0.0004754465493611438,
"loss": 5.4249,
"mean_token_accuracy": 0.16627505868673326,
"num_tokens": 34517070.0,
"step": 18715
},
{
"entropy": 5.587091970443725,
"epoch": 1.5727368199957992,
"grad_norm": 1.3046875,
"learning_rate": 0.00047543294432458904,
"loss": 5.2219,
"mean_token_accuracy": 0.17662144601345062,
"num_tokens": 34525934.0,
"step": 18720
},
{
"entropy": 5.697764205932617,
"epoch": 1.573156899810964,
"grad_norm": 1.28125,
"learning_rate": 0.000475419335737468,
"loss": 5.4227,
"mean_token_accuracy": 0.1721249610185623,
"num_tokens": 34534222.0,
"step": 18725
},
{
"entropy": 5.649804353713989,
"epoch": 1.573576979626129,
"grad_norm": 1.484375,
"learning_rate": 0.00047540572360002157,
"loss": 5.4063,
"mean_token_accuracy": 0.17289355844259263,
"num_tokens": 34543291.0,
"step": 18730
},
{
"entropy": 5.58485975265503,
"epoch": 1.573997059441294,
"grad_norm": 1.265625,
"learning_rate": 0.00047539210791249095,
"loss": 5.2708,
"mean_token_accuracy": 0.17763001322746277,
"num_tokens": 34552383.0,
"step": 18735
},
{
"entropy": 5.607690334320068,
"epoch": 1.5744171392564588,
"grad_norm": 1.5390625,
"learning_rate": 0.0004753784886751173,
"loss": 5.2726,
"mean_token_accuracy": 0.18037284165620804,
"num_tokens": 34560311.0,
"step": 18740
},
{
"entropy": 5.541455221176148,
"epoch": 1.5748372190716236,
"grad_norm": 1.4453125,
"learning_rate": 0.0004753648658881419,
"loss": 5.2919,
"mean_token_accuracy": 0.18228598237037658,
"num_tokens": 34569903.0,
"step": 18745
},
{
"entropy": 5.554886388778686,
"epoch": 1.5752572988867883,
"grad_norm": 1.3203125,
"learning_rate": 0.00047535123955180607,
"loss": 5.3184,
"mean_token_accuracy": 0.17227640748023987,
"num_tokens": 34579735.0,
"step": 18750
},
{
"entropy": 5.67020959854126,
"epoch": 1.5756773787019533,
"grad_norm": 1.359375,
"learning_rate": 0.0004753376096663512,
"loss": 5.3529,
"mean_token_accuracy": 0.16938093155622483,
"num_tokens": 34589105.0,
"step": 18755
},
{
"entropy": 5.537098121643067,
"epoch": 1.5760974585171184,
"grad_norm": 1.3203125,
"learning_rate": 0.00047532397623201877,
"loss": 5.2889,
"mean_token_accuracy": 0.18200737982988358,
"num_tokens": 34597883.0,
"step": 18760
},
{
"entropy": 5.568551349639892,
"epoch": 1.5765175383322831,
"grad_norm": 1.453125,
"learning_rate": 0.00047531033924905024,
"loss": 5.2668,
"mean_token_accuracy": 0.18290874511003494,
"num_tokens": 34606666.0,
"step": 18765
},
{
"entropy": 5.645462083816528,
"epoch": 1.576937618147448,
"grad_norm": 1.34375,
"learning_rate": 0.0004752966987176873,
"loss": 5.3654,
"mean_token_accuracy": 0.17319027781486512,
"num_tokens": 34616547.0,
"step": 18770
},
{
"entropy": 5.613454675674438,
"epoch": 1.577357697962613,
"grad_norm": 1.3515625,
"learning_rate": 0.0004752830546381713,
"loss": 5.3634,
"mean_token_accuracy": 0.17614342719316484,
"num_tokens": 34625679.0,
"step": 18775
},
{
"entropy": 5.492621231079101,
"epoch": 1.5777777777777777,
"grad_norm": 1.578125,
"learning_rate": 0.0004752694070107442,
"loss": 5.3067,
"mean_token_accuracy": 0.17374027371406556,
"num_tokens": 34635633.0,
"step": 18780
},
{
"entropy": 5.575917339324951,
"epoch": 1.5781978575929427,
"grad_norm": 1.3515625,
"learning_rate": 0.0004752557558356476,
"loss": 5.3324,
"mean_token_accuracy": 0.1735405743122101,
"num_tokens": 34645206.0,
"step": 18785
},
{
"entropy": 5.65060510635376,
"epoch": 1.5786179374081075,
"grad_norm": 1.6015625,
"learning_rate": 0.0004752421011131234,
"loss": 5.3402,
"mean_token_accuracy": 0.16817475110292435,
"num_tokens": 34653884.0,
"step": 18790
},
{
"entropy": 5.572496652603149,
"epoch": 1.5790380172232723,
"grad_norm": 1.40625,
"learning_rate": 0.00047522844284341364,
"loss": 5.1925,
"mean_token_accuracy": 0.17981759756803511,
"num_tokens": 34662170.0,
"step": 18795
},
{
"entropy": 5.503551292419433,
"epoch": 1.5794580970384373,
"grad_norm": 1.5390625,
"learning_rate": 0.0004752147810267601,
"loss": 5.3483,
"mean_token_accuracy": 0.17128741592168809,
"num_tokens": 34672548.0,
"step": 18800
},
{
"entropy": 5.639070844650268,
"epoch": 1.5798781768536023,
"grad_norm": 1.3203125,
"learning_rate": 0.00047520111566340465,
"loss": 5.3612,
"mean_token_accuracy": 0.17719526588916779,
"num_tokens": 34680972.0,
"step": 18805
},
{
"entropy": 5.569992733001709,
"epoch": 1.580298256668767,
"grad_norm": 1.453125,
"learning_rate": 0.00047518744675358965,
"loss": 5.2337,
"mean_token_accuracy": 0.18217017501592636,
"num_tokens": 34689589.0,
"step": 18810
},
{
"entropy": 5.5319360256195065,
"epoch": 1.580718336483932,
"grad_norm": 1.3203125,
"learning_rate": 0.0004751737742975571,
"loss": 5.2579,
"mean_token_accuracy": 0.17374635636806487,
"num_tokens": 34698747.0,
"step": 18815
},
{
"entropy": 5.580434083938599,
"epoch": 1.5811384162990967,
"grad_norm": 1.296875,
"learning_rate": 0.00047516009829554913,
"loss": 5.3204,
"mean_token_accuracy": 0.1734059274196625,
"num_tokens": 34707502.0,
"step": 18820
},
{
"entropy": 5.5628293514251705,
"epoch": 1.5815584961142617,
"grad_norm": 1.3203125,
"learning_rate": 0.00047514641874780815,
"loss": 5.246,
"mean_token_accuracy": 0.17817162573337555,
"num_tokens": 34715879.0,
"step": 18825
},
{
"entropy": 5.532876491546631,
"epoch": 1.5819785759294267,
"grad_norm": 1.3671875,
"learning_rate": 0.00047513273565457644,
"loss": 5.4367,
"mean_token_accuracy": 0.17213662713766098,
"num_tokens": 34726090.0,
"step": 18830
},
{
"entropy": 5.704280281066895,
"epoch": 1.5823986557445915,
"grad_norm": 1.484375,
"learning_rate": 0.0004751190490160964,
"loss": 5.3869,
"mean_token_accuracy": 0.16371422857046128,
"num_tokens": 34736014.0,
"step": 18835
},
{
"entropy": 5.698468255996704,
"epoch": 1.5828187355597563,
"grad_norm": 1.203125,
"learning_rate": 0.00047510535883261035,
"loss": 5.3194,
"mean_token_accuracy": 0.1788153365254402,
"num_tokens": 34745648.0,
"step": 18840
},
{
"entropy": 5.564353990554809,
"epoch": 1.5832388153749213,
"grad_norm": 1.421875,
"learning_rate": 0.000475091665104361,
"loss": 5.3036,
"mean_token_accuracy": 0.17691005319356917,
"num_tokens": 34753908.0,
"step": 18845
},
{
"entropy": 5.540375280380249,
"epoch": 1.583658895190086,
"grad_norm": 1.7578125,
"learning_rate": 0.0004750779678315908,
"loss": 5.165,
"mean_token_accuracy": 0.18461597859859466,
"num_tokens": 34762303.0,
"step": 18850
},
{
"entropy": 5.511554479598999,
"epoch": 1.584078975005251,
"grad_norm": 1.390625,
"learning_rate": 0.0004750642670145424,
"loss": 5.3476,
"mean_token_accuracy": 0.16993636637926102,
"num_tokens": 34771463.0,
"step": 18855
},
{
"entropy": 5.6761252880096436,
"epoch": 1.5844990548204159,
"grad_norm": 1.3671875,
"learning_rate": 0.0004750505626534585,
"loss": 5.4194,
"mean_token_accuracy": 0.1731710687279701,
"num_tokens": 34780704.0,
"step": 18860
},
{
"entropy": 5.542312288284302,
"epoch": 1.5849191346355807,
"grad_norm": 1.3359375,
"learning_rate": 0.00047503685474858194,
"loss": 5.2351,
"mean_token_accuracy": 0.18194950371980667,
"num_tokens": 34790262.0,
"step": 18865
},
{
"entropy": 5.591007041931152,
"epoch": 1.5853392144507457,
"grad_norm": 1.7109375,
"learning_rate": 0.0004750231433001555,
"loss": 5.2758,
"mean_token_accuracy": 0.17748731523752212,
"num_tokens": 34799450.0,
"step": 18870
},
{
"entropy": 5.659072828292847,
"epoch": 1.5857592942659107,
"grad_norm": 1.3671875,
"learning_rate": 0.0004750094283084221,
"loss": 5.3253,
"mean_token_accuracy": 0.17358425110578538,
"num_tokens": 34808220.0,
"step": 18875
},
{
"entropy": 5.631747198104859,
"epoch": 1.5861793740810755,
"grad_norm": 1.3125,
"learning_rate": 0.00047499570977362467,
"loss": 5.3478,
"mean_token_accuracy": 0.16828888207674025,
"num_tokens": 34817846.0,
"step": 18880
},
{
"entropy": 5.612795686721801,
"epoch": 1.5865994538962402,
"grad_norm": 1.5078125,
"learning_rate": 0.00047498198769600617,
"loss": 5.3675,
"mean_token_accuracy": 0.17242051810026168,
"num_tokens": 34826962.0,
"step": 18885
},
{
"entropy": 5.505703115463257,
"epoch": 1.587019533711405,
"grad_norm": 1.328125,
"learning_rate": 0.0004749682620758097,
"loss": 5.3021,
"mean_token_accuracy": 0.17003610134124755,
"num_tokens": 34837170.0,
"step": 18890
},
{
"entropy": 5.523403692245483,
"epoch": 1.58743961352657,
"grad_norm": 1.359375,
"learning_rate": 0.00047495453291327854,
"loss": 5.2822,
"mean_token_accuracy": 0.17889610230922698,
"num_tokens": 34845336.0,
"step": 18895
},
{
"entropy": 5.613968801498413,
"epoch": 1.587859693341735,
"grad_norm": 1.4296875,
"learning_rate": 0.00047494080020865577,
"loss": 5.2758,
"mean_token_accuracy": 0.17693043649196624,
"num_tokens": 34854613.0,
"step": 18900
},
{
"entropy": 5.659094333648682,
"epoch": 1.5882797731568998,
"grad_norm": 1.4765625,
"learning_rate": 0.0004749270639621846,
"loss": 5.3559,
"mean_token_accuracy": 0.1739572212100029,
"num_tokens": 34864254.0,
"step": 18905
},
{
"entropy": 5.597303676605224,
"epoch": 1.5886998529720646,
"grad_norm": 1.5234375,
"learning_rate": 0.0004749133241741085,
"loss": 5.4026,
"mean_token_accuracy": 0.16696639060974122,
"num_tokens": 34874380.0,
"step": 18910
},
{
"entropy": 5.572857189178467,
"epoch": 1.5891199327872296,
"grad_norm": 1.4296875,
"learning_rate": 0.0004748995808446708,
"loss": 5.352,
"mean_token_accuracy": 0.17355572879314424,
"num_tokens": 34883688.0,
"step": 18915
},
{
"entropy": 5.559622383117675,
"epoch": 1.5895400126023944,
"grad_norm": 1.4609375,
"learning_rate": 0.00047488583397411495,
"loss": 5.2227,
"mean_token_accuracy": 0.18158024847507476,
"num_tokens": 34892831.0,
"step": 18920
},
{
"entropy": 5.602974081039429,
"epoch": 1.5899600924175594,
"grad_norm": 1.578125,
"learning_rate": 0.00047487208356268454,
"loss": 5.3125,
"mean_token_accuracy": 0.18413764983415604,
"num_tokens": 34901517.0,
"step": 18925
},
{
"entropy": 5.559728956222534,
"epoch": 1.5903801722327242,
"grad_norm": 1.4140625,
"learning_rate": 0.00047485832961062296,
"loss": 5.3148,
"mean_token_accuracy": 0.17255878299474717,
"num_tokens": 34910765.0,
"step": 18930
},
{
"entropy": 5.599199008941651,
"epoch": 1.590800252047889,
"grad_norm": 1.6015625,
"learning_rate": 0.00047484457211817405,
"loss": 5.3574,
"mean_token_accuracy": 0.1614689275622368,
"num_tokens": 34919799.0,
"step": 18935
},
{
"entropy": 5.485872983932495,
"epoch": 1.591220331863054,
"grad_norm": 1.421875,
"learning_rate": 0.00047483081108558143,
"loss": 5.2247,
"mean_token_accuracy": 0.1758006453514099,
"num_tokens": 34928199.0,
"step": 18940
},
{
"entropy": 5.621739387512207,
"epoch": 1.591640411678219,
"grad_norm": 1.5546875,
"learning_rate": 0.000474817046513089,
"loss": 5.379,
"mean_token_accuracy": 0.17343777865171434,
"num_tokens": 34937751.0,
"step": 18945
},
{
"entropy": 5.656492614746094,
"epoch": 1.5920604914933838,
"grad_norm": 1.3359375,
"learning_rate": 0.0004748032784009403,
"loss": 5.3082,
"mean_token_accuracy": 0.17786511331796645,
"num_tokens": 34946052.0,
"step": 18950
},
{
"entropy": 5.608128404617309,
"epoch": 1.5924805713085486,
"grad_norm": 1.3984375,
"learning_rate": 0.0004747895067493796,
"loss": 5.3067,
"mean_token_accuracy": 0.17302953004837035,
"num_tokens": 34954932.0,
"step": 18955
},
{
"entropy": 5.6118710994720455,
"epoch": 1.5929006511237134,
"grad_norm": 1.4296875,
"learning_rate": 0.0004747757315586505,
"loss": 5.2776,
"mean_token_accuracy": 0.17757894545793534,
"num_tokens": 34963581.0,
"step": 18960
},
{
"entropy": 5.433234643936157,
"epoch": 1.5933207309388784,
"grad_norm": 1.3515625,
"learning_rate": 0.00047476195282899727,
"loss": 5.0891,
"mean_token_accuracy": 0.1897970035672188,
"num_tokens": 34972844.0,
"step": 18965
},
{
"entropy": 5.538211870193481,
"epoch": 1.5937408107540434,
"grad_norm": 1.640625,
"learning_rate": 0.00047474817056066383,
"loss": 5.3046,
"mean_token_accuracy": 0.1808512181043625,
"num_tokens": 34981998.0,
"step": 18970
},
{
"entropy": 5.514722394943237,
"epoch": 1.5941608905692082,
"grad_norm": 1.2890625,
"learning_rate": 0.00047473438475389453,
"loss": 5.2382,
"mean_token_accuracy": 0.17814717888832093,
"num_tokens": 34990552.0,
"step": 18975
},
{
"entropy": 5.573274278640747,
"epoch": 1.594580970384373,
"grad_norm": 1.3203125,
"learning_rate": 0.0004747205954089333,
"loss": 5.249,
"mean_token_accuracy": 0.18452810496091843,
"num_tokens": 35000259.0,
"step": 18980
},
{
"entropy": 5.637682247161865,
"epoch": 1.5950010501995378,
"grad_norm": 1.3828125,
"learning_rate": 0.0004747068025260247,
"loss": 5.3551,
"mean_token_accuracy": 0.16950240433216096,
"num_tokens": 35009592.0,
"step": 18985
},
{
"entropy": 5.603251504898071,
"epoch": 1.5954211300147028,
"grad_norm": 1.578125,
"learning_rate": 0.0004746930061054129,
"loss": 5.3895,
"mean_token_accuracy": 0.16506906151771544,
"num_tokens": 35019356.0,
"step": 18990
},
{
"entropy": 5.5750750541687015,
"epoch": 1.5958412098298678,
"grad_norm": 1.3515625,
"learning_rate": 0.00047467920614734224,
"loss": 5.3095,
"mean_token_accuracy": 0.17518302649259568,
"num_tokens": 35028764.0,
"step": 18995
},
{
"entropy": 5.562907695770264,
"epoch": 1.5962612896450326,
"grad_norm": 1.1796875,
"learning_rate": 0.0004746654026520573,
"loss": 5.3159,
"mean_token_accuracy": 0.1747448429465294,
"num_tokens": 35037903.0,
"step": 19000
},
{
"entropy": 5.531657886505127,
"epoch": 1.5966813694601973,
"grad_norm": 1.3515625,
"learning_rate": 0.0004746515956198026,
"loss": 5.2225,
"mean_token_accuracy": 0.17857593297958374,
"num_tokens": 35046326.0,
"step": 19005
},
{
"entropy": 5.692345142364502,
"epoch": 1.5971014492753624,
"grad_norm": 1.5078125,
"learning_rate": 0.00047463778505082266,
"loss": 5.436,
"mean_token_accuracy": 0.1656318113207817,
"num_tokens": 35055551.0,
"step": 19010
},
{
"entropy": 5.570341920852661,
"epoch": 1.5975215290905274,
"grad_norm": 1.71875,
"learning_rate": 0.0004746239709453621,
"loss": 5.2258,
"mean_token_accuracy": 0.18383382856845856,
"num_tokens": 35065595.0,
"step": 19015
},
{
"entropy": 5.574179649353027,
"epoch": 1.5979416089056921,
"grad_norm": 1.40625,
"learning_rate": 0.0004746101533036658,
"loss": 5.3549,
"mean_token_accuracy": 0.17486544847488403,
"num_tokens": 35075097.0,
"step": 19020
},
{
"entropy": 5.727721738815307,
"epoch": 1.598361688720857,
"grad_norm": 1.4375,
"learning_rate": 0.00047459633212597834,
"loss": 5.4221,
"mean_token_accuracy": 0.16538667380809785,
"num_tokens": 35084092.0,
"step": 19025
},
{
"entropy": 5.57632040977478,
"epoch": 1.5987817685360217,
"grad_norm": 1.328125,
"learning_rate": 0.0004745825074125447,
"loss": 5.3016,
"mean_token_accuracy": 0.17495665550231934,
"num_tokens": 35093007.0,
"step": 19030
},
{
"entropy": 5.655797910690308,
"epoch": 1.5992018483511867,
"grad_norm": 1.359375,
"learning_rate": 0.0004745686791636097,
"loss": 5.3811,
"mean_token_accuracy": 0.16567041873931884,
"num_tokens": 35103094.0,
"step": 19035
},
{
"entropy": 5.529991865158081,
"epoch": 1.5996219281663517,
"grad_norm": 1.21875,
"learning_rate": 0.00047455484737941823,
"loss": 5.2064,
"mean_token_accuracy": 0.179788838326931,
"num_tokens": 35112561.0,
"step": 19040
},
{
"entropy": 5.520506525039673,
"epoch": 1.6000420079815165,
"grad_norm": 1.3828125,
"learning_rate": 0.0004745410120602155,
"loss": 5.3016,
"mean_token_accuracy": 0.1710441455245018,
"num_tokens": 35121718.0,
"step": 19045
},
{
"entropy": 5.591058444976807,
"epoch": 1.6004620877966813,
"grad_norm": 1.1953125,
"learning_rate": 0.00047452717320624647,
"loss": 5.2565,
"mean_token_accuracy": 0.18709157705307006,
"num_tokens": 35130073.0,
"step": 19050
},
{
"entropy": 5.600809001922608,
"epoch": 1.600882167611846,
"grad_norm": 1.40625,
"learning_rate": 0.0004745133308177562,
"loss": 5.3002,
"mean_token_accuracy": 0.1735777661204338,
"num_tokens": 35138876.0,
"step": 19055
},
{
"entropy": 5.56061429977417,
"epoch": 1.601302247427011,
"grad_norm": 1.421875,
"learning_rate": 0.00047449948489499007,
"loss": 5.3071,
"mean_token_accuracy": 0.1721906304359436,
"num_tokens": 35147750.0,
"step": 19060
},
{
"entropy": 5.592257070541382,
"epoch": 1.6017223272421761,
"grad_norm": 1.4375,
"learning_rate": 0.00047448563543819335,
"loss": 5.3202,
"mean_token_accuracy": 0.17968302518129348,
"num_tokens": 35156955.0,
"step": 19065
},
{
"entropy": 5.5946574211120605,
"epoch": 1.602142407057341,
"grad_norm": 1.28125,
"learning_rate": 0.0004744717824476112,
"loss": 5.3402,
"mean_token_accuracy": 0.17173443287611007,
"num_tokens": 35166542.0,
"step": 19070
},
{
"entropy": 5.635613679885864,
"epoch": 1.6025624868725057,
"grad_norm": 1.453125,
"learning_rate": 0.00047445792592348926,
"loss": 5.2959,
"mean_token_accuracy": 0.17714515775442125,
"num_tokens": 35175258.0,
"step": 19075
},
{
"entropy": 5.582405424118042,
"epoch": 1.6029825666876707,
"grad_norm": 1.5859375,
"learning_rate": 0.0004744440658660729,
"loss": 5.2984,
"mean_token_accuracy": 0.17503910958766938,
"num_tokens": 35184970.0,
"step": 19080
},
{
"entropy": 5.5917713165283205,
"epoch": 1.6034026465028357,
"grad_norm": 1.25,
"learning_rate": 0.0004744302022756075,
"loss": 5.2838,
"mean_token_accuracy": 0.1705200970172882,
"num_tokens": 35193948.0,
"step": 19085
},
{
"entropy": 5.50913405418396,
"epoch": 1.6038227263180005,
"grad_norm": 1.4921875,
"learning_rate": 0.00047441633515233874,
"loss": 5.2458,
"mean_token_accuracy": 0.17985685020685196,
"num_tokens": 35203792.0,
"step": 19090
},
{
"entropy": 5.581792640686035,
"epoch": 1.6042428061331653,
"grad_norm": 1.4765625,
"learning_rate": 0.0004744024644965123,
"loss": 5.41,
"mean_token_accuracy": 0.16977565437555314,
"num_tokens": 35212684.0,
"step": 19095
},
{
"entropy": 5.574728631973267,
"epoch": 1.60466288594833,
"grad_norm": 1.3203125,
"learning_rate": 0.00047438859030837397,
"loss": 5.2231,
"mean_token_accuracy": 0.1803961992263794,
"num_tokens": 35220830.0,
"step": 19100
},
{
"entropy": 5.607393741607666,
"epoch": 1.605082965763495,
"grad_norm": 1.265625,
"learning_rate": 0.00047437471258816936,
"loss": 5.2887,
"mean_token_accuracy": 0.17364899814128876,
"num_tokens": 35230171.0,
"step": 19105
},
{
"entropy": 5.560548639297485,
"epoch": 1.60550304557866,
"grad_norm": 1.3671875,
"learning_rate": 0.00047436083133614446,
"loss": 5.2057,
"mean_token_accuracy": 0.17734796106815337,
"num_tokens": 35239022.0,
"step": 19110
},
{
"entropy": 5.5449165344238285,
"epoch": 1.6059231253938249,
"grad_norm": 1.265625,
"learning_rate": 0.00047434694655254495,
"loss": 5.2288,
"mean_token_accuracy": 0.17386915683746337,
"num_tokens": 35247564.0,
"step": 19115
},
{
"entropy": 5.526667261123658,
"epoch": 1.6063432052089897,
"grad_norm": 1.421875,
"learning_rate": 0.000474333058237617,
"loss": 5.2712,
"mean_token_accuracy": 0.16903400421142578,
"num_tokens": 35256175.0,
"step": 19120
},
{
"entropy": 5.672097826004029,
"epoch": 1.6067632850241544,
"grad_norm": 1.4296875,
"learning_rate": 0.00047431916639160656,
"loss": 5.4293,
"mean_token_accuracy": 0.17139460295438766,
"num_tokens": 35265278.0,
"step": 19125
},
{
"entropy": 5.492290496826172,
"epoch": 1.6071833648393195,
"grad_norm": 1.421875,
"learning_rate": 0.0004743052710147598,
"loss": 5.1382,
"mean_token_accuracy": 0.1823027327656746,
"num_tokens": 35274715.0,
"step": 19130
},
{
"entropy": 5.455765104293823,
"epoch": 1.6076034446544845,
"grad_norm": 1.625,
"learning_rate": 0.00047429137210732266,
"loss": 5.2576,
"mean_token_accuracy": 0.17954068183898925,
"num_tokens": 35285450.0,
"step": 19135
},
{
"entropy": 5.582807874679565,
"epoch": 1.6080235244696492,
"grad_norm": 1.3984375,
"learning_rate": 0.0004742774696695415,
"loss": 5.278,
"mean_token_accuracy": 0.16844225078821182,
"num_tokens": 35294531.0,
"step": 19140
},
{
"entropy": 5.627006578445434,
"epoch": 1.608443604284814,
"grad_norm": 1.2890625,
"learning_rate": 0.00047426356370166266,
"loss": 5.3311,
"mean_token_accuracy": 0.16784023940563203,
"num_tokens": 35303749.0,
"step": 19145
},
{
"entropy": 5.52952880859375,
"epoch": 1.608863684099979,
"grad_norm": 1.3203125,
"learning_rate": 0.0004742496542039324,
"loss": 5.2841,
"mean_token_accuracy": 0.1710781380534172,
"num_tokens": 35312994.0,
"step": 19150
},
{
"entropy": 5.547932195663452,
"epoch": 1.6092837639151438,
"grad_norm": 1.3125,
"learning_rate": 0.00047423574117659703,
"loss": 5.2623,
"mean_token_accuracy": 0.1761298656463623,
"num_tokens": 35322533.0,
"step": 19155
},
{
"entropy": 5.572248315811157,
"epoch": 1.6097038437303088,
"grad_norm": 1.359375,
"learning_rate": 0.00047422182461990316,
"loss": 5.2838,
"mean_token_accuracy": 0.1790923595428467,
"num_tokens": 35331872.0,
"step": 19160
},
{
"entropy": 5.552614784240722,
"epoch": 1.6101239235454736,
"grad_norm": 1.265625,
"learning_rate": 0.00047420790453409724,
"loss": 5.327,
"mean_token_accuracy": 0.17446180433034897,
"num_tokens": 35341517.0,
"step": 19165
},
{
"entropy": 5.54572696685791,
"epoch": 1.6105440033606384,
"grad_norm": 1.2421875,
"learning_rate": 0.0004741939809194258,
"loss": 5.2363,
"mean_token_accuracy": 0.17793529629707336,
"num_tokens": 35350291.0,
"step": 19170
},
{
"entropy": 5.622583627700806,
"epoch": 1.6109640831758034,
"grad_norm": 1.4140625,
"learning_rate": 0.00047418005377613566,
"loss": 5.4104,
"mean_token_accuracy": 0.16662026047706605,
"num_tokens": 35360711.0,
"step": 19175
},
{
"entropy": 5.610341024398804,
"epoch": 1.6113841629909684,
"grad_norm": 1.3515625,
"learning_rate": 0.0004741661231044733,
"loss": 5.3238,
"mean_token_accuracy": 0.17247323095798492,
"num_tokens": 35370069.0,
"step": 19180
},
{
"entropy": 5.6449347019195555,
"epoch": 1.6118042428061332,
"grad_norm": 1.328125,
"learning_rate": 0.00047415218890468577,
"loss": 5.3008,
"mean_token_accuracy": 0.18313287645578386,
"num_tokens": 35380389.0,
"step": 19185
},
{
"entropy": 5.5601356506347654,
"epoch": 1.612224322621298,
"grad_norm": 1.3125,
"learning_rate": 0.0004741382511770197,
"loss": 5.3058,
"mean_token_accuracy": 0.17109933644533157,
"num_tokens": 35389420.0,
"step": 19190
},
{
"entropy": 5.634692716598511,
"epoch": 1.6126444024364628,
"grad_norm": 1.3125,
"learning_rate": 0.00047412430992172205,
"loss": 5.3993,
"mean_token_accuracy": 0.1628316357731819,
"num_tokens": 35399418.0,
"step": 19195
},
{
"entropy": 5.604163455963135,
"epoch": 1.6130644822516278,
"grad_norm": 1.3125,
"learning_rate": 0.00047411036513903974,
"loss": 5.2895,
"mean_token_accuracy": 0.1781422734260559,
"num_tokens": 35408717.0,
"step": 19200
},
{
"entropy": 5.549163198471069,
"epoch": 1.6134845620667928,
"grad_norm": 1.2578125,
"learning_rate": 0.00047409641682921987,
"loss": 5.2432,
"mean_token_accuracy": 0.18698584735393525,
"num_tokens": 35417118.0,
"step": 19205
},
{
"entropy": 5.571460771560669,
"epoch": 1.6139046418819576,
"grad_norm": 1.265625,
"learning_rate": 0.0004740824649925096,
"loss": 5.3268,
"mean_token_accuracy": 0.1735591173171997,
"num_tokens": 35425526.0,
"step": 19210
},
{
"entropy": 5.527240848541259,
"epoch": 1.6143247216971224,
"grad_norm": 1.375,
"learning_rate": 0.0004740685096291559,
"loss": 5.3385,
"mean_token_accuracy": 0.1702783703804016,
"num_tokens": 35434932.0,
"step": 19215
},
{
"entropy": 5.654340124130249,
"epoch": 1.6147448015122874,
"grad_norm": 1.4765625,
"learning_rate": 0.00047405455073940597,
"loss": 5.3584,
"mean_token_accuracy": 0.17494580149650574,
"num_tokens": 35443909.0,
"step": 19220
},
{
"entropy": 5.6878255844116214,
"epoch": 1.6151648813274522,
"grad_norm": 1.3671875,
"learning_rate": 0.0004740405883235072,
"loss": 5.3209,
"mean_token_accuracy": 0.17715817987918853,
"num_tokens": 35454082.0,
"step": 19225
},
{
"entropy": 5.635473680496216,
"epoch": 1.6155849611426172,
"grad_norm": 1.21875,
"learning_rate": 0.00047402662238170694,
"loss": 5.3797,
"mean_token_accuracy": 0.16992955505847931,
"num_tokens": 35464547.0,
"step": 19230
},
{
"entropy": 5.577557563781738,
"epoch": 1.616005040957782,
"grad_norm": 1.2421875,
"learning_rate": 0.0004740126529142526,
"loss": 5.2542,
"mean_token_accuracy": 0.17887276858091355,
"num_tokens": 35473310.0,
"step": 19235
},
{
"entropy": 5.564691352844238,
"epoch": 1.6164251207729468,
"grad_norm": 1.1875,
"learning_rate": 0.0004739986799213915,
"loss": 5.3622,
"mean_token_accuracy": 0.17988502979278564,
"num_tokens": 35483502.0,
"step": 19240
},
{
"entropy": 5.615821075439453,
"epoch": 1.6168452005881118,
"grad_norm": 1.3125,
"learning_rate": 0.0004739847034033713,
"loss": 5.3301,
"mean_token_accuracy": 0.17195536941289902,
"num_tokens": 35493063.0,
"step": 19245
},
{
"entropy": 5.576445436477661,
"epoch": 1.6172652804032768,
"grad_norm": 1.4296875,
"learning_rate": 0.00047397072336043957,
"loss": 5.2995,
"mean_token_accuracy": 0.17133233398199083,
"num_tokens": 35501829.0,
"step": 19250
},
{
"entropy": 5.622234344482422,
"epoch": 1.6176853602184416,
"grad_norm": 1.25,
"learning_rate": 0.00047395673979284383,
"loss": 5.3043,
"mean_token_accuracy": 0.16928572207689285,
"num_tokens": 35510411.0,
"step": 19255
},
{
"entropy": 5.614874029159546,
"epoch": 1.6181054400336063,
"grad_norm": 1.1953125,
"learning_rate": 0.000473942752700832,
"loss": 5.3216,
"mean_token_accuracy": 0.17303272932767869,
"num_tokens": 35519571.0,
"step": 19260
},
{
"entropy": 5.539082002639771,
"epoch": 1.6185255198487711,
"grad_norm": 1.3359375,
"learning_rate": 0.00047392876208465166,
"loss": 5.2757,
"mean_token_accuracy": 0.1720677822828293,
"num_tokens": 35527306.0,
"step": 19265
},
{
"entropy": 5.591310262680054,
"epoch": 1.6189455996639361,
"grad_norm": 1.359375,
"learning_rate": 0.0004739147679445508,
"loss": 5.2801,
"mean_token_accuracy": 0.1705125540494919,
"num_tokens": 35536126.0,
"step": 19270
},
{
"entropy": 5.619841194152832,
"epoch": 1.6193656794791011,
"grad_norm": 1.25,
"learning_rate": 0.0004739007702807773,
"loss": 5.3384,
"mean_token_accuracy": 0.1726538673043251,
"num_tokens": 35545593.0,
"step": 19275
},
{
"entropy": 5.500962829589843,
"epoch": 1.619785759294266,
"grad_norm": 1.3828125,
"learning_rate": 0.00047388676909357894,
"loss": 5.2674,
"mean_token_accuracy": 0.17220294028520583,
"num_tokens": 35554780.0,
"step": 19280
},
{
"entropy": 5.585822010040284,
"epoch": 1.6202058391094307,
"grad_norm": 1.296875,
"learning_rate": 0.00047387276438320394,
"loss": 5.258,
"mean_token_accuracy": 0.18452005684375763,
"num_tokens": 35562982.0,
"step": 19285
},
{
"entropy": 5.6263950824737545,
"epoch": 1.6206259189245955,
"grad_norm": 1.40625,
"learning_rate": 0.0004738587561499003,
"loss": 5.3627,
"mean_token_accuracy": 0.175518299639225,
"num_tokens": 35571528.0,
"step": 19290
},
{
"entropy": 5.4953423023223875,
"epoch": 1.6210459987397605,
"grad_norm": 1.234375,
"learning_rate": 0.00047384474439391615,
"loss": 5.2288,
"mean_token_accuracy": 0.18026942163705825,
"num_tokens": 35580386.0,
"step": 19295
},
{
"entropy": 5.542523384094238,
"epoch": 1.6214660785549255,
"grad_norm": 1.1484375,
"learning_rate": 0.0004738307291154998,
"loss": 5.1939,
"mean_token_accuracy": 0.17714485973119737,
"num_tokens": 35589456.0,
"step": 19300
},
{
"entropy": 5.5750566005706785,
"epoch": 1.6218861583700903,
"grad_norm": 1.28125,
"learning_rate": 0.0004738167103148995,
"loss": 5.288,
"mean_token_accuracy": 0.1765538990497589,
"num_tokens": 35598116.0,
"step": 19305
},
{
"entropy": 5.570009708404541,
"epoch": 1.622306238185255,
"grad_norm": 1.40625,
"learning_rate": 0.00047380268799236355,
"loss": 5.2793,
"mean_token_accuracy": 0.17032337933778763,
"num_tokens": 35606481.0,
"step": 19310
},
{
"entropy": 5.5349977016448975,
"epoch": 1.62272631800042,
"grad_norm": 1.3359375,
"learning_rate": 0.00047378866214814024,
"loss": 5.2616,
"mean_token_accuracy": 0.17349669337272644,
"num_tokens": 35615517.0,
"step": 19315
},
{
"entropy": 5.544026136398315,
"epoch": 1.6231463978155851,
"grad_norm": 1.25,
"learning_rate": 0.00047377463278247827,
"loss": 5.3033,
"mean_token_accuracy": 0.17272030413150788,
"num_tokens": 35625100.0,
"step": 19320
},
{
"entropy": 5.576516628265381,
"epoch": 1.62356647763075,
"grad_norm": 1.2890625,
"learning_rate": 0.000473760599895626,
"loss": 5.2418,
"mean_token_accuracy": 0.1717881292104721,
"num_tokens": 35634572.0,
"step": 19325
},
{
"entropy": 5.574368715286255,
"epoch": 1.6239865574459147,
"grad_norm": 1.21875,
"learning_rate": 0.000473746563487832,
"loss": 5.2562,
"mean_token_accuracy": 0.17792798280715943,
"num_tokens": 35643883.0,
"step": 19330
},
{
"entropy": 5.590154838562012,
"epoch": 1.6244066372610795,
"grad_norm": 1.375,
"learning_rate": 0.00047373252355934506,
"loss": 5.3326,
"mean_token_accuracy": 0.1750644028186798,
"num_tokens": 35652527.0,
"step": 19335
},
{
"entropy": 5.654927921295166,
"epoch": 1.6248267170762445,
"grad_norm": 1.265625,
"learning_rate": 0.00047371848011041375,
"loss": 5.3769,
"mean_token_accuracy": 0.17651614248752595,
"num_tokens": 35662436.0,
"step": 19340
},
{
"entropy": 5.590336799621582,
"epoch": 1.6252467968914095,
"grad_norm": 1.2734375,
"learning_rate": 0.00047370443314128687,
"loss": 5.2678,
"mean_token_accuracy": 0.17819317132234574,
"num_tokens": 35672302.0,
"step": 19345
},
{
"entropy": 5.572092580795288,
"epoch": 1.6256668767065743,
"grad_norm": 1.3359375,
"learning_rate": 0.0004736903826522132,
"loss": 5.3118,
"mean_token_accuracy": 0.17263237088918687,
"num_tokens": 35680852.0,
"step": 19350
},
{
"entropy": 5.5830944061279295,
"epoch": 1.626086956521739,
"grad_norm": 1.3125,
"learning_rate": 0.0004736763286434419,
"loss": 5.2955,
"mean_token_accuracy": 0.1772119805216789,
"num_tokens": 35690159.0,
"step": 19355
},
{
"entropy": 5.5720783233642575,
"epoch": 1.6265070363369039,
"grad_norm": 1.375,
"learning_rate": 0.0004736622711152216,
"loss": 5.2289,
"mean_token_accuracy": 0.17568332105875015,
"num_tokens": 35699165.0,
"step": 19360
},
{
"entropy": 5.540960597991943,
"epoch": 1.6269271161520689,
"grad_norm": 1.2734375,
"learning_rate": 0.0004736482100678015,
"loss": 5.2909,
"mean_token_accuracy": 0.17809885442256929,
"num_tokens": 35708910.0,
"step": 19365
},
{
"entropy": 5.63184494972229,
"epoch": 1.6273471959672339,
"grad_norm": 1.328125,
"learning_rate": 0.00047363414550143063,
"loss": 5.3671,
"mean_token_accuracy": 0.16394011229276656,
"num_tokens": 35718218.0,
"step": 19370
},
{
"entropy": 5.598116683959961,
"epoch": 1.6277672757823987,
"grad_norm": 1.3671875,
"learning_rate": 0.00047362007741635816,
"loss": 5.2981,
"mean_token_accuracy": 0.17405816316604614,
"num_tokens": 35727076.0,
"step": 19375
},
{
"entropy": 5.575759172439575,
"epoch": 1.6281873555975634,
"grad_norm": 1.453125,
"learning_rate": 0.0004736060058128333,
"loss": 5.3938,
"mean_token_accuracy": 0.17170560359954834,
"num_tokens": 35736316.0,
"step": 19380
},
{
"entropy": 5.600625896453858,
"epoch": 1.6286074354127285,
"grad_norm": 1.2734375,
"learning_rate": 0.00047359193069110533,
"loss": 5.3474,
"mean_token_accuracy": 0.17974155247211457,
"num_tokens": 35745747.0,
"step": 19385
},
{
"entropy": 5.689561557769776,
"epoch": 1.6290275152278935,
"grad_norm": 1.3203125,
"learning_rate": 0.00047357785205142354,
"loss": 5.3066,
"mean_token_accuracy": 0.17518018782138825,
"num_tokens": 35754825.0,
"step": 19390
},
{
"entropy": 5.501223421096801,
"epoch": 1.6294475950430583,
"grad_norm": 1.2109375,
"learning_rate": 0.0004735637698940374,
"loss": 5.2698,
"mean_token_accuracy": 0.17844593971967698,
"num_tokens": 35764504.0,
"step": 19395
},
{
"entropy": 5.632344722747803,
"epoch": 1.629867674858223,
"grad_norm": 1.34375,
"learning_rate": 0.0004735496842191963,
"loss": 5.3496,
"mean_token_accuracy": 0.17488671094179153,
"num_tokens": 35774195.0,
"step": 19400
},
{
"entropy": 5.541965532302856,
"epoch": 1.6302877546733878,
"grad_norm": 1.3515625,
"learning_rate": 0.00047353559502714976,
"loss": 5.2302,
"mean_token_accuracy": 0.1812527135014534,
"num_tokens": 35783721.0,
"step": 19405
},
{
"entropy": 5.5491180419921875,
"epoch": 1.6307078344885528,
"grad_norm": 1.6796875,
"learning_rate": 0.0004735215023181474,
"loss": 5.2994,
"mean_token_accuracy": 0.17203922867774962,
"num_tokens": 35792821.0,
"step": 19410
},
{
"entropy": 5.589169359207153,
"epoch": 1.6311279143037178,
"grad_norm": 1.3515625,
"learning_rate": 0.00047350740609243883,
"loss": 5.3409,
"mean_token_accuracy": 0.17211707830429077,
"num_tokens": 35802746.0,
"step": 19415
},
{
"entropy": 5.66703782081604,
"epoch": 1.6315479941188826,
"grad_norm": 1.34375,
"learning_rate": 0.0004734933063502738,
"loss": 5.3423,
"mean_token_accuracy": 0.17652736753225326,
"num_tokens": 35811196.0,
"step": 19420
},
{
"entropy": 5.722614717483521,
"epoch": 1.6319680739340474,
"grad_norm": 1.4375,
"learning_rate": 0.00047347920309190203,
"loss": 5.3722,
"mean_token_accuracy": 0.16953300535678864,
"num_tokens": 35820787.0,
"step": 19425
},
{
"entropy": 5.5699670791625975,
"epoch": 1.6323881537492122,
"grad_norm": 1.5625,
"learning_rate": 0.0004734650963175734,
"loss": 5.3389,
"mean_token_accuracy": 0.16996063143014908,
"num_tokens": 35831247.0,
"step": 19430
},
{
"entropy": 5.552160263061523,
"epoch": 1.6328082335643772,
"grad_norm": 1.3046875,
"learning_rate": 0.00047345098602753777,
"loss": 5.3845,
"mean_token_accuracy": 0.17247342467308044,
"num_tokens": 35840759.0,
"step": 19435
},
{
"entropy": 5.582460689544678,
"epoch": 1.6332283133795422,
"grad_norm": 1.328125,
"learning_rate": 0.0004734368722220451,
"loss": 5.3205,
"mean_token_accuracy": 0.17218994945287705,
"num_tokens": 35850137.0,
"step": 19440
},
{
"entropy": 5.5876445293426515,
"epoch": 1.633648393194707,
"grad_norm": 1.234375,
"learning_rate": 0.0004734227549013455,
"loss": 5.1614,
"mean_token_accuracy": 0.18389954864978791,
"num_tokens": 35858412.0,
"step": 19445
},
{
"entropy": 5.522119998931885,
"epoch": 1.6340684730098718,
"grad_norm": 1.3125,
"learning_rate": 0.0004734086340656889,
"loss": 5.2462,
"mean_token_accuracy": 0.17678166925907135,
"num_tokens": 35868202.0,
"step": 19450
},
{
"entropy": 5.518358850479126,
"epoch": 1.6344885528250368,
"grad_norm": 1.3125,
"learning_rate": 0.0004733945097153255,
"loss": 5.3094,
"mean_token_accuracy": 0.17758093625307084,
"num_tokens": 35877237.0,
"step": 19455
},
{
"entropy": 5.486013126373291,
"epoch": 1.6349086326402016,
"grad_norm": 1.2421875,
"learning_rate": 0.0004733803818505055,
"loss": 5.1676,
"mean_token_accuracy": 0.18571697473526,
"num_tokens": 35887016.0,
"step": 19460
},
{
"entropy": 5.5516619205474855,
"epoch": 1.6353287124553666,
"grad_norm": 1.296875,
"learning_rate": 0.00047336625047147924,
"loss": 5.2658,
"mean_token_accuracy": 0.179465851187706,
"num_tokens": 35896393.0,
"step": 19465
},
{
"entropy": 5.532120656967163,
"epoch": 1.6357487922705314,
"grad_norm": 1.3828125,
"learning_rate": 0.00047335211557849693,
"loss": 5.3006,
"mean_token_accuracy": 0.17451370805501937,
"num_tokens": 35905237.0,
"step": 19470
},
{
"entropy": 5.5930602073669435,
"epoch": 1.6361688720856962,
"grad_norm": 1.3359375,
"learning_rate": 0.0004733379771718092,
"loss": 5.3274,
"mean_token_accuracy": 0.17903787642717361,
"num_tokens": 35914352.0,
"step": 19475
},
{
"entropy": 5.624740314483643,
"epoch": 1.6365889519008612,
"grad_norm": 1.7109375,
"learning_rate": 0.0004733238352516661,
"loss": 5.3845,
"mean_token_accuracy": 0.17454177141189575,
"num_tokens": 35923785.0,
"step": 19480
},
{
"entropy": 5.66873574256897,
"epoch": 1.6370090317160262,
"grad_norm": 1.421875,
"learning_rate": 0.00047330968981831856,
"loss": 5.3165,
"mean_token_accuracy": 0.17200869023799897,
"num_tokens": 35932495.0,
"step": 19485
},
{
"entropy": 5.630843830108643,
"epoch": 1.637429111531191,
"grad_norm": 1.4765625,
"learning_rate": 0.00047329554087201687,
"loss": 5.2929,
"mean_token_accuracy": 0.18362895995378495,
"num_tokens": 35941745.0,
"step": 19490
},
{
"entropy": 5.639635515213013,
"epoch": 1.6378491913463558,
"grad_norm": 1.296875,
"learning_rate": 0.00047328138841301186,
"loss": 5.3571,
"mean_token_accuracy": 0.17077856808900832,
"num_tokens": 35950281.0,
"step": 19495
},
{
"entropy": 5.554693174362183,
"epoch": 1.6382692711615205,
"grad_norm": 1.3203125,
"learning_rate": 0.0004732672324415541,
"loss": 5.287,
"mean_token_accuracy": 0.17891302108764648,
"num_tokens": 35959531.0,
"step": 19500
},
{
"entropy": 5.606399488449097,
"epoch": 1.6386893509766856,
"grad_norm": 1.3671875,
"learning_rate": 0.0004732530729578945,
"loss": 5.3535,
"mean_token_accuracy": 0.18230517357587814,
"num_tokens": 35969462.0,
"step": 19505
},
{
"entropy": 5.570599842071533,
"epoch": 1.6391094307918506,
"grad_norm": 1.5,
"learning_rate": 0.0004732389099622837,
"loss": 5.3135,
"mean_token_accuracy": 0.1722819149494171,
"num_tokens": 35978022.0,
"step": 19510
},
{
"entropy": 5.61481819152832,
"epoch": 1.6395295106070154,
"grad_norm": 1.28125,
"learning_rate": 0.00047322474345497267,
"loss": 5.3344,
"mean_token_accuracy": 0.1692362666130066,
"num_tokens": 35988193.0,
"step": 19515
},
{
"entropy": 5.669213199615479,
"epoch": 1.6399495904221801,
"grad_norm": 1.3125,
"learning_rate": 0.00047321057343621247,
"loss": 5.3388,
"mean_token_accuracy": 0.17151684165000916,
"num_tokens": 35997404.0,
"step": 19520
},
{
"entropy": 5.542697858810425,
"epoch": 1.6403696702373451,
"grad_norm": 1.3359375,
"learning_rate": 0.00047319639990625395,
"loss": 5.2265,
"mean_token_accuracy": 0.18443240821361542,
"num_tokens": 36005356.0,
"step": 19525
},
{
"entropy": 5.658288383483887,
"epoch": 1.64078975005251,
"grad_norm": 1.5078125,
"learning_rate": 0.00047318222286534824,
"loss": 5.4798,
"mean_token_accuracy": 0.16460684090852737,
"num_tokens": 36015305.0,
"step": 19530
},
{
"entropy": 5.6122137069702145,
"epoch": 1.641209829867675,
"grad_norm": 1.2421875,
"learning_rate": 0.00047316804231374663,
"loss": 5.3298,
"mean_token_accuracy": 0.17140717953443527,
"num_tokens": 36024278.0,
"step": 19535
},
{
"entropy": 5.553801393508911,
"epoch": 1.6416299096828397,
"grad_norm": 1.484375,
"learning_rate": 0.0004731538582517001,
"loss": 5.1628,
"mean_token_accuracy": 0.18298932164907455,
"num_tokens": 36032870.0,
"step": 19540
},
{
"entropy": 5.5007366180419925,
"epoch": 1.6420499894980045,
"grad_norm": 1.34375,
"learning_rate": 0.00047313967067945996,
"loss": 5.1964,
"mean_token_accuracy": 0.1870368778705597,
"num_tokens": 36041725.0,
"step": 19545
},
{
"entropy": 5.588674545288086,
"epoch": 1.6424700693131695,
"grad_norm": 1.34375,
"learning_rate": 0.0004731254795972777,
"loss": 5.3534,
"mean_token_accuracy": 0.17318006604909897,
"num_tokens": 36050929.0,
"step": 19550
},
{
"entropy": 5.618828821182251,
"epoch": 1.6428901491283345,
"grad_norm": 1.328125,
"learning_rate": 0.0004731112850054045,
"loss": 5.3224,
"mean_token_accuracy": 0.17352661937475206,
"num_tokens": 36060059.0,
"step": 19555
},
{
"entropy": 5.575858783721924,
"epoch": 1.6433102289434993,
"grad_norm": 1.4765625,
"learning_rate": 0.0004730970869040919,
"loss": 5.2627,
"mean_token_accuracy": 0.18821395933628082,
"num_tokens": 36069445.0,
"step": 19560
},
{
"entropy": 5.606625556945801,
"epoch": 1.643730308758664,
"grad_norm": 1.3671875,
"learning_rate": 0.00047308288529359147,
"loss": 5.396,
"mean_token_accuracy": 0.17273110002279282,
"num_tokens": 36079129.0,
"step": 19565
},
{
"entropy": 5.589566802978515,
"epoch": 1.644150388573829,
"grad_norm": 1.265625,
"learning_rate": 0.0004730686801741547,
"loss": 5.268,
"mean_token_accuracy": 0.178928379714489,
"num_tokens": 36088320.0,
"step": 19570
},
{
"entropy": 5.555714559555054,
"epoch": 1.644570468388994,
"grad_norm": 1.3984375,
"learning_rate": 0.0004730544715460332,
"loss": 5.3339,
"mean_token_accuracy": 0.1806238815188408,
"num_tokens": 36097728.0,
"step": 19575
},
{
"entropy": 5.637714052200318,
"epoch": 1.644990548204159,
"grad_norm": 1.3359375,
"learning_rate": 0.00047304025940947875,
"loss": 5.3425,
"mean_token_accuracy": 0.178213070333004,
"num_tokens": 36106566.0,
"step": 19580
},
{
"entropy": 5.642990064620972,
"epoch": 1.6454106280193237,
"grad_norm": 1.2578125,
"learning_rate": 0.00047302604376474306,
"loss": 5.2912,
"mean_token_accuracy": 0.171270252764225,
"num_tokens": 36115475.0,
"step": 19585
},
{
"entropy": 5.559804630279541,
"epoch": 1.6458307078344885,
"grad_norm": 1.2734375,
"learning_rate": 0.00047301182461207807,
"loss": 5.3765,
"mean_token_accuracy": 0.17772205471992492,
"num_tokens": 36124404.0,
"step": 19590
},
{
"entropy": 5.528610134124756,
"epoch": 1.6462507876496533,
"grad_norm": 1.3671875,
"learning_rate": 0.00047299760195173554,
"loss": 5.2409,
"mean_token_accuracy": 0.17628025263547897,
"num_tokens": 36132987.0,
"step": 19595
},
{
"entropy": 5.58651819229126,
"epoch": 1.6466708674648183,
"grad_norm": 1.2265625,
"learning_rate": 0.0004729833757839673,
"loss": 5.3991,
"mean_token_accuracy": 0.17285286337137223,
"num_tokens": 36142163.0,
"step": 19600
},
{
"entropy": 5.635810804367066,
"epoch": 1.6470909472799833,
"grad_norm": 1.234375,
"learning_rate": 0.00047296914610902565,
"loss": 5.3433,
"mean_token_accuracy": 0.1677408069372177,
"num_tokens": 36152561.0,
"step": 19605
},
{
"entropy": 5.609593343734741,
"epoch": 1.647511027095148,
"grad_norm": 1.515625,
"learning_rate": 0.00047295491292716245,
"loss": 5.2585,
"mean_token_accuracy": 0.17212305217981339,
"num_tokens": 36161877.0,
"step": 19610
},
{
"entropy": 5.561789560317993,
"epoch": 1.6479311069103129,
"grad_norm": 1.328125,
"learning_rate": 0.00047294067623862996,
"loss": 5.3294,
"mean_token_accuracy": 0.172064708173275,
"num_tokens": 36171523.0,
"step": 19615
},
{
"entropy": 5.565540170669555,
"epoch": 1.6483511867254779,
"grad_norm": 1.4765625,
"learning_rate": 0.00047292643604368025,
"loss": 5.2536,
"mean_token_accuracy": 0.1832684025168419,
"num_tokens": 36180339.0,
"step": 19620
},
{
"entropy": 5.606383323669434,
"epoch": 1.6487712665406429,
"grad_norm": 1.3125,
"learning_rate": 0.0004729121923425657,
"loss": 5.3595,
"mean_token_accuracy": 0.1696375221014023,
"num_tokens": 36191584.0,
"step": 19625
},
{
"entropy": 5.67185959815979,
"epoch": 1.6491913463558077,
"grad_norm": 1.328125,
"learning_rate": 0.0004728979451355385,
"loss": 5.3759,
"mean_token_accuracy": 0.1815030872821808,
"num_tokens": 36200738.0,
"step": 19630
},
{
"entropy": 5.522432422637939,
"epoch": 1.6496114261709725,
"grad_norm": 1.2890625,
"learning_rate": 0.00047288369442285115,
"loss": 5.19,
"mean_token_accuracy": 0.18398086577653885,
"num_tokens": 36209394.0,
"step": 19635
},
{
"entropy": 5.555903387069702,
"epoch": 1.6500315059861372,
"grad_norm": 1.3359375,
"learning_rate": 0.00047286944020475606,
"loss": 5.3261,
"mean_token_accuracy": 0.1730799287557602,
"num_tokens": 36218268.0,
"step": 19640
},
{
"entropy": 5.579511833190918,
"epoch": 1.6504515858013022,
"grad_norm": 1.4765625,
"learning_rate": 0.0004728551824815057,
"loss": 5.2744,
"mean_token_accuracy": 0.1794183671474457,
"num_tokens": 36226974.0,
"step": 19645
},
{
"entropy": 5.4848850727081295,
"epoch": 1.6508716656164673,
"grad_norm": 1.3203125,
"learning_rate": 0.00047284092125335277,
"loss": 5.2216,
"mean_token_accuracy": 0.1821077972650528,
"num_tokens": 36235892.0,
"step": 19650
},
{
"entropy": 5.534023332595825,
"epoch": 1.651291745431632,
"grad_norm": 1.3828125,
"learning_rate": 0.0004728266565205497,
"loss": 5.2477,
"mean_token_accuracy": 0.18161226361989974,
"num_tokens": 36244750.0,
"step": 19655
},
{
"entropy": 5.556950473785401,
"epoch": 1.6517118252467968,
"grad_norm": 1.515625,
"learning_rate": 0.00047281238828334924,
"loss": 5.2981,
"mean_token_accuracy": 0.17662404030561446,
"num_tokens": 36254902.0,
"step": 19660
},
{
"entropy": 5.625682592391968,
"epoch": 1.6521319050619616,
"grad_norm": 1.2578125,
"learning_rate": 0.0004727981165420042,
"loss": 5.3379,
"mean_token_accuracy": 0.17301919758319856,
"num_tokens": 36265546.0,
"step": 19665
},
{
"entropy": 5.521275472640991,
"epoch": 1.6525519848771266,
"grad_norm": 1.4140625,
"learning_rate": 0.0004727838412967674,
"loss": 5.2467,
"mean_token_accuracy": 0.17551186680793762,
"num_tokens": 36273978.0,
"step": 19670
},
{
"entropy": 5.627513122558594,
"epoch": 1.6529720646922916,
"grad_norm": 1.328125,
"learning_rate": 0.0004727695625478917,
"loss": 5.3007,
"mean_token_accuracy": 0.17185374349355698,
"num_tokens": 36283117.0,
"step": 19675
},
{
"entropy": 5.6354491233825685,
"epoch": 1.6533921445074564,
"grad_norm": 1.25,
"learning_rate": 0.00047275528029562996,
"loss": 5.2789,
"mean_token_accuracy": 0.1717686489224434,
"num_tokens": 36293031.0,
"step": 19680
},
{
"entropy": 5.514062976837158,
"epoch": 1.6538122243226212,
"grad_norm": 1.265625,
"learning_rate": 0.00047274099454023535,
"loss": 5.2867,
"mean_token_accuracy": 0.18071516007184982,
"num_tokens": 36302080.0,
"step": 19685
},
{
"entropy": 5.553048515319825,
"epoch": 1.6542323041377862,
"grad_norm": 1.3203125,
"learning_rate": 0.00047272670528196084,
"loss": 5.2943,
"mean_token_accuracy": 0.17276879847049714,
"num_tokens": 36311077.0,
"step": 19690
},
{
"entropy": 5.545906114578247,
"epoch": 1.6546523839529512,
"grad_norm": 1.265625,
"learning_rate": 0.0004727124125210595,
"loss": 5.2499,
"mean_token_accuracy": 0.17723502367734909,
"num_tokens": 36320300.0,
"step": 19695
},
{
"entropy": 5.53257122039795,
"epoch": 1.655072463768116,
"grad_norm": 1.3984375,
"learning_rate": 0.00047269811625778456,
"loss": 5.2959,
"mean_token_accuracy": 0.18281998485326767,
"num_tokens": 36330184.0,
"step": 19700
},
{
"entropy": 5.548378133773804,
"epoch": 1.6554925435832808,
"grad_norm": 1.21875,
"learning_rate": 0.0004726838164923893,
"loss": 5.3039,
"mean_token_accuracy": 0.17259059399366378,
"num_tokens": 36339526.0,
"step": 19705
},
{
"entropy": 5.552331829071045,
"epoch": 1.6559126233984456,
"grad_norm": 1.5625,
"learning_rate": 0.00047266951322512716,
"loss": 5.297,
"mean_token_accuracy": 0.1746842920780182,
"num_tokens": 36348849.0,
"step": 19710
},
{
"entropy": 5.58737998008728,
"epoch": 1.6563327032136106,
"grad_norm": 1.203125,
"learning_rate": 0.00047265520645625123,
"loss": 5.2681,
"mean_token_accuracy": 0.1709459751844406,
"num_tokens": 36358924.0,
"step": 19715
},
{
"entropy": 5.642772150039673,
"epoch": 1.6567527830287756,
"grad_norm": 1.5390625,
"learning_rate": 0.00047264089618601513,
"loss": 5.334,
"mean_token_accuracy": 0.17573875635862352,
"num_tokens": 36367130.0,
"step": 19720
},
{
"entropy": 5.5366904735565186,
"epoch": 1.6571728628439404,
"grad_norm": 1.34375,
"learning_rate": 0.0004726265824146724,
"loss": 5.2677,
"mean_token_accuracy": 0.17453360855579375,
"num_tokens": 36376575.0,
"step": 19725
},
{
"entropy": 5.491440296173096,
"epoch": 1.6575929426591052,
"grad_norm": 1.328125,
"learning_rate": 0.0004726122651424764,
"loss": 5.2142,
"mean_token_accuracy": 0.18081178665161132,
"num_tokens": 36385010.0,
"step": 19730
},
{
"entropy": 5.489852619171143,
"epoch": 1.65801302247427,
"grad_norm": 1.359375,
"learning_rate": 0.000472597944369681,
"loss": 5.0511,
"mean_token_accuracy": 0.1849038854241371,
"num_tokens": 36393574.0,
"step": 19735
},
{
"entropy": 5.526522397994995,
"epoch": 1.658433102289435,
"grad_norm": 1.1953125,
"learning_rate": 0.00047258362009653965,
"loss": 5.2319,
"mean_token_accuracy": 0.1859105870127678,
"num_tokens": 36401992.0,
"step": 19740
},
{
"entropy": 5.5713855743408205,
"epoch": 1.6588531821046,
"grad_norm": 1.2890625,
"learning_rate": 0.00047256929232330624,
"loss": 5.381,
"mean_token_accuracy": 0.1668798580765724,
"num_tokens": 36411712.0,
"step": 19745
},
{
"entropy": 5.5372272491455075,
"epoch": 1.6592732619197648,
"grad_norm": 1.3671875,
"learning_rate": 0.0004725549610502346,
"loss": 5.2014,
"mean_token_accuracy": 0.17726444751024245,
"num_tokens": 36420240.0,
"step": 19750
},
{
"entropy": 5.578176116943359,
"epoch": 1.6596933417349296,
"grad_norm": 1.3125,
"learning_rate": 0.00047254062627757854,
"loss": 5.3171,
"mean_token_accuracy": 0.18047765791416168,
"num_tokens": 36430068.0,
"step": 19755
},
{
"entropy": 5.639891910552978,
"epoch": 1.6601134215500946,
"grad_norm": 1.2265625,
"learning_rate": 0.000472526288005592,
"loss": 5.3582,
"mean_token_accuracy": 0.17145186811685562,
"num_tokens": 36439808.0,
"step": 19760
},
{
"entropy": 5.517648983001709,
"epoch": 1.6605335013652593,
"grad_norm": 1.2421875,
"learning_rate": 0.000472511946234529,
"loss": 5.2892,
"mean_token_accuracy": 0.1773042768239975,
"num_tokens": 36449609.0,
"step": 19765
},
{
"entropy": 5.637032604217529,
"epoch": 1.6609535811804244,
"grad_norm": 1.3125,
"learning_rate": 0.0004724976009646435,
"loss": 5.2717,
"mean_token_accuracy": 0.1799396902322769,
"num_tokens": 36457700.0,
"step": 19770
},
{
"entropy": 5.614208698272705,
"epoch": 1.6613736609955891,
"grad_norm": 1.5625,
"learning_rate": 0.0004724832521961897,
"loss": 5.3187,
"mean_token_accuracy": 0.1730481743812561,
"num_tokens": 36466881.0,
"step": 19775
},
{
"entropy": 5.630982112884522,
"epoch": 1.661793740810754,
"grad_norm": 1.4765625,
"learning_rate": 0.00047246889992942187,
"loss": 5.4136,
"mean_token_accuracy": 0.16720378249883652,
"num_tokens": 36475433.0,
"step": 19780
},
{
"entropy": 5.568074941635132,
"epoch": 1.662213820625919,
"grad_norm": 1.390625,
"learning_rate": 0.0004724545441645941,
"loss": 5.3301,
"mean_token_accuracy": 0.16874033063650132,
"num_tokens": 36484232.0,
"step": 19785
},
{
"entropy": 5.610231065750122,
"epoch": 1.662633900441084,
"grad_norm": 1.5,
"learning_rate": 0.0004724401849019608,
"loss": 5.4344,
"mean_token_accuracy": 0.16717463582754136,
"num_tokens": 36493588.0,
"step": 19790
},
{
"entropy": 5.552998113632202,
"epoch": 1.6630539802562487,
"grad_norm": 1.390625,
"learning_rate": 0.00047242582214177616,
"loss": 5.2308,
"mean_token_accuracy": 0.17591196298599243,
"num_tokens": 36502289.0,
"step": 19795
},
{
"entropy": 5.60495924949646,
"epoch": 1.6634740600714135,
"grad_norm": 1.3125,
"learning_rate": 0.00047241145588429483,
"loss": 5.3528,
"mean_token_accuracy": 0.1722363129258156,
"num_tokens": 36511978.0,
"step": 19800
},
{
"entropy": 5.644860696792603,
"epoch": 1.6638941398865783,
"grad_norm": 1.3046875,
"learning_rate": 0.0004723970861297712,
"loss": 5.3276,
"mean_token_accuracy": 0.17868301123380662,
"num_tokens": 36520378.0,
"step": 19805
},
{
"entropy": 5.599815368652344,
"epoch": 1.6643142197017433,
"grad_norm": 1.2890625,
"learning_rate": 0.0004723827128784599,
"loss": 5.3087,
"mean_token_accuracy": 0.17410200834274292,
"num_tokens": 36529965.0,
"step": 19810
},
{
"entropy": 5.692455196380616,
"epoch": 1.6647342995169083,
"grad_norm": 1.2421875,
"learning_rate": 0.00047236833613061534,
"loss": 5.3324,
"mean_token_accuracy": 0.17423984706401824,
"num_tokens": 36539394.0,
"step": 19815
},
{
"entropy": 5.58489580154419,
"epoch": 1.665154379332073,
"grad_norm": 1.3828125,
"learning_rate": 0.0004723539558864925,
"loss": 5.3821,
"mean_token_accuracy": 0.1736712023615837,
"num_tokens": 36548608.0,
"step": 19820
},
{
"entropy": 5.583039379119873,
"epoch": 1.665574459147238,
"grad_norm": 1.3046875,
"learning_rate": 0.0004723395721463459,
"loss": 5.2538,
"mean_token_accuracy": 0.17591053247451782,
"num_tokens": 36557736.0,
"step": 19825
},
{
"entropy": 5.619968509674072,
"epoch": 1.665994538962403,
"grad_norm": 1.2578125,
"learning_rate": 0.0004723251849104303,
"loss": 5.2772,
"mean_token_accuracy": 0.17381040155887603,
"num_tokens": 36566745.0,
"step": 19830
},
{
"entropy": 5.5246253490448,
"epoch": 1.6664146187775677,
"grad_norm": 1.203125,
"learning_rate": 0.00047231079417900076,
"loss": 5.2421,
"mean_token_accuracy": 0.1757187470793724,
"num_tokens": 36575956.0,
"step": 19835
},
{
"entropy": 5.5226061820983885,
"epoch": 1.6668346985927327,
"grad_norm": 1.4296875,
"learning_rate": 0.000472296399952312,
"loss": 5.2672,
"mean_token_accuracy": 0.1780347302556038,
"num_tokens": 36584673.0,
"step": 19840
},
{
"entropy": 5.588808822631836,
"epoch": 1.6672547784078975,
"grad_norm": 1.1953125,
"learning_rate": 0.0004722820022306192,
"loss": 5.3226,
"mean_token_accuracy": 0.17701348811388015,
"num_tokens": 36593758.0,
"step": 19845
},
{
"entropy": 5.523955154418945,
"epoch": 1.6676748582230623,
"grad_norm": 1.359375,
"learning_rate": 0.0004722676010141773,
"loss": 5.1979,
"mean_token_accuracy": 0.17654271572828292,
"num_tokens": 36603722.0,
"step": 19850
},
{
"entropy": 5.532243824005127,
"epoch": 1.6680949380382273,
"grad_norm": 1.640625,
"learning_rate": 0.00047225319630324136,
"loss": 5.2628,
"mean_token_accuracy": 0.17817289084196092,
"num_tokens": 36612478.0,
"step": 19855
},
{
"entropy": 5.583906650543213,
"epoch": 1.6685150178533923,
"grad_norm": 1.5546875,
"learning_rate": 0.0004722387880980667,
"loss": 5.4511,
"mean_token_accuracy": 0.16772886663675307,
"num_tokens": 36622399.0,
"step": 19860
},
{
"entropy": 5.604948663711548,
"epoch": 1.668935097668557,
"grad_norm": 1.484375,
"learning_rate": 0.00047222437639890844,
"loss": 5.2647,
"mean_token_accuracy": 0.17361174523830414,
"num_tokens": 36631798.0,
"step": 19865
},
{
"entropy": 5.50488133430481,
"epoch": 1.6693551774837219,
"grad_norm": 1.3671875,
"learning_rate": 0.00047220996120602197,
"loss": 5.283,
"mean_token_accuracy": 0.1793058469891548,
"num_tokens": 36640405.0,
"step": 19870
},
{
"entropy": 5.6831395626068115,
"epoch": 1.6697752572988867,
"grad_norm": 1.390625,
"learning_rate": 0.00047219554251966246,
"loss": 5.4256,
"mean_token_accuracy": 0.16782815530896186,
"num_tokens": 36650209.0,
"step": 19875
},
{
"entropy": 5.692617177963257,
"epoch": 1.6701953371140517,
"grad_norm": 1.234375,
"learning_rate": 0.0004721811203400855,
"loss": 5.3761,
"mean_token_accuracy": 0.1702076569199562,
"num_tokens": 36660248.0,
"step": 19880
},
{
"entropy": 5.591552972793579,
"epoch": 1.6706154169292167,
"grad_norm": 1.3359375,
"learning_rate": 0.00047216669466754657,
"loss": 5.2822,
"mean_token_accuracy": 0.17628030925989152,
"num_tokens": 36669938.0,
"step": 19885
},
{
"entropy": 5.484194564819336,
"epoch": 1.6710354967443815,
"grad_norm": 1.1640625,
"learning_rate": 0.0004721522655023012,
"loss": 5.2908,
"mean_token_accuracy": 0.18230575174093247,
"num_tokens": 36679903.0,
"step": 19890
},
{
"entropy": 5.671662521362305,
"epoch": 1.6714555765595462,
"grad_norm": 1.3515625,
"learning_rate": 0.0004721378328446049,
"loss": 5.3772,
"mean_token_accuracy": 0.17432742416858674,
"num_tokens": 36688424.0,
"step": 19895
},
{
"entropy": 5.692489719390869,
"epoch": 1.6718756563747112,
"grad_norm": 1.25,
"learning_rate": 0.0004721233966947134,
"loss": 5.4233,
"mean_token_accuracy": 0.1713166430592537,
"num_tokens": 36698715.0,
"step": 19900
},
{
"entropy": 5.493922090530395,
"epoch": 1.672295736189876,
"grad_norm": 1.203125,
"learning_rate": 0.00047210895705288237,
"loss": 5.3038,
"mean_token_accuracy": 0.18740737736225127,
"num_tokens": 36708456.0,
"step": 19905
},
{
"entropy": 5.559077167510987,
"epoch": 1.672715816005041,
"grad_norm": 1.2734375,
"learning_rate": 0.0004720945139193678,
"loss": 5.3097,
"mean_token_accuracy": 0.17013535946607589,
"num_tokens": 36717596.0,
"step": 19910
},
{
"entropy": 5.677094984054565,
"epoch": 1.6731358958202058,
"grad_norm": 1.6484375,
"learning_rate": 0.0004720800672944253,
"loss": 5.3919,
"mean_token_accuracy": 0.17011301219463348,
"num_tokens": 36727092.0,
"step": 19915
},
{
"entropy": 5.571652269363403,
"epoch": 1.6735559756353706,
"grad_norm": 1.4140625,
"learning_rate": 0.0004720656171783109,
"loss": 5.1284,
"mean_token_accuracy": 0.1827874079346657,
"num_tokens": 36735910.0,
"step": 19920
},
{
"entropy": 5.490898704528808,
"epoch": 1.6739760554505356,
"grad_norm": 1.296875,
"learning_rate": 0.0004720511635712806,
"loss": 5.2429,
"mean_token_accuracy": 0.1775929644703865,
"num_tokens": 36745237.0,
"step": 19925
},
{
"entropy": 5.592001247406006,
"epoch": 1.6743961352657006,
"grad_norm": 1.4375,
"learning_rate": 0.00047203670647359035,
"loss": 5.3744,
"mean_token_accuracy": 0.17781300097703934,
"num_tokens": 36753603.0,
"step": 19930
},
{
"entropy": 5.700658750534058,
"epoch": 1.6748162150808654,
"grad_norm": 1.28125,
"learning_rate": 0.0004720222458854964,
"loss": 5.36,
"mean_token_accuracy": 0.17066025733947754,
"num_tokens": 36763010.0,
"step": 19935
},
{
"entropy": 5.611382150650025,
"epoch": 1.6752362948960302,
"grad_norm": 1.2578125,
"learning_rate": 0.00047200778180725477,
"loss": 5.2871,
"mean_token_accuracy": 0.17411105930805207,
"num_tokens": 36772156.0,
"step": 19940
},
{
"entropy": 5.527815341949463,
"epoch": 1.675656374711195,
"grad_norm": 1.3359375,
"learning_rate": 0.00047199331423912174,
"loss": 5.2042,
"mean_token_accuracy": 0.1800895169377327,
"num_tokens": 36781386.0,
"step": 19945
},
{
"entropy": 5.62794451713562,
"epoch": 1.67607645452636,
"grad_norm": 1.359375,
"learning_rate": 0.0004719788431813536,
"loss": 5.3474,
"mean_token_accuracy": 0.17705000042915345,
"num_tokens": 36790754.0,
"step": 19950
},
{
"entropy": 5.655123281478882,
"epoch": 1.676496534341525,
"grad_norm": 1.2734375,
"learning_rate": 0.0004719643686342066,
"loss": 5.3167,
"mean_token_accuracy": 0.17164090424776077,
"num_tokens": 36799623.0,
"step": 19955
},
{
"entropy": 5.469016408920288,
"epoch": 1.6769166141566898,
"grad_norm": 1.3125,
"learning_rate": 0.0004719498905979373,
"loss": 5.1338,
"mean_token_accuracy": 0.19023518413305282,
"num_tokens": 36808662.0,
"step": 19960
},
{
"entropy": 5.5914829730987545,
"epoch": 1.6773366939718546,
"grad_norm": 1.328125,
"learning_rate": 0.0004719354090728021,
"loss": 5.2594,
"mean_token_accuracy": 0.17879579663276673,
"num_tokens": 36817730.0,
"step": 19965
},
{
"entropy": 5.596966505050659,
"epoch": 1.6777567737870194,
"grad_norm": 1.328125,
"learning_rate": 0.00047192092405905743,
"loss": 5.247,
"mean_token_accuracy": 0.17462601512670517,
"num_tokens": 36827203.0,
"step": 19970
},
{
"entropy": 5.645619487762451,
"epoch": 1.6781768536021844,
"grad_norm": 1.359375,
"learning_rate": 0.0004719064355569601,
"loss": 5.4092,
"mean_token_accuracy": 0.172246952354908,
"num_tokens": 36836145.0,
"step": 19975
},
{
"entropy": 5.601606607437134,
"epoch": 1.6785969334173494,
"grad_norm": 1.5078125,
"learning_rate": 0.00047189194356676666,
"loss": 5.4148,
"mean_token_accuracy": 0.17356153726577758,
"num_tokens": 36845609.0,
"step": 19980
},
{
"entropy": 5.6484596729278564,
"epoch": 1.6790170132325142,
"grad_norm": 1.3359375,
"learning_rate": 0.00047187744808873386,
"loss": 5.419,
"mean_token_accuracy": 0.1756739318370819,
"num_tokens": 36855367.0,
"step": 19985
},
{
"entropy": 5.622177171707153,
"epoch": 1.679437093047679,
"grad_norm": 1.5078125,
"learning_rate": 0.00047186294912311835,
"loss": 5.3679,
"mean_token_accuracy": 0.16962105929851531,
"num_tokens": 36864808.0,
"step": 19990
},
{
"entropy": 5.573296022415161,
"epoch": 1.679857172862844,
"grad_norm": 1.5390625,
"learning_rate": 0.00047184844667017705,
"loss": 5.2335,
"mean_token_accuracy": 0.18165623694658278,
"num_tokens": 36873651.0,
"step": 19995
},
{
"entropy": 5.6069403171539305,
"epoch": 1.680277252678009,
"grad_norm": 1.4453125,
"learning_rate": 0.00047183394073016695,
"loss": 5.3776,
"mean_token_accuracy": 0.1699774906039238,
"num_tokens": 36883227.0,
"step": 20000
},
{
"entropy": 5.527088737487793,
"epoch": 1.6806973324931738,
"grad_norm": 1.328125,
"learning_rate": 0.00047181943130334493,
"loss": 5.1529,
"mean_token_accuracy": 0.1817635715007782,
"num_tokens": 36891628.0,
"step": 20005
},
{
"entropy": 5.551967620849609,
"epoch": 1.6811174123083386,
"grad_norm": 1.375,
"learning_rate": 0.000471804918389968,
"loss": 5.2833,
"mean_token_accuracy": 0.1710153639316559,
"num_tokens": 36901819.0,
"step": 20010
},
{
"entropy": 5.556211805343628,
"epoch": 1.6815374921235033,
"grad_norm": 1.1796875,
"learning_rate": 0.0004717904019902933,
"loss": 5.3187,
"mean_token_accuracy": 0.1734260395169258,
"num_tokens": 36911206.0,
"step": 20015
},
{
"entropy": 5.568561363220215,
"epoch": 1.6819575719386683,
"grad_norm": 1.46875,
"learning_rate": 0.000471775882104578,
"loss": 5.2649,
"mean_token_accuracy": 0.1752187117934227,
"num_tokens": 36920830.0,
"step": 20020
},
{
"entropy": 5.525005054473877,
"epoch": 1.6823776517538334,
"grad_norm": 1.2890625,
"learning_rate": 0.00047176135873307917,
"loss": 5.1763,
"mean_token_accuracy": 0.17966212928295136,
"num_tokens": 36929702.0,
"step": 20025
},
{
"entropy": 5.656943511962891,
"epoch": 1.6827977315689981,
"grad_norm": 1.3515625,
"learning_rate": 0.0004717468318760543,
"loss": 5.3931,
"mean_token_accuracy": 0.17449060827493668,
"num_tokens": 36938423.0,
"step": 20030
},
{
"entropy": 5.610716390609741,
"epoch": 1.683217811384163,
"grad_norm": 1.3046875,
"learning_rate": 0.00047173230153376057,
"loss": 5.3031,
"mean_token_accuracy": 0.17407506704330444,
"num_tokens": 36947198.0,
"step": 20035
},
{
"entropy": 5.59521484375,
"epoch": 1.6836378911993277,
"grad_norm": 1.3125,
"learning_rate": 0.0004717177677064554,
"loss": 5.2857,
"mean_token_accuracy": 0.17993054986000062,
"num_tokens": 36955636.0,
"step": 20040
},
{
"entropy": 5.499891138076782,
"epoch": 1.6840579710144927,
"grad_norm": 1.2421875,
"learning_rate": 0.00047170323039439634,
"loss": 5.2431,
"mean_token_accuracy": 0.17491396367549897,
"num_tokens": 36964463.0,
"step": 20045
},
{
"entropy": 5.640566730499268,
"epoch": 1.6844780508296577,
"grad_norm": 1.3125,
"learning_rate": 0.0004716886895978408,
"loss": 5.3732,
"mean_token_accuracy": 0.17605782449245452,
"num_tokens": 36974043.0,
"step": 20050
},
{
"entropy": 5.629862880706787,
"epoch": 1.6848981306448225,
"grad_norm": 1.4375,
"learning_rate": 0.00047167414531704637,
"loss": 5.2613,
"mean_token_accuracy": 0.1774591788649559,
"num_tokens": 36983856.0,
"step": 20055
},
{
"entropy": 5.542819356918335,
"epoch": 1.6853182104599873,
"grad_norm": 1.3203125,
"learning_rate": 0.00047165959755227077,
"loss": 5.2746,
"mean_token_accuracy": 0.18359435200691224,
"num_tokens": 36992664.0,
"step": 20060
},
{
"entropy": 5.55168399810791,
"epoch": 1.6857382902751523,
"grad_norm": 1.34375,
"learning_rate": 0.00047164504630377166,
"loss": 5.3354,
"mean_token_accuracy": 0.17987888902425767,
"num_tokens": 37001826.0,
"step": 20065
},
{
"entropy": 5.677304792404175,
"epoch": 1.6861583700903173,
"grad_norm": 1.2734375,
"learning_rate": 0.00047163049157180676,
"loss": 5.3578,
"mean_token_accuracy": 0.17599056512117386,
"num_tokens": 37010821.0,
"step": 20070
},
{
"entropy": 5.641095972061157,
"epoch": 1.6865784499054821,
"grad_norm": 1.5,
"learning_rate": 0.000471615933356634,
"loss": 5.4418,
"mean_token_accuracy": 0.1685781493782997,
"num_tokens": 37021293.0,
"step": 20075
},
{
"entropy": 5.575450658798218,
"epoch": 1.686998529720647,
"grad_norm": 1.3125,
"learning_rate": 0.0004716013716585112,
"loss": 5.2131,
"mean_token_accuracy": 0.18143570721149443,
"num_tokens": 37031063.0,
"step": 20080
},
{
"entropy": 5.511420679092407,
"epoch": 1.6874186095358117,
"grad_norm": 1.40625,
"learning_rate": 0.0004715868064776964,
"loss": 5.2901,
"mean_token_accuracy": 0.18405751883983612,
"num_tokens": 37040879.0,
"step": 20085
},
{
"entropy": 5.56152572631836,
"epoch": 1.6878386893509767,
"grad_norm": 1.328125,
"learning_rate": 0.0004715722378144474,
"loss": 5.183,
"mean_token_accuracy": 0.1834592640399933,
"num_tokens": 37049452.0,
"step": 20090
},
{
"entropy": 5.4955174922943115,
"epoch": 1.6882587691661417,
"grad_norm": 1.4453125,
"learning_rate": 0.0004715576656690225,
"loss": 5.1511,
"mean_token_accuracy": 0.18037759214639665,
"num_tokens": 37058010.0,
"step": 20095
},
{
"entropy": 5.564346027374268,
"epoch": 1.6886788489813065,
"grad_norm": 1.2734375,
"learning_rate": 0.00047154309004167984,
"loss": 5.3714,
"mean_token_accuracy": 0.16870234906673431,
"num_tokens": 37067580.0,
"step": 20100
},
{
"entropy": 5.538294458389283,
"epoch": 1.6890989287964713,
"grad_norm": 1.53125,
"learning_rate": 0.00047152851093267744,
"loss": 5.2496,
"mean_token_accuracy": 0.177216999232769,
"num_tokens": 37076584.0,
"step": 20105
},
{
"entropy": 5.5681194305419925,
"epoch": 1.689519008611636,
"grad_norm": 1.359375,
"learning_rate": 0.0004715139283422737,
"loss": 5.2715,
"mean_token_accuracy": 0.1719378724694252,
"num_tokens": 37086330.0,
"step": 20110
},
{
"entropy": 5.645929193496704,
"epoch": 1.689939088426801,
"grad_norm": 1.4921875,
"learning_rate": 0.000471499342270727,
"loss": 5.3357,
"mean_token_accuracy": 0.16819078028202056,
"num_tokens": 37096323.0,
"step": 20115
},
{
"entropy": 5.544728708267212,
"epoch": 1.690359168241966,
"grad_norm": 1.25,
"learning_rate": 0.00047148475271829556,
"loss": 5.2758,
"mean_token_accuracy": 0.17762979716062546,
"num_tokens": 37106281.0,
"step": 20120
},
{
"entropy": 5.505630922317505,
"epoch": 1.6907792480571309,
"grad_norm": 1.4140625,
"learning_rate": 0.0004714701596852379,
"loss": 5.2416,
"mean_token_accuracy": 0.1865881934762001,
"num_tokens": 37116002.0,
"step": 20125
},
{
"entropy": 5.579259443283081,
"epoch": 1.6911993278722957,
"grad_norm": 1.328125,
"learning_rate": 0.0004714555631718125,
"loss": 5.3223,
"mean_token_accuracy": 0.17461781948804855,
"num_tokens": 37125125.0,
"step": 20130
},
{
"entropy": 5.499294376373291,
"epoch": 1.6916194076874607,
"grad_norm": 1.484375,
"learning_rate": 0.000471440963178278,
"loss": 5.1678,
"mean_token_accuracy": 0.1853762999176979,
"num_tokens": 37134358.0,
"step": 20135
},
{
"entropy": 5.664219379425049,
"epoch": 1.6920394875026254,
"grad_norm": 1.2734375,
"learning_rate": 0.00047142635970489293,
"loss": 5.3426,
"mean_token_accuracy": 0.17598191499710084,
"num_tokens": 37143732.0,
"step": 20140
},
{
"entropy": 5.560906171798706,
"epoch": 1.6924595673177905,
"grad_norm": 1.3359375,
"learning_rate": 0.0004714117527519161,
"loss": 5.2439,
"mean_token_accuracy": 0.17713868170976638,
"num_tokens": 37153809.0,
"step": 20145
},
{
"entropy": 5.521042156219482,
"epoch": 1.6928796471329552,
"grad_norm": 1.3359375,
"learning_rate": 0.00047139714231960616,
"loss": 5.279,
"mean_token_accuracy": 0.17021969109773635,
"num_tokens": 37163272.0,
"step": 20150
},
{
"entropy": 5.507987546920776,
"epoch": 1.69329972694812,
"grad_norm": 1.546875,
"learning_rate": 0.000471382528408222,
"loss": 5.2069,
"mean_token_accuracy": 0.18287929743528367,
"num_tokens": 37172323.0,
"step": 20155
},
{
"entropy": 5.6668846130371096,
"epoch": 1.693719806763285,
"grad_norm": 1.4140625,
"learning_rate": 0.0004713679110180225,
"loss": 5.4094,
"mean_token_accuracy": 0.17531788125634193,
"num_tokens": 37181262.0,
"step": 20160
},
{
"entropy": 5.636130714416504,
"epoch": 1.69413988657845,
"grad_norm": 1.703125,
"learning_rate": 0.0004713532901492666,
"loss": 5.3649,
"mean_token_accuracy": 0.17455680370330812,
"num_tokens": 37189576.0,
"step": 20165
},
{
"entropy": 5.614585924148559,
"epoch": 1.6945599663936148,
"grad_norm": 1.375,
"learning_rate": 0.0004713386658022132,
"loss": 5.3468,
"mean_token_accuracy": 0.1719509854912758,
"num_tokens": 37199502.0,
"step": 20170
},
{
"entropy": 5.571990633010865,
"epoch": 1.6949800462087796,
"grad_norm": 1.2265625,
"learning_rate": 0.0004713240379771214,
"loss": 5.2602,
"mean_token_accuracy": 0.16983499974012375,
"num_tokens": 37209028.0,
"step": 20175
},
{
"entropy": 5.651020622253418,
"epoch": 1.6954001260239444,
"grad_norm": 1.3125,
"learning_rate": 0.0004713094066742505,
"loss": 5.4036,
"mean_token_accuracy": 0.17249808758497237,
"num_tokens": 37218087.0,
"step": 20180
},
{
"entropy": 5.6142457008361815,
"epoch": 1.6958202058391094,
"grad_norm": 1.2578125,
"learning_rate": 0.00047129477189385946,
"loss": 5.3624,
"mean_token_accuracy": 0.17154528796672822,
"num_tokens": 37227345.0,
"step": 20185
},
{
"entropy": 5.642702293395996,
"epoch": 1.6962402856542744,
"grad_norm": 1.328125,
"learning_rate": 0.0004712801336362076,
"loss": 5.2981,
"mean_token_accuracy": 0.1740754321217537,
"num_tokens": 37236011.0,
"step": 20190
},
{
"entropy": 5.584663200378418,
"epoch": 1.6966603654694392,
"grad_norm": 1.3984375,
"learning_rate": 0.0004712654919015543,
"loss": 5.273,
"mean_token_accuracy": 0.1756555661559105,
"num_tokens": 37244613.0,
"step": 20195
},
{
"entropy": 5.520324039459228,
"epoch": 1.697080445284604,
"grad_norm": 1.3984375,
"learning_rate": 0.0004712508466901588,
"loss": 5.299,
"mean_token_accuracy": 0.17936687916517258,
"num_tokens": 37253768.0,
"step": 20200
},
{
"entropy": 5.620243501663208,
"epoch": 1.697500525099769,
"grad_norm": 1.21875,
"learning_rate": 0.00047123619800228057,
"loss": 5.3549,
"mean_token_accuracy": 0.17055419608950614,
"num_tokens": 37263230.0,
"step": 20205
},
{
"entropy": 5.602057266235351,
"epoch": 1.6979206049149338,
"grad_norm": 1.296875,
"learning_rate": 0.0004712215458381792,
"loss": 5.2641,
"mean_token_accuracy": 0.1729967400431633,
"num_tokens": 37272752.0,
"step": 20210
},
{
"entropy": 5.594781112670899,
"epoch": 1.6983406847300988,
"grad_norm": 1.34375,
"learning_rate": 0.0004712068901981142,
"loss": 5.3173,
"mean_token_accuracy": 0.17693332582712173,
"num_tokens": 37281465.0,
"step": 20215
},
{
"entropy": 5.557209253311157,
"epoch": 1.6987607645452636,
"grad_norm": 1.3203125,
"learning_rate": 0.0004711922310823452,
"loss": 5.3058,
"mean_token_accuracy": 0.17530101239681245,
"num_tokens": 37290408.0,
"step": 20220
},
{
"entropy": 5.558608627319336,
"epoch": 1.6991808443604284,
"grad_norm": 1.375,
"learning_rate": 0.0004711775684911318,
"loss": 5.2619,
"mean_token_accuracy": 0.1848277762532234,
"num_tokens": 37298890.0,
"step": 20225
},
{
"entropy": 5.561495637893676,
"epoch": 1.6996009241755934,
"grad_norm": 1.3359375,
"learning_rate": 0.00047116290242473375,
"loss": 5.2775,
"mean_token_accuracy": 0.16978603452444077,
"num_tokens": 37307720.0,
"step": 20230
},
{
"entropy": 5.595826053619385,
"epoch": 1.7000210039907584,
"grad_norm": 1.5234375,
"learning_rate": 0.000471148232883411,
"loss": 5.315,
"mean_token_accuracy": 0.17254260778427125,
"num_tokens": 37317145.0,
"step": 20235
},
{
"entropy": 5.5474241256713865,
"epoch": 1.7004410838059232,
"grad_norm": 1.390625,
"learning_rate": 0.00047113355986742325,
"loss": 5.2406,
"mean_token_accuracy": 0.18234366327524185,
"num_tokens": 37326579.0,
"step": 20240
},
{
"entropy": 5.6315666198730465,
"epoch": 1.700861163621088,
"grad_norm": 1.296875,
"learning_rate": 0.00047111888337703046,
"loss": 5.3305,
"mean_token_accuracy": 0.1723786175251007,
"num_tokens": 37336065.0,
"step": 20245
},
{
"entropy": 5.508713865280152,
"epoch": 1.7012812434362528,
"grad_norm": 1.3984375,
"learning_rate": 0.0004711042034124926,
"loss": 5.2061,
"mean_token_accuracy": 0.17925113588571548,
"num_tokens": 37345297.0,
"step": 20250
},
{
"entropy": 5.615394401550293,
"epoch": 1.7017013232514178,
"grad_norm": 1.34375,
"learning_rate": 0.0004710895199740698,
"loss": 5.3493,
"mean_token_accuracy": 0.1747938558459282,
"num_tokens": 37354942.0,
"step": 20255
},
{
"entropy": 5.630298376083374,
"epoch": 1.7021214030665828,
"grad_norm": 1.28125,
"learning_rate": 0.0004710748330620222,
"loss": 5.2458,
"mean_token_accuracy": 0.1857147991657257,
"num_tokens": 37364068.0,
"step": 20260
},
{
"entropy": 5.616568470001221,
"epoch": 1.7025414828817476,
"grad_norm": 1.3046875,
"learning_rate": 0.0004710601426766098,
"loss": 5.3528,
"mean_token_accuracy": 0.17427913397550582,
"num_tokens": 37373256.0,
"step": 20265
},
{
"entropy": 5.5210045337677,
"epoch": 1.7029615626969123,
"grad_norm": 1.5078125,
"learning_rate": 0.00047104544881809295,
"loss": 5.1922,
"mean_token_accuracy": 0.1856342613697052,
"num_tokens": 37382098.0,
"step": 20270
},
{
"entropy": 5.514184141159058,
"epoch": 1.7033816425120771,
"grad_norm": 1.3359375,
"learning_rate": 0.0004710307514867319,
"loss": 5.2104,
"mean_token_accuracy": 0.18131417781114578,
"num_tokens": 37390844.0,
"step": 20275
},
{
"entropy": 5.617106676101685,
"epoch": 1.7038017223272421,
"grad_norm": 1.3515625,
"learning_rate": 0.0004710160506827871,
"loss": 5.2677,
"mean_token_accuracy": 0.17361823618412017,
"num_tokens": 37399617.0,
"step": 20280
},
{
"entropy": 5.632485580444336,
"epoch": 1.7042218021424071,
"grad_norm": 1.4921875,
"learning_rate": 0.0004710013464065189,
"loss": 5.3937,
"mean_token_accuracy": 0.17299257293343545,
"num_tokens": 37409368.0,
"step": 20285
},
{
"entropy": 5.5469035625457765,
"epoch": 1.704641881957572,
"grad_norm": 1.3828125,
"learning_rate": 0.0004709866386581877,
"loss": 5.1953,
"mean_token_accuracy": 0.1838440015912056,
"num_tokens": 37418026.0,
"step": 20290
},
{
"entropy": 5.577427625656128,
"epoch": 1.7050619617727367,
"grad_norm": 1.6328125,
"learning_rate": 0.00047097192743805413,
"loss": 5.2195,
"mean_token_accuracy": 0.18078586012125014,
"num_tokens": 37426850.0,
"step": 20295
},
{
"entropy": 5.580478572845459,
"epoch": 1.7054820415879017,
"grad_norm": 1.515625,
"learning_rate": 0.0004709572127463788,
"loss": 5.2681,
"mean_token_accuracy": 0.18005667328834535,
"num_tokens": 37436631.0,
"step": 20300
},
{
"entropy": 5.566556596755982,
"epoch": 1.7059021214030667,
"grad_norm": 1.390625,
"learning_rate": 0.0004709424945834223,
"loss": 5.2829,
"mean_token_accuracy": 0.1722734674811363,
"num_tokens": 37445619.0,
"step": 20305
},
{
"entropy": 5.541394805908203,
"epoch": 1.7063222012182315,
"grad_norm": 1.390625,
"learning_rate": 0.00047092777294944544,
"loss": 5.2553,
"mean_token_accuracy": 0.1764194667339325,
"num_tokens": 37454205.0,
"step": 20310
},
{
"entropy": 5.562058353424073,
"epoch": 1.7067422810333963,
"grad_norm": 1.3203125,
"learning_rate": 0.000470913047844709,
"loss": 5.3409,
"mean_token_accuracy": 0.17506177127361297,
"num_tokens": 37463301.0,
"step": 20315
},
{
"entropy": 5.6119575023651125,
"epoch": 1.707162360848561,
"grad_norm": 1.359375,
"learning_rate": 0.00047089831926947374,
"loss": 5.321,
"mean_token_accuracy": 0.18098929077386855,
"num_tokens": 37471937.0,
"step": 20320
},
{
"entropy": 5.6446630477905275,
"epoch": 1.707582440663726,
"grad_norm": 1.4140625,
"learning_rate": 0.0004708835872240007,
"loss": 5.2987,
"mean_token_accuracy": 0.1805913880467415,
"num_tokens": 37480779.0,
"step": 20325
},
{
"entropy": 5.607933044433594,
"epoch": 1.7080025204788911,
"grad_norm": 1.25,
"learning_rate": 0.00047086885170855074,
"loss": 5.3324,
"mean_token_accuracy": 0.16565900146961213,
"num_tokens": 37491053.0,
"step": 20330
},
{
"entropy": 5.634414005279541,
"epoch": 1.708422600294056,
"grad_norm": 1.3359375,
"learning_rate": 0.000470854112723385,
"loss": 5.2946,
"mean_token_accuracy": 0.17619498670101166,
"num_tokens": 37499091.0,
"step": 20335
},
{
"entropy": 5.5753278732299805,
"epoch": 1.7088426801092207,
"grad_norm": 1.390625,
"learning_rate": 0.0004708393702687644,
"loss": 5.3493,
"mean_token_accuracy": 0.1677510753273964,
"num_tokens": 37507882.0,
"step": 20340
},
{
"entropy": 5.564841461181641,
"epoch": 1.7092627599243855,
"grad_norm": 1.515625,
"learning_rate": 0.00047082462434495015,
"loss": 5.2924,
"mean_token_accuracy": 0.18322856575250626,
"num_tokens": 37517048.0,
"step": 20345
},
{
"entropy": 5.612966156005859,
"epoch": 1.7096828397395505,
"grad_norm": 1.375,
"learning_rate": 0.0004708098749522036,
"loss": 5.3223,
"mean_token_accuracy": 0.1702086165547371,
"num_tokens": 37526355.0,
"step": 20350
},
{
"entropy": 5.594509410858154,
"epoch": 1.7101029195547155,
"grad_norm": 1.2421875,
"learning_rate": 0.0004707951220907859,
"loss": 5.3826,
"mean_token_accuracy": 0.17475855201482773,
"num_tokens": 37535746.0,
"step": 20355
},
{
"entropy": 5.598930215835571,
"epoch": 1.7105229993698803,
"grad_norm": 1.3125,
"learning_rate": 0.0004707803657609585,
"loss": 5.3435,
"mean_token_accuracy": 0.16974906921386718,
"num_tokens": 37546479.0,
"step": 20360
},
{
"entropy": 5.683509492874146,
"epoch": 1.710943079185045,
"grad_norm": 1.2421875,
"learning_rate": 0.00047076560596298275,
"loss": 5.3767,
"mean_token_accuracy": 0.17090600356459618,
"num_tokens": 37556805.0,
"step": 20365
},
{
"entropy": 5.644847393035889,
"epoch": 1.71136315900021,
"grad_norm": 1.2734375,
"learning_rate": 0.00047075084269712,
"loss": 5.363,
"mean_token_accuracy": 0.1756405934691429,
"num_tokens": 37564748.0,
"step": 20370
},
{
"entropy": 5.468269491195679,
"epoch": 1.711783238815375,
"grad_norm": 1.46875,
"learning_rate": 0.0004707360759636319,
"loss": 5.1626,
"mean_token_accuracy": 0.19014602154493332,
"num_tokens": 37574674.0,
"step": 20375
},
{
"entropy": 5.595285129547119,
"epoch": 1.7122033186305399,
"grad_norm": 1.3671875,
"learning_rate": 0.00047072130576278,
"loss": 5.2843,
"mean_token_accuracy": 0.17274381965398788,
"num_tokens": 37584459.0,
"step": 20380
},
{
"entropy": 5.623803043365479,
"epoch": 1.7126233984457047,
"grad_norm": 1.3125,
"learning_rate": 0.0004707065320948259,
"loss": 5.3308,
"mean_token_accuracy": 0.1756378158926964,
"num_tokens": 37593570.0,
"step": 20385
},
{
"entropy": 5.582317781448364,
"epoch": 1.7130434782608694,
"grad_norm": 1.2421875,
"learning_rate": 0.00047069175496003147,
"loss": 5.3428,
"mean_token_accuracy": 0.169034381210804,
"num_tokens": 37603032.0,
"step": 20390
},
{
"entropy": 5.512708854675293,
"epoch": 1.7134635580760345,
"grad_norm": 1.203125,
"learning_rate": 0.0004706769743586583,
"loss": 5.2525,
"mean_token_accuracy": 0.18294099867343902,
"num_tokens": 37612404.0,
"step": 20395
},
{
"entropy": 5.5761744499206545,
"epoch": 1.7138836378911995,
"grad_norm": 1.546875,
"learning_rate": 0.00047066219029096837,
"loss": 5.2728,
"mean_token_accuracy": 0.17195031940937042,
"num_tokens": 37621933.0,
"step": 20400
},
{
"entropy": 5.651831007003784,
"epoch": 1.7143037177063642,
"grad_norm": 1.5,
"learning_rate": 0.0004706474027572234,
"loss": 5.2945,
"mean_token_accuracy": 0.17908292561769484,
"num_tokens": 37632078.0,
"step": 20405
},
{
"entropy": 5.481583452224731,
"epoch": 1.714723797521529,
"grad_norm": 1.1796875,
"learning_rate": 0.00047063261175768543,
"loss": 5.2288,
"mean_token_accuracy": 0.17573564797639846,
"num_tokens": 37641665.0,
"step": 20410
},
{
"entropy": 5.633792209625244,
"epoch": 1.7151438773366938,
"grad_norm": 1.2734375,
"learning_rate": 0.00047061781729261656,
"loss": 5.2974,
"mean_token_accuracy": 0.1704529970884323,
"num_tokens": 37650751.0,
"step": 20415
},
{
"entropy": 5.568549108505249,
"epoch": 1.7155639571518588,
"grad_norm": 2.453125,
"learning_rate": 0.00047060301936227865,
"loss": 5.2686,
"mean_token_accuracy": 0.17993844598531722,
"num_tokens": 37659165.0,
"step": 20420
},
{
"entropy": 5.566297483444214,
"epoch": 1.7159840369670238,
"grad_norm": 1.3203125,
"learning_rate": 0.0004705882179669341,
"loss": 5.2851,
"mean_token_accuracy": 0.177535517513752,
"num_tokens": 37668057.0,
"step": 20425
},
{
"entropy": 5.5905169486999515,
"epoch": 1.7164041167821886,
"grad_norm": 1.296875,
"learning_rate": 0.0004705734131068449,
"loss": 5.2503,
"mean_token_accuracy": 0.17543450742959976,
"num_tokens": 37677674.0,
"step": 20430
},
{
"entropy": 5.561374711990356,
"epoch": 1.7168241965973534,
"grad_norm": 1.5390625,
"learning_rate": 0.0004705586047822734,
"loss": 5.3037,
"mean_token_accuracy": 0.17467762231826783,
"num_tokens": 37687009.0,
"step": 20435
},
{
"entropy": 5.519535827636719,
"epoch": 1.7172442764125184,
"grad_norm": 1.3671875,
"learning_rate": 0.00047054379299348194,
"loss": 5.1699,
"mean_token_accuracy": 0.18024923652410507,
"num_tokens": 37696723.0,
"step": 20440
},
{
"entropy": 5.528227758407593,
"epoch": 1.7176643562276832,
"grad_norm": 1.359375,
"learning_rate": 0.00047052897774073295,
"loss": 5.2983,
"mean_token_accuracy": 0.17500178664922714,
"num_tokens": 37706560.0,
"step": 20445
},
{
"entropy": 5.57094554901123,
"epoch": 1.7180844360428482,
"grad_norm": 1.375,
"learning_rate": 0.00047051415902428875,
"loss": 5.2998,
"mean_token_accuracy": 0.174894580245018,
"num_tokens": 37715176.0,
"step": 20450
},
{
"entropy": 5.591981601715088,
"epoch": 1.718504515858013,
"grad_norm": 1.296875,
"learning_rate": 0.0004704993368444119,
"loss": 5.2991,
"mean_token_accuracy": 0.1785847693681717,
"num_tokens": 37723956.0,
"step": 20455
},
{
"entropy": 5.595147562026978,
"epoch": 1.7189245956731778,
"grad_norm": 1.8359375,
"learning_rate": 0.0004704845112013649,
"loss": 5.2939,
"mean_token_accuracy": 0.1756458342075348,
"num_tokens": 37733236.0,
"step": 20460
},
{
"entropy": 5.5822196960449215,
"epoch": 1.7193446754883428,
"grad_norm": 1.2578125,
"learning_rate": 0.0004704696820954105,
"loss": 5.3456,
"mean_token_accuracy": 0.17483636140823364,
"num_tokens": 37742626.0,
"step": 20465
},
{
"entropy": 5.549651193618774,
"epoch": 1.7197647553035078,
"grad_norm": 1.46875,
"learning_rate": 0.0004704548495268113,
"loss": 5.2468,
"mean_token_accuracy": 0.18747982531785964,
"num_tokens": 37751854.0,
"step": 20470
},
{
"entropy": 5.546167612075806,
"epoch": 1.7201848351186726,
"grad_norm": 1.390625,
"learning_rate": 0.00047044001349583,
"loss": 5.2629,
"mean_token_accuracy": 0.17624340504407882,
"num_tokens": 37760993.0,
"step": 20475
},
{
"entropy": 5.569744825363159,
"epoch": 1.7206049149338374,
"grad_norm": 1.4765625,
"learning_rate": 0.00047042517400272966,
"loss": 5.3527,
"mean_token_accuracy": 0.17823934108018874,
"num_tokens": 37771714.0,
"step": 20480
},
{
"entropy": 5.591018438339233,
"epoch": 1.7210249947490022,
"grad_norm": 1.796875,
"learning_rate": 0.0004704103310477729,
"loss": 5.2704,
"mean_token_accuracy": 0.18319321572780609,
"num_tokens": 37780653.0,
"step": 20485
},
{
"entropy": 5.583276319503784,
"epoch": 1.7214450745641672,
"grad_norm": 1.6640625,
"learning_rate": 0.0004703954846312228,
"loss": 5.3351,
"mean_token_accuracy": 0.17502954453229905,
"num_tokens": 37790450.0,
"step": 20490
},
{
"entropy": 5.5930804252624515,
"epoch": 1.7218651543793322,
"grad_norm": 1.28125,
"learning_rate": 0.0004703806347533423,
"loss": 5.3413,
"mean_token_accuracy": 0.16975829750299454,
"num_tokens": 37800450.0,
"step": 20495
},
{
"entropy": 5.621373271942138,
"epoch": 1.722285234194497,
"grad_norm": 1.546875,
"learning_rate": 0.0004703657814143945,
"loss": 5.3506,
"mean_token_accuracy": 0.16938521564006806,
"num_tokens": 37809261.0,
"step": 20500
},
{
"entropy": 5.545867824554444,
"epoch": 1.7227053140096618,
"grad_norm": 1.3515625,
"learning_rate": 0.0004703509246146424,
"loss": 5.1384,
"mean_token_accuracy": 0.18370115756988525,
"num_tokens": 37818244.0,
"step": 20505
},
{
"entropy": 5.539609479904175,
"epoch": 1.7231253938248268,
"grad_norm": 1.3359375,
"learning_rate": 0.0004703360643543493,
"loss": 5.2565,
"mean_token_accuracy": 0.17891710698604585,
"num_tokens": 37828555.0,
"step": 20510
},
{
"entropy": 5.472784996032715,
"epoch": 1.7235454736399916,
"grad_norm": 1.5,
"learning_rate": 0.00047032120063377836,
"loss": 5.2394,
"mean_token_accuracy": 0.17572056204080583,
"num_tokens": 37837840.0,
"step": 20515
},
{
"entropy": 5.5847330570220945,
"epoch": 1.7239655534551566,
"grad_norm": 1.453125,
"learning_rate": 0.00047030633345319293,
"loss": 5.2809,
"mean_token_accuracy": 0.17465004473924636,
"num_tokens": 37846910.0,
"step": 20520
},
{
"entropy": 5.483146238327026,
"epoch": 1.7243856332703213,
"grad_norm": 1.5078125,
"learning_rate": 0.00047029146281285647,
"loss": 5.118,
"mean_token_accuracy": 0.1930585354566574,
"num_tokens": 37855642.0,
"step": 20525
},
{
"entropy": 5.606215620040894,
"epoch": 1.7248057130854861,
"grad_norm": 1.296875,
"learning_rate": 0.0004702765887130322,
"loss": 5.3009,
"mean_token_accuracy": 0.18125868886709212,
"num_tokens": 37864439.0,
"step": 20530
},
{
"entropy": 5.670219421386719,
"epoch": 1.7252257929006511,
"grad_norm": 1.34375,
"learning_rate": 0.00047026171115398377,
"loss": 5.3667,
"mean_token_accuracy": 0.16673457846045495,
"num_tokens": 37873801.0,
"step": 20535
},
{
"entropy": 5.543535375595093,
"epoch": 1.7256458727158162,
"grad_norm": 1.296875,
"learning_rate": 0.0004702468301359746,
"loss": 5.2545,
"mean_token_accuracy": 0.1855766221880913,
"num_tokens": 37883915.0,
"step": 20540
},
{
"entropy": 5.565848636627197,
"epoch": 1.726065952530981,
"grad_norm": 1.3515625,
"learning_rate": 0.0004702319456592684,
"loss": 5.3746,
"mean_token_accuracy": 0.17028053849935532,
"num_tokens": 37894083.0,
"step": 20545
},
{
"entropy": 5.612676286697388,
"epoch": 1.7264860323461457,
"grad_norm": 1.4140625,
"learning_rate": 0.00047021705772412885,
"loss": 5.3304,
"mean_token_accuracy": 0.17594763338565828,
"num_tokens": 37902264.0,
"step": 20550
},
{
"entropy": 5.548248910903931,
"epoch": 1.7269061121613105,
"grad_norm": 1.3046875,
"learning_rate": 0.00047020216633081964,
"loss": 5.2587,
"mean_token_accuracy": 0.17754550874233246,
"num_tokens": 37911071.0,
"step": 20555
},
{
"entropy": 5.541538333892822,
"epoch": 1.7273261919764755,
"grad_norm": 1.34375,
"learning_rate": 0.00047018727147960453,
"loss": 5.3564,
"mean_token_accuracy": 0.17294432371854782,
"num_tokens": 37920048.0,
"step": 20560
},
{
"entropy": 5.602067565917968,
"epoch": 1.7277462717916405,
"grad_norm": 1.25,
"learning_rate": 0.00047017237317074743,
"loss": 5.3093,
"mean_token_accuracy": 0.17899594604969024,
"num_tokens": 37928877.0,
"step": 20565
},
{
"entropy": 5.618015432357788,
"epoch": 1.7281663516068053,
"grad_norm": 1.3203125,
"learning_rate": 0.0004701574714045123,
"loss": 5.3342,
"mean_token_accuracy": 0.16978157311677933,
"num_tokens": 37937860.0,
"step": 20570
},
{
"entropy": 5.576488018035889,
"epoch": 1.72858643142197,
"grad_norm": 1.3671875,
"learning_rate": 0.00047014256618116304,
"loss": 5.3322,
"mean_token_accuracy": 0.1728402554988861,
"num_tokens": 37947588.0,
"step": 20575
},
{
"entropy": 5.547093248367309,
"epoch": 1.729006511237135,
"grad_norm": 1.3359375,
"learning_rate": 0.00047012765750096365,
"loss": 5.2516,
"mean_token_accuracy": 0.17566161453723908,
"num_tokens": 37957598.0,
"step": 20580
},
{
"entropy": 5.585799074172973,
"epoch": 1.7294265910523,
"grad_norm": 1.5390625,
"learning_rate": 0.00047011274536417827,
"loss": 5.2296,
"mean_token_accuracy": 0.185555799305439,
"num_tokens": 37965294.0,
"step": 20585
},
{
"entropy": 5.533174467086792,
"epoch": 1.729846670867465,
"grad_norm": 1.375,
"learning_rate": 0.00047009782977107113,
"loss": 5.2375,
"mean_token_accuracy": 0.18368934690952302,
"num_tokens": 37973977.0,
"step": 20590
},
{
"entropy": 5.664621782302857,
"epoch": 1.7302667506826297,
"grad_norm": 1.4609375,
"learning_rate": 0.00047008291072190634,
"loss": 5.4095,
"mean_token_accuracy": 0.16538559794425964,
"num_tokens": 37984492.0,
"step": 20595
},
{
"entropy": 5.651897811889649,
"epoch": 1.7306868304977945,
"grad_norm": 1.515625,
"learning_rate": 0.0004700679882169482,
"loss": 5.3153,
"mean_token_accuracy": 0.1731739342212677,
"num_tokens": 37994404.0,
"step": 20600
},
{
"entropy": 5.4727442264556885,
"epoch": 1.7311069103129595,
"grad_norm": 1.40625,
"learning_rate": 0.0004700530622564613,
"loss": 5.2252,
"mean_token_accuracy": 0.18352428525686265,
"num_tokens": 38002659.0,
"step": 20605
},
{
"entropy": 5.510218620300293,
"epoch": 1.7315269901281245,
"grad_norm": 1.2734375,
"learning_rate": 0.0004700381328407096,
"loss": 5.2024,
"mean_token_accuracy": 0.17989274859428406,
"num_tokens": 38012290.0,
"step": 20610
},
{
"entropy": 5.6110443592071535,
"epoch": 1.7319470699432893,
"grad_norm": 1.234375,
"learning_rate": 0.0004700231999699579,
"loss": 5.3421,
"mean_token_accuracy": 0.1751223623752594,
"num_tokens": 38022163.0,
"step": 20615
},
{
"entropy": 5.592725324630737,
"epoch": 1.732367149758454,
"grad_norm": 1.4453125,
"learning_rate": 0.0004700082636444706,
"loss": 5.2765,
"mean_token_accuracy": 0.17348547279834747,
"num_tokens": 38031051.0,
"step": 20620
},
{
"entropy": 5.587480926513672,
"epoch": 1.7327872295736189,
"grad_norm": 1.3671875,
"learning_rate": 0.00046999332386451245,
"loss": 5.3382,
"mean_token_accuracy": 0.17624160945415496,
"num_tokens": 38040474.0,
"step": 20625
},
{
"entropy": 5.616314792633057,
"epoch": 1.7332073093887839,
"grad_norm": 1.3046875,
"learning_rate": 0.00046997838063034784,
"loss": 5.2962,
"mean_token_accuracy": 0.17645470798015594,
"num_tokens": 38049620.0,
"step": 20630
},
{
"entropy": 5.51703987121582,
"epoch": 1.7336273892039489,
"grad_norm": 1.2421875,
"learning_rate": 0.00046996343394224173,
"loss": 5.2479,
"mean_token_accuracy": 0.17883582711219786,
"num_tokens": 38059866.0,
"step": 20635
},
{
"entropy": 5.490606117248535,
"epoch": 1.7340474690191137,
"grad_norm": 1.25,
"learning_rate": 0.00046994848380045866,
"loss": 5.2412,
"mean_token_accuracy": 0.17526740282773973,
"num_tokens": 38068948.0,
"step": 20640
},
{
"entropy": 5.6812032699584964,
"epoch": 1.7344675488342784,
"grad_norm": 1.3125,
"learning_rate": 0.00046993353020526366,
"loss": 5.4319,
"mean_token_accuracy": 0.17586117684841157,
"num_tokens": 38079239.0,
"step": 20645
},
{
"entropy": 5.610349845886231,
"epoch": 1.7348876286494432,
"grad_norm": 1.4140625,
"learning_rate": 0.0004699185731569215,
"loss": 5.3249,
"mean_token_accuracy": 0.1761300280690193,
"num_tokens": 38087999.0,
"step": 20650
},
{
"entropy": 5.56100435256958,
"epoch": 1.7353077084646082,
"grad_norm": 1.25,
"learning_rate": 0.0004699036126556972,
"loss": 5.2889,
"mean_token_accuracy": 0.17589117884635924,
"num_tokens": 38096586.0,
"step": 20655
},
{
"entropy": 5.519883155822754,
"epoch": 1.7357277882797733,
"grad_norm": 1.4140625,
"learning_rate": 0.0004698886487018558,
"loss": 5.2473,
"mean_token_accuracy": 0.17950861006975175,
"num_tokens": 38104766.0,
"step": 20660
},
{
"entropy": 5.56228494644165,
"epoch": 1.736147868094938,
"grad_norm": 1.390625,
"learning_rate": 0.0004698736812956623,
"loss": 5.299,
"mean_token_accuracy": 0.17932037860155106,
"num_tokens": 38113574.0,
"step": 20665
},
{
"entropy": 5.535027313232422,
"epoch": 1.7365679479101028,
"grad_norm": 1.296875,
"learning_rate": 0.0004698587104373819,
"loss": 5.2459,
"mean_token_accuracy": 0.17193092703819274,
"num_tokens": 38122513.0,
"step": 20670
},
{
"entropy": 5.445349597930909,
"epoch": 1.7369880277252678,
"grad_norm": 1.34375,
"learning_rate": 0.00046984373612727975,
"loss": 5.2406,
"mean_token_accuracy": 0.17000415921211243,
"num_tokens": 38131105.0,
"step": 20675
},
{
"entropy": 5.546124792098999,
"epoch": 1.7374081075404328,
"grad_norm": 1.6328125,
"learning_rate": 0.00046982875836562116,
"loss": 5.3088,
"mean_token_accuracy": 0.16814721822738649,
"num_tokens": 38140106.0,
"step": 20680
},
{
"entropy": 5.635690546035766,
"epoch": 1.7378281873555976,
"grad_norm": 1.28125,
"learning_rate": 0.00046981377715267145,
"loss": 5.2672,
"mean_token_accuracy": 0.1834568426012993,
"num_tokens": 38149215.0,
"step": 20685
},
{
"entropy": 5.542273759841919,
"epoch": 1.7382482671707624,
"grad_norm": 1.3125,
"learning_rate": 0.000469798792488696,
"loss": 5.1806,
"mean_token_accuracy": 0.1841437429189682,
"num_tokens": 38157591.0,
"step": 20690
},
{
"entropy": 5.4450897693634035,
"epoch": 1.7386683469859272,
"grad_norm": 1.5078125,
"learning_rate": 0.0004697838043739602,
"loss": 5.3104,
"mean_token_accuracy": 0.1721317082643509,
"num_tokens": 38167673.0,
"step": 20695
},
{
"entropy": 5.647896146774292,
"epoch": 1.7390884268010922,
"grad_norm": 1.28125,
"learning_rate": 0.00046976881280872974,
"loss": 5.2795,
"mean_token_accuracy": 0.17500491738319396,
"num_tokens": 38177586.0,
"step": 20700
},
{
"entropy": 5.638086891174316,
"epoch": 1.7395085066162572,
"grad_norm": 1.2578125,
"learning_rate": 0.0004697538177932699,
"loss": 5.3005,
"mean_token_accuracy": 0.17529842704534532,
"num_tokens": 38187020.0,
"step": 20705
},
{
"entropy": 5.402251243591309,
"epoch": 1.739928586431422,
"grad_norm": 1.3515625,
"learning_rate": 0.0004697388193278465,
"loss": 5.0719,
"mean_token_accuracy": 0.19213451743125914,
"num_tokens": 38195705.0,
"step": 20710
},
{
"entropy": 5.461338472366333,
"epoch": 1.7403486662465868,
"grad_norm": 1.5,
"learning_rate": 0.0004697238174127252,
"loss": 5.1939,
"mean_token_accuracy": 0.17999443858861924,
"num_tokens": 38204726.0,
"step": 20715
},
{
"entropy": 5.529233503341675,
"epoch": 1.7407687460617516,
"grad_norm": 1.3046875,
"learning_rate": 0.0004697088120481717,
"loss": 5.3016,
"mean_token_accuracy": 0.17639971822500228,
"num_tokens": 38214376.0,
"step": 20720
},
{
"entropy": 5.53774209022522,
"epoch": 1.7411888258769166,
"grad_norm": 1.359375,
"learning_rate": 0.0004696938032344519,
"loss": 5.1969,
"mean_token_accuracy": 0.1796201542019844,
"num_tokens": 38223631.0,
"step": 20725
},
{
"entropy": 5.5487611293792725,
"epoch": 1.7416089056920816,
"grad_norm": 1.28125,
"learning_rate": 0.0004696787909718317,
"loss": 5.2369,
"mean_token_accuracy": 0.18440187424421312,
"num_tokens": 38233519.0,
"step": 20730
},
{
"entropy": 5.558741188049316,
"epoch": 1.7420289855072464,
"grad_norm": 1.5390625,
"learning_rate": 0.00046966377526057686,
"loss": 5.2233,
"mean_token_accuracy": 0.18502927124500274,
"num_tokens": 38242340.0,
"step": 20735
},
{
"entropy": 5.527883768081665,
"epoch": 1.7424490653224112,
"grad_norm": 1.5078125,
"learning_rate": 0.0004696487561009535,
"loss": 5.2115,
"mean_token_accuracy": 0.17893013656139373,
"num_tokens": 38251194.0,
"step": 20740
},
{
"entropy": 5.545695352554321,
"epoch": 1.7428691451375762,
"grad_norm": 1.2734375,
"learning_rate": 0.0004696337334932277,
"loss": 5.2668,
"mean_token_accuracy": 0.17433553487062453,
"num_tokens": 38259938.0,
"step": 20745
},
{
"entropy": 5.553517532348633,
"epoch": 1.743289224952741,
"grad_norm": 1.3671875,
"learning_rate": 0.00046961870743766546,
"loss": 5.279,
"mean_token_accuracy": 0.17590255290269852,
"num_tokens": 38268073.0,
"step": 20750
},
{
"entropy": 5.557691669464111,
"epoch": 1.743709304767906,
"grad_norm": 1.3828125,
"learning_rate": 0.00046960367793453313,
"loss": 5.3857,
"mean_token_accuracy": 0.16716455072164535,
"num_tokens": 38277667.0,
"step": 20755
},
{
"entropy": 5.611572647094727,
"epoch": 1.7441293845830708,
"grad_norm": 1.3203125,
"learning_rate": 0.00046958864498409673,
"loss": 5.33,
"mean_token_accuracy": 0.1733332246541977,
"num_tokens": 38287142.0,
"step": 20760
},
{
"entropy": 5.6386466979980465,
"epoch": 1.7445494643982355,
"grad_norm": 1.3828125,
"learning_rate": 0.00046957360858662276,
"loss": 5.315,
"mean_token_accuracy": 0.18019963204860687,
"num_tokens": 38296199.0,
"step": 20765
},
{
"entropy": 5.602294111251831,
"epoch": 1.7449695442134006,
"grad_norm": 1.4140625,
"learning_rate": 0.0004695585687423775,
"loss": 5.3124,
"mean_token_accuracy": 0.1713024079799652,
"num_tokens": 38305412.0,
"step": 20770
},
{
"entropy": 5.4710324764251705,
"epoch": 1.7453896240285656,
"grad_norm": 1.3671875,
"learning_rate": 0.0004695435254516273,
"loss": 5.2457,
"mean_token_accuracy": 0.189963199198246,
"num_tokens": 38313898.0,
"step": 20775
},
{
"entropy": 5.595325517654419,
"epoch": 1.7458097038437304,
"grad_norm": 1.28125,
"learning_rate": 0.0004695284787146388,
"loss": 5.3439,
"mean_token_accuracy": 0.1715154990553856,
"num_tokens": 38322835.0,
"step": 20780
},
{
"entropy": 5.539307594299316,
"epoch": 1.7462297836588951,
"grad_norm": 1.328125,
"learning_rate": 0.0004695134285316784,
"loss": 5.1759,
"mean_token_accuracy": 0.186397522687912,
"num_tokens": 38331448.0,
"step": 20785
},
{
"entropy": 5.517307949066162,
"epoch": 1.74664986347406,
"grad_norm": 1.4453125,
"learning_rate": 0.00046949837490301293,
"loss": 5.2944,
"mean_token_accuracy": 0.17323679625988006,
"num_tokens": 38340837.0,
"step": 20790
},
{
"entropy": 5.534305810928345,
"epoch": 1.747069943289225,
"grad_norm": 1.609375,
"learning_rate": 0.0004694833178289088,
"loss": 5.2612,
"mean_token_accuracy": 0.17903849184513093,
"num_tokens": 38349363.0,
"step": 20795
},
{
"entropy": 5.580279779434204,
"epoch": 1.74749002310439,
"grad_norm": 1.4609375,
"learning_rate": 0.0004694682573096328,
"loss": 5.2996,
"mean_token_accuracy": 0.17908350825309755,
"num_tokens": 38358017.0,
"step": 20800
},
{
"entropy": 5.555252599716186,
"epoch": 1.7479101029195547,
"grad_norm": 1.3359375,
"learning_rate": 0.00046945319334545184,
"loss": 5.2785,
"mean_token_accuracy": 0.1748664602637291,
"num_tokens": 38367256.0,
"step": 20805
},
{
"entropy": 5.528793382644653,
"epoch": 1.7483301827347195,
"grad_norm": 1.21875,
"learning_rate": 0.0004694381259366327,
"loss": 5.2872,
"mean_token_accuracy": 0.1795317456126213,
"num_tokens": 38376169.0,
"step": 20810
},
{
"entropy": 5.5296714305877686,
"epoch": 1.7487502625498845,
"grad_norm": 1.3046875,
"learning_rate": 0.00046942305508344216,
"loss": 5.2635,
"mean_token_accuracy": 0.17527851164340974,
"num_tokens": 38385379.0,
"step": 20815
},
{
"entropy": 5.570555305480957,
"epoch": 1.7491703423650493,
"grad_norm": 1.296875,
"learning_rate": 0.0004694079807861473,
"loss": 5.3433,
"mean_token_accuracy": 0.17313087731599808,
"num_tokens": 38395217.0,
"step": 20820
},
{
"entropy": 5.575230693817138,
"epoch": 1.7495904221802143,
"grad_norm": 1.3046875,
"learning_rate": 0.0004693929030450153,
"loss": 5.2696,
"mean_token_accuracy": 0.1774916172027588,
"num_tokens": 38404347.0,
"step": 20825
},
{
"entropy": 5.567524671554565,
"epoch": 1.750010501995379,
"grad_norm": 1.3203125,
"learning_rate": 0.00046937782186031303,
"loss": 5.1985,
"mean_token_accuracy": 0.1796313852071762,
"num_tokens": 38413394.0,
"step": 20830
},
{
"entropy": 5.542976903915405,
"epoch": 1.750430581810544,
"grad_norm": 1.40625,
"learning_rate": 0.0004693627372323078,
"loss": 5.2771,
"mean_token_accuracy": 0.17429673671722412,
"num_tokens": 38422043.0,
"step": 20835
},
{
"entropy": 5.633368635177613,
"epoch": 1.750850661625709,
"grad_norm": 1.3125,
"learning_rate": 0.0004693476491612667,
"loss": 5.4196,
"mean_token_accuracy": 0.17291610538959504,
"num_tokens": 38430792.0,
"step": 20840
},
{
"entropy": 5.467224740982056,
"epoch": 1.751270741440874,
"grad_norm": 1.4921875,
"learning_rate": 0.0004693325576474571,
"loss": 5.2267,
"mean_token_accuracy": 0.180571611225605,
"num_tokens": 38439105.0,
"step": 20845
},
{
"entropy": 5.5831750392913815,
"epoch": 1.7516908212560387,
"grad_norm": 1.34375,
"learning_rate": 0.0004693174626911463,
"loss": 5.2435,
"mean_token_accuracy": 0.18055440485477448,
"num_tokens": 38447944.0,
"step": 20850
},
{
"entropy": 5.563493919372559,
"epoch": 1.7521109010712035,
"grad_norm": 1.5,
"learning_rate": 0.00046930236429260173,
"loss": 5.297,
"mean_token_accuracy": 0.16627456694841386,
"num_tokens": 38457206.0,
"step": 20855
},
{
"entropy": 5.596124792098999,
"epoch": 1.7525309808863683,
"grad_norm": 1.21875,
"learning_rate": 0.0004692872624520908,
"loss": 5.3621,
"mean_token_accuracy": 0.1721819207072258,
"num_tokens": 38467085.0,
"step": 20860
},
{
"entropy": 5.597991609573365,
"epoch": 1.7529510607015333,
"grad_norm": 1.390625,
"learning_rate": 0.000469272157169881,
"loss": 5.2123,
"mean_token_accuracy": 0.18100935518741607,
"num_tokens": 38475970.0,
"step": 20865
},
{
"entropy": 5.539360284805298,
"epoch": 1.7533711405166983,
"grad_norm": 1.3984375,
"learning_rate": 0.0004692570484462401,
"loss": 5.3359,
"mean_token_accuracy": 0.17976865470409392,
"num_tokens": 38484579.0,
"step": 20870
},
{
"entropy": 5.566984033584594,
"epoch": 1.753791220331863,
"grad_norm": 1.421875,
"learning_rate": 0.00046924193628143554,
"loss": 5.3759,
"mean_token_accuracy": 0.17198723405599595,
"num_tokens": 38495107.0,
"step": 20875
},
{
"entropy": 5.643288230895996,
"epoch": 1.7542113001470279,
"grad_norm": 1.4140625,
"learning_rate": 0.00046922682067573516,
"loss": 5.3712,
"mean_token_accuracy": 0.1766890287399292,
"num_tokens": 38505731.0,
"step": 20880
},
{
"entropy": 5.508024644851685,
"epoch": 1.7546313799621929,
"grad_norm": 1.34375,
"learning_rate": 0.00046921170162940657,
"loss": 5.2732,
"mean_token_accuracy": 0.18079008311033248,
"num_tokens": 38514483.0,
"step": 20885
},
{
"entropy": 5.518124580383301,
"epoch": 1.7550514597773577,
"grad_norm": 1.265625,
"learning_rate": 0.00046919657914271774,
"loss": 5.1688,
"mean_token_accuracy": 0.18382768779993058,
"num_tokens": 38522953.0,
"step": 20890
},
{
"entropy": 5.493333435058593,
"epoch": 1.7554715395925227,
"grad_norm": 1.2421875,
"learning_rate": 0.0004691814532159365,
"loss": 5.1851,
"mean_token_accuracy": 0.1921203464269638,
"num_tokens": 38531891.0,
"step": 20895
},
{
"entropy": 5.556777238845825,
"epoch": 1.7558916194076875,
"grad_norm": 1.6015625,
"learning_rate": 0.0004691663238493308,
"loss": 5.3429,
"mean_token_accuracy": 0.17430225014686584,
"num_tokens": 38541609.0,
"step": 20900
},
{
"entropy": 5.5950055599212645,
"epoch": 1.7563116992228522,
"grad_norm": 1.21875,
"learning_rate": 0.0004691511910431686,
"loss": 5.367,
"mean_token_accuracy": 0.1741584859788418,
"num_tokens": 38550348.0,
"step": 20905
},
{
"entropy": 5.525777006149292,
"epoch": 1.7567317790380172,
"grad_norm": 1.5859375,
"learning_rate": 0.0004691360547977181,
"loss": 5.188,
"mean_token_accuracy": 0.1817720964550972,
"num_tokens": 38559493.0,
"step": 20910
},
{
"entropy": 5.518686532974243,
"epoch": 1.7571518588531823,
"grad_norm": 1.5234375,
"learning_rate": 0.0004691209151132474,
"loss": 5.2439,
"mean_token_accuracy": 0.17163237035274506,
"num_tokens": 38567888.0,
"step": 20915
},
{
"entropy": 5.555866241455078,
"epoch": 1.757571938668347,
"grad_norm": 1.375,
"learning_rate": 0.0004691057719900246,
"loss": 5.3028,
"mean_token_accuracy": 0.17857276648283005,
"num_tokens": 38577216.0,
"step": 20920
},
{
"entropy": 5.526533365249634,
"epoch": 1.7579920184835118,
"grad_norm": 1.4140625,
"learning_rate": 0.00046909062542831794,
"loss": 5.2589,
"mean_token_accuracy": 0.18041514456272126,
"num_tokens": 38586258.0,
"step": 20925
},
{
"entropy": 5.567690086364746,
"epoch": 1.7584120982986766,
"grad_norm": 1.3671875,
"learning_rate": 0.0004690754754283959,
"loss": 5.2052,
"mean_token_accuracy": 0.18337651193141938,
"num_tokens": 38594900.0,
"step": 20930
},
{
"entropy": 5.454372978210449,
"epoch": 1.7588321781138416,
"grad_norm": 1.359375,
"learning_rate": 0.0004690603219905266,
"loss": 5.2889,
"mean_token_accuracy": 0.17600491940975188,
"num_tokens": 38603980.0,
"step": 20935
},
{
"entropy": 5.563923358917236,
"epoch": 1.7592522579290066,
"grad_norm": 1.328125,
"learning_rate": 0.00046904516511497873,
"loss": 5.3752,
"mean_token_accuracy": 0.16869162023067474,
"num_tokens": 38613888.0,
"step": 20940
},
{
"entropy": 5.6561531066894535,
"epoch": 1.7596723377441714,
"grad_norm": 1.3125,
"learning_rate": 0.00046903000480202065,
"loss": 5.3274,
"mean_token_accuracy": 0.1744433745741844,
"num_tokens": 38623969.0,
"step": 20945
},
{
"entropy": 5.545741033554077,
"epoch": 1.7600924175593362,
"grad_norm": 1.3203125,
"learning_rate": 0.00046901484105192094,
"loss": 5.2659,
"mean_token_accuracy": 0.1776861682534218,
"num_tokens": 38633387.0,
"step": 20950
},
{
"entropy": 5.556090021133423,
"epoch": 1.760512497374501,
"grad_norm": 1.3671875,
"learning_rate": 0.00046899967386494816,
"loss": 5.3256,
"mean_token_accuracy": 0.1711733728647232,
"num_tokens": 38642481.0,
"step": 20955
},
{
"entropy": 5.574460506439209,
"epoch": 1.760932577189666,
"grad_norm": 1.2890625,
"learning_rate": 0.0004689845032413712,
"loss": 5.3176,
"mean_token_accuracy": 0.16617531925439835,
"num_tokens": 38652345.0,
"step": 20960
},
{
"entropy": 5.6433697700500485,
"epoch": 1.761352657004831,
"grad_norm": 1.296875,
"learning_rate": 0.0004689693291814586,
"loss": 5.3489,
"mean_token_accuracy": 0.17165304273366927,
"num_tokens": 38661529.0,
"step": 20965
},
{
"entropy": 5.534882211685181,
"epoch": 1.7617727368199958,
"grad_norm": 1.2578125,
"learning_rate": 0.0004689541516854791,
"loss": 5.2566,
"mean_token_accuracy": 0.178298744559288,
"num_tokens": 38670191.0,
"step": 20970
},
{
"entropy": 5.533437728881836,
"epoch": 1.7621928166351606,
"grad_norm": 1.5078125,
"learning_rate": 0.0004689389707537018,
"loss": 5.3339,
"mean_token_accuracy": 0.17809650301933289,
"num_tokens": 38679089.0,
"step": 20975
},
{
"entropy": 5.5712419033050535,
"epoch": 1.7626128964503256,
"grad_norm": 1.296875,
"learning_rate": 0.00046892378638639545,
"loss": 5.2636,
"mean_token_accuracy": 0.18261911123991012,
"num_tokens": 38688821.0,
"step": 20980
},
{
"entropy": 5.622128677368164,
"epoch": 1.7630329762654906,
"grad_norm": 1.6953125,
"learning_rate": 0.00046890859858382913,
"loss": 5.3455,
"mean_token_accuracy": 0.16656963527202606,
"num_tokens": 38698232.0,
"step": 20985
},
{
"entropy": 5.7006600856781,
"epoch": 1.7634530560806554,
"grad_norm": 1.3046875,
"learning_rate": 0.0004688934073462718,
"loss": 5.4163,
"mean_token_accuracy": 0.16369396597146987,
"num_tokens": 38708090.0,
"step": 20990
},
{
"entropy": 5.635142040252686,
"epoch": 1.7638731358958202,
"grad_norm": 1.328125,
"learning_rate": 0.00046887821267399256,
"loss": 5.3194,
"mean_token_accuracy": 0.18188066631555558,
"num_tokens": 38717370.0,
"step": 20995
},
{
"entropy": 5.575509548187256,
"epoch": 1.764293215710985,
"grad_norm": 1.4453125,
"learning_rate": 0.0004688630145672607,
"loss": 5.2776,
"mean_token_accuracy": 0.1804552748799324,
"num_tokens": 38726758.0,
"step": 21000
},
{
"epoch": 1.764293215710985,
"eval_entropy": 5.370026161624236,
"eval_loss": 5.348424434661865,
"eval_mean_token_accuracy": 0.18274719849327753,
"eval_num_tokens": 38726758.0,
"eval_runtime": 27.5063,
"eval_samples_per_second": 1358.454,
"eval_steps_per_second": 169.816,
"step": 21000
},
{
"entropy": 5.5108339309692385,
"epoch": 1.76471329552615,
"grad_norm": 1.3125,
"learning_rate": 0.0004688478130263453,
"loss": 5.2675,
"mean_token_accuracy": 0.1718210682272911,
"num_tokens": 38736180.0,
"step": 21005
},
{
"entropy": 5.562737226486206,
"epoch": 1.765133375341315,
"grad_norm": 1.5859375,
"learning_rate": 0.0004688326080515157,
"loss": 5.2338,
"mean_token_accuracy": 0.1809652030467987,
"num_tokens": 38744529.0,
"step": 21010
},
{
"entropy": 5.4518450736999515,
"epoch": 1.7655534551564798,
"grad_norm": 1.3203125,
"learning_rate": 0.00046881739964304127,
"loss": 5.1528,
"mean_token_accuracy": 0.18581486344337464,
"num_tokens": 38753434.0,
"step": 21015
},
{
"entropy": 5.534206962585449,
"epoch": 1.7659735349716446,
"grad_norm": 1.375,
"learning_rate": 0.00046880218780119136,
"loss": 5.2705,
"mean_token_accuracy": 0.18431596457958221,
"num_tokens": 38762021.0,
"step": 21020
},
{
"entropy": 5.61903829574585,
"epoch": 1.7663936147868093,
"grad_norm": 1.4921875,
"learning_rate": 0.0004687869725262356,
"loss": 5.3827,
"mean_token_accuracy": 0.17010098695755005,
"num_tokens": 38771373.0,
"step": 21025
},
{
"entropy": 5.58247971534729,
"epoch": 1.7668136946019743,
"grad_norm": 1.25,
"learning_rate": 0.0004687717538184433,
"loss": 5.3534,
"mean_token_accuracy": 0.1779028668999672,
"num_tokens": 38780388.0,
"step": 21030
},
{
"entropy": 5.492437887191772,
"epoch": 1.7672337744171394,
"grad_norm": 1.2578125,
"learning_rate": 0.00046875653167808423,
"loss": 5.1953,
"mean_token_accuracy": 0.1803269624710083,
"num_tokens": 38789285.0,
"step": 21035
},
{
"entropy": 5.484241724014282,
"epoch": 1.7676538542323041,
"grad_norm": 1.359375,
"learning_rate": 0.00046874130610542796,
"loss": 5.2889,
"mean_token_accuracy": 0.1785847067832947,
"num_tokens": 38799321.0,
"step": 21040
},
{
"entropy": 5.652108764648437,
"epoch": 1.768073934047469,
"grad_norm": 1.421875,
"learning_rate": 0.0004687260771007442,
"loss": 5.2566,
"mean_token_accuracy": 0.17422612458467485,
"num_tokens": 38808515.0,
"step": 21045
},
{
"entropy": 5.556089258193969,
"epoch": 1.768494013862634,
"grad_norm": 1.4140625,
"learning_rate": 0.0004687108446643027,
"loss": 5.2183,
"mean_token_accuracy": 0.180132919549942,
"num_tokens": 38817634.0,
"step": 21050
},
{
"entropy": 5.639248371124268,
"epoch": 1.7689140936777987,
"grad_norm": 1.4296875,
"learning_rate": 0.0004686956087963734,
"loss": 5.4547,
"mean_token_accuracy": 0.17535045742988586,
"num_tokens": 38826766.0,
"step": 21055
},
{
"entropy": 5.513987350463867,
"epoch": 1.7693341734929637,
"grad_norm": 1.34375,
"learning_rate": 0.0004686803694972261,
"loss": 5.2106,
"mean_token_accuracy": 0.17584403306245805,
"num_tokens": 38835942.0,
"step": 21060
},
{
"entropy": 5.577651929855347,
"epoch": 1.7697542533081285,
"grad_norm": 1.2578125,
"learning_rate": 0.00046866512676713075,
"loss": 5.2828,
"mean_token_accuracy": 0.16697818487882615,
"num_tokens": 38845691.0,
"step": 21065
},
{
"entropy": 5.57818717956543,
"epoch": 1.7701743331232933,
"grad_norm": 1.5234375,
"learning_rate": 0.00046864988060635744,
"loss": 5.2912,
"mean_token_accuracy": 0.17076105773448944,
"num_tokens": 38855737.0,
"step": 21070
},
{
"entropy": 5.558654069900513,
"epoch": 1.7705944129384583,
"grad_norm": 1.3125,
"learning_rate": 0.0004686346310151762,
"loss": 5.3034,
"mean_token_accuracy": 0.17694543898105622,
"num_tokens": 38864887.0,
"step": 21075
},
{
"entropy": 5.6088512420654295,
"epoch": 1.7710144927536233,
"grad_norm": 1.5859375,
"learning_rate": 0.00046861937799385717,
"loss": 5.2641,
"mean_token_accuracy": 0.17852309942245484,
"num_tokens": 38873924.0,
"step": 21080
},
{
"entropy": 5.576352643966675,
"epoch": 1.7714345725687881,
"grad_norm": 1.6640625,
"learning_rate": 0.0004686041215426706,
"loss": 5.3292,
"mean_token_accuracy": 0.17449680864810943,
"num_tokens": 38883447.0,
"step": 21085
},
{
"entropy": 5.568763208389282,
"epoch": 1.771854652383953,
"grad_norm": 1.53125,
"learning_rate": 0.0004685888616618867,
"loss": 5.3223,
"mean_token_accuracy": 0.1743033543229103,
"num_tokens": 38892389.0,
"step": 21090
},
{
"entropy": 5.596828842163086,
"epoch": 1.7722747321991177,
"grad_norm": 1.34375,
"learning_rate": 0.00046857359835177575,
"loss": 5.3587,
"mean_token_accuracy": 0.17608010172843933,
"num_tokens": 38901574.0,
"step": 21095
},
{
"entropy": 5.614238834381103,
"epoch": 1.7726948120142827,
"grad_norm": 1.359375,
"learning_rate": 0.00046855833161260825,
"loss": 5.3331,
"mean_token_accuracy": 0.1758733034133911,
"num_tokens": 38910070.0,
"step": 21100
},
{
"entropy": 5.576064538955689,
"epoch": 1.7731148918294477,
"grad_norm": 1.328125,
"learning_rate": 0.0004685430614446545,
"loss": 5.2405,
"mean_token_accuracy": 0.17795732915401458,
"num_tokens": 38919868.0,
"step": 21105
},
{
"entropy": 5.600339698791504,
"epoch": 1.7735349716446125,
"grad_norm": 1.453125,
"learning_rate": 0.0004685277878481852,
"loss": 5.3052,
"mean_token_accuracy": 0.17355633080005645,
"num_tokens": 38928840.0,
"step": 21110
},
{
"entropy": 5.5694554328918455,
"epoch": 1.7739550514597773,
"grad_norm": 1.3671875,
"learning_rate": 0.00046851251082347063,
"loss": 5.3661,
"mean_token_accuracy": 0.17163064181804658,
"num_tokens": 38938112.0,
"step": 21115
},
{
"entropy": 5.59302248954773,
"epoch": 1.7743751312749423,
"grad_norm": 1.296875,
"learning_rate": 0.0004684972303707816,
"loss": 5.3001,
"mean_token_accuracy": 0.1774540662765503,
"num_tokens": 38947463.0,
"step": 21120
},
{
"entropy": 5.654083490371704,
"epoch": 1.774795211090107,
"grad_norm": 1.3828125,
"learning_rate": 0.0004684819464903888,
"loss": 5.4538,
"mean_token_accuracy": 0.16484599113464354,
"num_tokens": 38957221.0,
"step": 21125
},
{
"entropy": 5.5309038162231445,
"epoch": 1.775215290905272,
"grad_norm": 1.46875,
"learning_rate": 0.000468466659182563,
"loss": 5.2078,
"mean_token_accuracy": 0.1796431288123131,
"num_tokens": 38966656.0,
"step": 21130
},
{
"entropy": 5.5446457862854,
"epoch": 1.7756353707204369,
"grad_norm": 1.4296875,
"learning_rate": 0.0004684513684475749,
"loss": 5.2093,
"mean_token_accuracy": 0.18359885811805726,
"num_tokens": 38975281.0,
"step": 21135
},
{
"entropy": 5.594289636611938,
"epoch": 1.7760554505356017,
"grad_norm": 1.2890625,
"learning_rate": 0.00046843607428569546,
"loss": 5.3379,
"mean_token_accuracy": 0.1814683884382248,
"num_tokens": 38985147.0,
"step": 21140
},
{
"entropy": 5.58348650932312,
"epoch": 1.7764755303507667,
"grad_norm": 1.34375,
"learning_rate": 0.00046842077669719554,
"loss": 5.1385,
"mean_token_accuracy": 0.18595415204763413,
"num_tokens": 38994104.0,
"step": 21145
},
{
"entropy": 5.594953155517578,
"epoch": 1.7768956101659317,
"grad_norm": 1.2421875,
"learning_rate": 0.00046840547568234613,
"loss": 5.3385,
"mean_token_accuracy": 0.17065630108118057,
"num_tokens": 39003983.0,
"step": 21150
},
{
"entropy": 5.505486059188843,
"epoch": 1.7773156899810965,
"grad_norm": 1.265625,
"learning_rate": 0.00046839017124141835,
"loss": 5.2367,
"mean_token_accuracy": 0.18241161853075027,
"num_tokens": 39012636.0,
"step": 21155
},
{
"entropy": 5.564147043228149,
"epoch": 1.7777357697962612,
"grad_norm": 1.265625,
"learning_rate": 0.00046837486337468335,
"loss": 5.3592,
"mean_token_accuracy": 0.1752906173467636,
"num_tokens": 39022173.0,
"step": 21160
},
{
"entropy": 5.710106563568115,
"epoch": 1.778155849611426,
"grad_norm": 1.421875,
"learning_rate": 0.000468359552082412,
"loss": 5.3554,
"mean_token_accuracy": 0.16831561774015427,
"num_tokens": 39032651.0,
"step": 21165
},
{
"entropy": 5.6070699214935305,
"epoch": 1.778575929426591,
"grad_norm": 1.328125,
"learning_rate": 0.0004683442373648759,
"loss": 5.2852,
"mean_token_accuracy": 0.17293607741594313,
"num_tokens": 39041543.0,
"step": 21170
},
{
"entropy": 5.474166440963745,
"epoch": 1.778996009241756,
"grad_norm": 1.5,
"learning_rate": 0.0004683289192223462,
"loss": 5.2444,
"mean_token_accuracy": 0.1783604457974434,
"num_tokens": 39050467.0,
"step": 21175
},
{
"entropy": 5.628903436660766,
"epoch": 1.7794160890569208,
"grad_norm": 1.375,
"learning_rate": 0.00046831359765509424,
"loss": 5.3174,
"mean_token_accuracy": 0.1688731476664543,
"num_tokens": 39059224.0,
"step": 21180
},
{
"entropy": 5.653453254699707,
"epoch": 1.7798361688720856,
"grad_norm": 1.3984375,
"learning_rate": 0.00046829827266339134,
"loss": 5.348,
"mean_token_accuracy": 0.17437512129545213,
"num_tokens": 39067884.0,
"step": 21185
},
{
"entropy": 5.596850299835205,
"epoch": 1.7802562486872506,
"grad_norm": 1.3125,
"learning_rate": 0.00046828294424750916,
"loss": 5.3002,
"mean_token_accuracy": 0.1735824912786484,
"num_tokens": 39076774.0,
"step": 21190
},
{
"entropy": 5.579911470413208,
"epoch": 1.7806763285024154,
"grad_norm": 1.4296875,
"learning_rate": 0.0004682676124077192,
"loss": 5.2245,
"mean_token_accuracy": 0.17956041991710664,
"num_tokens": 39086021.0,
"step": 21195
},
{
"entropy": 5.589482593536377,
"epoch": 1.7810964083175804,
"grad_norm": 1.21875,
"learning_rate": 0.00046825227714429287,
"loss": 5.2407,
"mean_token_accuracy": 0.1770210087299347,
"num_tokens": 39095682.0,
"step": 21200
},
{
"entropy": 5.51285719871521,
"epoch": 1.7815164881327452,
"grad_norm": 1.3515625,
"learning_rate": 0.00046823693845750205,
"loss": 5.2737,
"mean_token_accuracy": 0.17985749244689941,
"num_tokens": 39104904.0,
"step": 21205
},
{
"entropy": 5.657285499572754,
"epoch": 1.78193656794791,
"grad_norm": 1.3515625,
"learning_rate": 0.00046822159634761837,
"loss": 5.4152,
"mean_token_accuracy": 0.16899724900722504,
"num_tokens": 39113128.0,
"step": 21210
},
{
"entropy": 5.559497261047364,
"epoch": 1.782356647763075,
"grad_norm": 1.390625,
"learning_rate": 0.0004682062508149136,
"loss": 5.2481,
"mean_token_accuracy": 0.17521494179964064,
"num_tokens": 39122503.0,
"step": 21215
},
{
"entropy": 5.542518663406372,
"epoch": 1.78277672757824,
"grad_norm": 1.2421875,
"learning_rate": 0.0004681909018596595,
"loss": 5.2587,
"mean_token_accuracy": 0.17790466248989106,
"num_tokens": 39132020.0,
"step": 21220
},
{
"entropy": 5.54741325378418,
"epoch": 1.7831968073934048,
"grad_norm": 1.515625,
"learning_rate": 0.00046817554948212813,
"loss": 5.2916,
"mean_token_accuracy": 0.17278432846069336,
"num_tokens": 39141542.0,
"step": 21225
},
{
"entropy": 5.615826272964478,
"epoch": 1.7836168872085696,
"grad_norm": 1.2890625,
"learning_rate": 0.00046816019368259136,
"loss": 5.2987,
"mean_token_accuracy": 0.1843265950679779,
"num_tokens": 39151573.0,
"step": 21230
},
{
"entropy": 5.573117733001709,
"epoch": 1.7840369670237344,
"grad_norm": 1.5234375,
"learning_rate": 0.0004681448344613212,
"loss": 5.3038,
"mean_token_accuracy": 0.1892576277256012,
"num_tokens": 39160023.0,
"step": 21235
},
{
"entropy": 5.521038436889649,
"epoch": 1.7844570468388994,
"grad_norm": 1.2890625,
"learning_rate": 0.00046812947181858986,
"loss": 5.2878,
"mean_token_accuracy": 0.17681599855422975,
"num_tokens": 39169335.0,
"step": 21240
},
{
"entropy": 5.65053014755249,
"epoch": 1.7848771266540644,
"grad_norm": 1.375,
"learning_rate": 0.0004681141057546693,
"loss": 5.3749,
"mean_token_accuracy": 0.16727607399225236,
"num_tokens": 39177953.0,
"step": 21245
},
{
"entropy": 5.600383281707764,
"epoch": 1.7852972064692292,
"grad_norm": 1.265625,
"learning_rate": 0.00046809873626983174,
"loss": 5.2821,
"mean_token_accuracy": 0.17690058201551437,
"num_tokens": 39188984.0,
"step": 21250
},
{
"entropy": 5.59075927734375,
"epoch": 1.785717286284394,
"grad_norm": 1.375,
"learning_rate": 0.00046808336336434946,
"loss": 5.2704,
"mean_token_accuracy": 0.1825893297791481,
"num_tokens": 39198033.0,
"step": 21255
},
{
"entropy": 5.569455242156982,
"epoch": 1.7861373660995588,
"grad_norm": 1.421875,
"learning_rate": 0.00046806798703849495,
"loss": 5.2417,
"mean_token_accuracy": 0.18158730119466782,
"num_tokens": 39207429.0,
"step": 21260
},
{
"entropy": 5.54582028388977,
"epoch": 1.7865574459147238,
"grad_norm": 1.421875,
"learning_rate": 0.0004680526072925404,
"loss": 5.2852,
"mean_token_accuracy": 0.17962480187416077,
"num_tokens": 39216484.0,
"step": 21265
},
{
"entropy": 5.663633823394775,
"epoch": 1.7869775257298888,
"grad_norm": 1.421875,
"learning_rate": 0.00046803722412675836,
"loss": 5.3449,
"mean_token_accuracy": 0.17407129257917403,
"num_tokens": 39226385.0,
"step": 21270
},
{
"entropy": 5.609802389144898,
"epoch": 1.7873976055450536,
"grad_norm": 1.40625,
"learning_rate": 0.00046802183754142125,
"loss": 5.29,
"mean_token_accuracy": 0.17717466354370118,
"num_tokens": 39235424.0,
"step": 21275
},
{
"entropy": 5.535266304016114,
"epoch": 1.7878176853602183,
"grad_norm": 1.3125,
"learning_rate": 0.0004680064475368017,
"loss": 5.2415,
"mean_token_accuracy": 0.17994185090065,
"num_tokens": 39244109.0,
"step": 21280
},
{
"entropy": 5.541873598098755,
"epoch": 1.7882377651753834,
"grad_norm": 1.390625,
"learning_rate": 0.00046799105411317234,
"loss": 5.2909,
"mean_token_accuracy": 0.18489663153886796,
"num_tokens": 39253685.0,
"step": 21285
},
{
"entropy": 5.570934867858886,
"epoch": 1.7886578449905484,
"grad_norm": 1.421875,
"learning_rate": 0.00046797565727080585,
"loss": 5.2574,
"mean_token_accuracy": 0.17506163120269774,
"num_tokens": 39262743.0,
"step": 21290
},
{
"entropy": 5.550674486160278,
"epoch": 1.7890779248057131,
"grad_norm": 1.296875,
"learning_rate": 0.00046796025700997484,
"loss": 5.1808,
"mean_token_accuracy": 0.19061725884675979,
"num_tokens": 39270962.0,
"step": 21295
},
{
"entropy": 5.549523401260376,
"epoch": 1.789498004620878,
"grad_norm": 1.2578125,
"learning_rate": 0.0004679448533309523,
"loss": 5.2765,
"mean_token_accuracy": 0.18288866132497789,
"num_tokens": 39279994.0,
"step": 21300
},
{
"entropy": 5.577452230453491,
"epoch": 1.7899180844360427,
"grad_norm": 1.3671875,
"learning_rate": 0.00046792944623401107,
"loss": 5.3206,
"mean_token_accuracy": 0.17367657870054246,
"num_tokens": 39289481.0,
"step": 21305
},
{
"entropy": 5.67880425453186,
"epoch": 1.7903381642512077,
"grad_norm": 1.2421875,
"learning_rate": 0.00046791403571942405,
"loss": 5.4027,
"mean_token_accuracy": 0.16363756358623505,
"num_tokens": 39298383.0,
"step": 21310
},
{
"entropy": 5.568470525741577,
"epoch": 1.7907582440663727,
"grad_norm": 1.3359375,
"learning_rate": 0.0004678986217874642,
"loss": 5.2865,
"mean_token_accuracy": 0.17316093295812607,
"num_tokens": 39307809.0,
"step": 21315
},
{
"entropy": 5.519428348541259,
"epoch": 1.7911783238815375,
"grad_norm": 1.2890625,
"learning_rate": 0.00046788320443840457,
"loss": 5.1692,
"mean_token_accuracy": 0.18938619196414946,
"num_tokens": 39316332.0,
"step": 21320
},
{
"entropy": 5.541726160049438,
"epoch": 1.7915984036967023,
"grad_norm": 1.328125,
"learning_rate": 0.00046786778367251833,
"loss": 5.2067,
"mean_token_accuracy": 0.17708574384450912,
"num_tokens": 39325672.0,
"step": 21325
},
{
"entropy": 5.522161817550659,
"epoch": 1.792018483511867,
"grad_norm": 1.3515625,
"learning_rate": 0.00046785235949007854,
"loss": 5.2875,
"mean_token_accuracy": 0.17861751466989517,
"num_tokens": 39334478.0,
"step": 21330
},
{
"entropy": 5.427237558364868,
"epoch": 1.792438563327032,
"grad_norm": 1.2890625,
"learning_rate": 0.00046783693189135863,
"loss": 5.1713,
"mean_token_accuracy": 0.1787275567650795,
"num_tokens": 39343573.0,
"step": 21335
},
{
"entropy": 5.565889596939087,
"epoch": 1.7928586431421971,
"grad_norm": 1.2890625,
"learning_rate": 0.00046782150087663167,
"loss": 5.2137,
"mean_token_accuracy": 0.18441734760999678,
"num_tokens": 39351956.0,
"step": 21340
},
{
"entropy": 5.609550142288208,
"epoch": 1.793278722957362,
"grad_norm": 1.4921875,
"learning_rate": 0.0004678060664461711,
"loss": 5.3746,
"mean_token_accuracy": 0.16671205312013626,
"num_tokens": 39361911.0,
"step": 21345
},
{
"entropy": 5.636532258987427,
"epoch": 1.7936988027725267,
"grad_norm": 1.203125,
"learning_rate": 0.0004677906286002504,
"loss": 5.3241,
"mean_token_accuracy": 0.16892957389354707,
"num_tokens": 39370916.0,
"step": 21350
},
{
"entropy": 5.596935033798218,
"epoch": 1.7941188825876917,
"grad_norm": 1.2578125,
"learning_rate": 0.0004677751873391429,
"loss": 5.3262,
"mean_token_accuracy": 0.17588718235492706,
"num_tokens": 39380662.0,
"step": 21355
},
{
"entropy": 5.565099096298217,
"epoch": 1.7945389624028567,
"grad_norm": 1.2109375,
"learning_rate": 0.00046775974266312234,
"loss": 5.2403,
"mean_token_accuracy": 0.1833554670214653,
"num_tokens": 39389644.0,
"step": 21360
},
{
"entropy": 5.571610689163208,
"epoch": 1.7949590422180215,
"grad_norm": 1.46875,
"learning_rate": 0.00046774429457246215,
"loss": 5.25,
"mean_token_accuracy": 0.17542143762111664,
"num_tokens": 39398662.0,
"step": 21365
},
{
"entropy": 5.560281419754029,
"epoch": 1.7953791220331863,
"grad_norm": 1.2578125,
"learning_rate": 0.000467728843067436,
"loss": 5.3066,
"mean_token_accuracy": 0.17805775701999665,
"num_tokens": 39408064.0,
"step": 21370
},
{
"entropy": 5.593321990966797,
"epoch": 1.795799201848351,
"grad_norm": 1.4296875,
"learning_rate": 0.0004677133881483177,
"loss": 5.3428,
"mean_token_accuracy": 0.1693786695599556,
"num_tokens": 39418991.0,
"step": 21375
},
{
"entropy": 5.536650276184082,
"epoch": 1.796219281663516,
"grad_norm": 1.3125,
"learning_rate": 0.0004676979298153809,
"loss": 5.2099,
"mean_token_accuracy": 0.17939385324716567,
"num_tokens": 39428707.0,
"step": 21380
},
{
"entropy": 5.622062349319458,
"epoch": 1.796639361478681,
"grad_norm": 1.328125,
"learning_rate": 0.0004676824680688996,
"loss": 5.3522,
"mean_token_accuracy": 0.1810104191303253,
"num_tokens": 39437173.0,
"step": 21385
},
{
"entropy": 5.587282991409301,
"epoch": 1.7970594412938459,
"grad_norm": 1.59375,
"learning_rate": 0.00046766700290914743,
"loss": 5.2648,
"mean_token_accuracy": 0.17317934334278107,
"num_tokens": 39446336.0,
"step": 21390
},
{
"entropy": 5.648490476608276,
"epoch": 1.7974795211090107,
"grad_norm": 1.3203125,
"learning_rate": 0.00046765153433639856,
"loss": 5.4733,
"mean_token_accuracy": 0.16446711868047714,
"num_tokens": 39456129.0,
"step": 21395
},
{
"entropy": 5.578564453125,
"epoch": 1.7978996009241754,
"grad_norm": 1.28125,
"learning_rate": 0.00046763606235092705,
"loss": 5.3095,
"mean_token_accuracy": 0.18040735721588136,
"num_tokens": 39465386.0,
"step": 21400
},
{
"entropy": 5.588835763931274,
"epoch": 1.7983196807393405,
"grad_norm": 1.2734375,
"learning_rate": 0.0004676205869530068,
"loss": 5.3656,
"mean_token_accuracy": 0.178868567943573,
"num_tokens": 39475085.0,
"step": 21405
},
{
"entropy": 5.595099401473999,
"epoch": 1.7987397605545055,
"grad_norm": 1.25,
"learning_rate": 0.00046760510814291206,
"loss": 5.3735,
"mean_token_accuracy": 0.17388088405132293,
"num_tokens": 39484500.0,
"step": 21410
},
{
"entropy": 5.563240337371826,
"epoch": 1.7991598403696702,
"grad_norm": 1.2890625,
"learning_rate": 0.000467589625920917,
"loss": 5.2676,
"mean_token_accuracy": 0.17611514925956726,
"num_tokens": 39494049.0,
"step": 21415
},
{
"entropy": 5.559879112243652,
"epoch": 1.799579920184835,
"grad_norm": 1.21875,
"learning_rate": 0.000467574140287296,
"loss": 5.2729,
"mean_token_accuracy": 0.1770334795117378,
"num_tokens": 39502874.0,
"step": 21420
},
{
"entropy": 5.527122831344604,
"epoch": 1.8,
"grad_norm": 1.421875,
"learning_rate": 0.0004675586512423231,
"loss": 5.2973,
"mean_token_accuracy": 0.16903972178697585,
"num_tokens": 39512371.0,
"step": 21425
},
{
"entropy": 5.549544763565064,
"epoch": 1.8004200798151648,
"grad_norm": 1.25,
"learning_rate": 0.000467543158786273,
"loss": 5.329,
"mean_token_accuracy": 0.17464708387851716,
"num_tokens": 39521477.0,
"step": 21430
},
{
"entropy": 5.605368709564209,
"epoch": 1.8008401596303298,
"grad_norm": 1.28125,
"learning_rate": 0.00046752766291941985,
"loss": 5.3273,
"mean_token_accuracy": 0.16941720098257065,
"num_tokens": 39530072.0,
"step": 21435
},
{
"entropy": 5.586881399154663,
"epoch": 1.8012602394454946,
"grad_norm": 1.3671875,
"learning_rate": 0.0004675121636420383,
"loss": 5.3154,
"mean_token_accuracy": 0.1716737389564514,
"num_tokens": 39540762.0,
"step": 21440
},
{
"entropy": 5.57447247505188,
"epoch": 1.8016803192606594,
"grad_norm": 1.2265625,
"learning_rate": 0.000467496660954403,
"loss": 5.3317,
"mean_token_accuracy": 0.1700157880783081,
"num_tokens": 39549699.0,
"step": 21445
},
{
"entropy": 5.5867876529693605,
"epoch": 1.8021003990758244,
"grad_norm": 1.2890625,
"learning_rate": 0.00046748115485678837,
"loss": 5.373,
"mean_token_accuracy": 0.17287895381450652,
"num_tokens": 39558725.0,
"step": 21450
},
{
"entropy": 5.557333517074585,
"epoch": 1.8025204788909894,
"grad_norm": 1.2578125,
"learning_rate": 0.00046746564534946926,
"loss": 5.2205,
"mean_token_accuracy": 0.17810179591178893,
"num_tokens": 39567357.0,
"step": 21455
},
{
"entropy": 5.536188173294067,
"epoch": 1.8029405587061542,
"grad_norm": 1.4453125,
"learning_rate": 0.0004674501324327203,
"loss": 5.1968,
"mean_token_accuracy": 0.18433017581701278,
"num_tokens": 39576147.0,
"step": 21460
},
{
"entropy": 5.552721405029297,
"epoch": 1.803360638521319,
"grad_norm": 1.3671875,
"learning_rate": 0.00046743461610681636,
"loss": 5.3569,
"mean_token_accuracy": 0.18128087520599365,
"num_tokens": 39584963.0,
"step": 21465
},
{
"entropy": 5.532331895828247,
"epoch": 1.8037807183364838,
"grad_norm": 1.25,
"learning_rate": 0.0004674190963720323,
"loss": 5.2473,
"mean_token_accuracy": 0.18094887733459472,
"num_tokens": 39594420.0,
"step": 21470
},
{
"entropy": 5.558094120025634,
"epoch": 1.8042007981516488,
"grad_norm": 1.3203125,
"learning_rate": 0.000467403573228643,
"loss": 5.3205,
"mean_token_accuracy": 0.1709510400891304,
"num_tokens": 39603276.0,
"step": 21475
},
{
"entropy": 5.5868173122406,
"epoch": 1.8046208779668138,
"grad_norm": 1.3203125,
"learning_rate": 0.0004673880466769235,
"loss": 5.387,
"mean_token_accuracy": 0.17044233977794648,
"num_tokens": 39613161.0,
"step": 21480
},
{
"entropy": 5.551188278198242,
"epoch": 1.8050409577819786,
"grad_norm": 1.3046875,
"learning_rate": 0.00046737251671714886,
"loss": 5.2068,
"mean_token_accuracy": 0.18112777322530746,
"num_tokens": 39621889.0,
"step": 21485
},
{
"entropy": 5.581471395492554,
"epoch": 1.8054610375971434,
"grad_norm": 1.2890625,
"learning_rate": 0.00046735698334959407,
"loss": 5.4047,
"mean_token_accuracy": 0.1765786126255989,
"num_tokens": 39632009.0,
"step": 21490
},
{
"entropy": 5.634868240356445,
"epoch": 1.8058811174123084,
"grad_norm": 1.4296875,
"learning_rate": 0.00046734144657453443,
"loss": 5.3034,
"mean_token_accuracy": 0.17712171599268914,
"num_tokens": 39640639.0,
"step": 21495
},
{
"entropy": 5.535697793960571,
"epoch": 1.8063011972274732,
"grad_norm": 1.28125,
"learning_rate": 0.00046732590639224505,
"loss": 5.3134,
"mean_token_accuracy": 0.18016737848520278,
"num_tokens": 39649837.0,
"step": 21500
},
{
"entropy": 5.549190711975098,
"epoch": 1.8067212770426382,
"grad_norm": 1.40625,
"learning_rate": 0.00046731036280300126,
"loss": 5.3191,
"mean_token_accuracy": 0.17959260940551758,
"num_tokens": 39659890.0,
"step": 21505
},
{
"entropy": 5.587081003189087,
"epoch": 1.807141356857803,
"grad_norm": 1.40625,
"learning_rate": 0.00046729481580707846,
"loss": 5.2352,
"mean_token_accuracy": 0.17576223909854888,
"num_tokens": 39669550.0,
"step": 21510
},
{
"entropy": 5.559582233428955,
"epoch": 1.8075614366729678,
"grad_norm": 1.5390625,
"learning_rate": 0.00046727926540475207,
"loss": 5.254,
"mean_token_accuracy": 0.17681893855333328,
"num_tokens": 39678471.0,
"step": 21515
},
{
"entropy": 5.463067817687988,
"epoch": 1.8079815164881328,
"grad_norm": 1.28125,
"learning_rate": 0.0004672637115962974,
"loss": 5.1891,
"mean_token_accuracy": 0.18444477021694183,
"num_tokens": 39686600.0,
"step": 21520
},
{
"entropy": 5.491542530059815,
"epoch": 1.8084015963032978,
"grad_norm": 1.265625,
"learning_rate": 0.00046724815438199007,
"loss": 5.325,
"mean_token_accuracy": 0.17332013547420502,
"num_tokens": 39696848.0,
"step": 21525
},
{
"entropy": 5.5200439453125,
"epoch": 1.8088216761184626,
"grad_norm": 1.3125,
"learning_rate": 0.00046723259376210577,
"loss": 5.2534,
"mean_token_accuracy": 0.1806939423084259,
"num_tokens": 39706051.0,
"step": 21530
},
{
"entropy": 5.647727584838867,
"epoch": 1.8092417559336273,
"grad_norm": 1.3359375,
"learning_rate": 0.00046721702973692,
"loss": 5.3168,
"mean_token_accuracy": 0.1661395773291588,
"num_tokens": 39716035.0,
"step": 21535
},
{
"entropy": 5.58458456993103,
"epoch": 1.8096618357487921,
"grad_norm": 1.4453125,
"learning_rate": 0.00046720146230670853,
"loss": 5.2894,
"mean_token_accuracy": 0.17462138831615448,
"num_tokens": 39725717.0,
"step": 21540
},
{
"entropy": 5.552171087265014,
"epoch": 1.8100819155639571,
"grad_norm": 1.34375,
"learning_rate": 0.0004671858914717471,
"loss": 5.3241,
"mean_token_accuracy": 0.17308111637830734,
"num_tokens": 39734543.0,
"step": 21545
},
{
"entropy": 5.568116855621338,
"epoch": 1.8105019953791222,
"grad_norm": 1.3671875,
"learning_rate": 0.00046717031723231164,
"loss": 5.3251,
"mean_token_accuracy": 0.18243110924959183,
"num_tokens": 39744503.0,
"step": 21550
},
{
"entropy": 5.5745340347290036,
"epoch": 1.810922075194287,
"grad_norm": 1.28125,
"learning_rate": 0.0004671547395886779,
"loss": 5.307,
"mean_token_accuracy": 0.16971287429332732,
"num_tokens": 39753484.0,
"step": 21555
},
{
"entropy": 5.54430890083313,
"epoch": 1.8113421550094517,
"grad_norm": 1.359375,
"learning_rate": 0.0004671391585411219,
"loss": 5.227,
"mean_token_accuracy": 0.1830910786986351,
"num_tokens": 39762673.0,
"step": 21560
},
{
"entropy": 5.587272071838379,
"epoch": 1.8117622348246165,
"grad_norm": 1.21875,
"learning_rate": 0.00046712357408991965,
"loss": 5.3829,
"mean_token_accuracy": 0.1683663472533226,
"num_tokens": 39773138.0,
"step": 21565
},
{
"entropy": 5.631209993362427,
"epoch": 1.8121823146397815,
"grad_norm": 1.359375,
"learning_rate": 0.0004671079862353472,
"loss": 5.3748,
"mean_token_accuracy": 0.16770460456609726,
"num_tokens": 39782282.0,
"step": 21570
},
{
"entropy": 5.601681280136108,
"epoch": 1.8126023944549465,
"grad_norm": 1.3125,
"learning_rate": 0.00046709239497768067,
"loss": 5.2798,
"mean_token_accuracy": 0.1757730782032013,
"num_tokens": 39792035.0,
"step": 21575
},
{
"entropy": 5.700687885284424,
"epoch": 1.8130224742701113,
"grad_norm": 1.21875,
"learning_rate": 0.00046707680031719633,
"loss": 5.3748,
"mean_token_accuracy": 0.1745245486497879,
"num_tokens": 39801696.0,
"step": 21580
},
{
"entropy": 5.67553186416626,
"epoch": 1.813442554085276,
"grad_norm": 1.2734375,
"learning_rate": 0.0004670612022541705,
"loss": 5.3841,
"mean_token_accuracy": 0.1732521340250969,
"num_tokens": 39811449.0,
"step": 21585
},
{
"entropy": 5.563176584243775,
"epoch": 1.8138626339004411,
"grad_norm": 1.3515625,
"learning_rate": 0.0004670456007888792,
"loss": 5.3507,
"mean_token_accuracy": 0.17504584640264512,
"num_tokens": 39820339.0,
"step": 21590
},
{
"entropy": 5.530000305175781,
"epoch": 1.8142827137156061,
"grad_norm": 1.40625,
"learning_rate": 0.0004670299959215989,
"loss": 5.2662,
"mean_token_accuracy": 0.17913613468408585,
"num_tokens": 39829861.0,
"step": 21595
},
{
"entropy": 5.579708480834961,
"epoch": 1.814702793530771,
"grad_norm": 1.3203125,
"learning_rate": 0.0004670143876526062,
"loss": 5.2456,
"mean_token_accuracy": 0.1821893870830536,
"num_tokens": 39838568.0,
"step": 21600
},
{
"entropy": 5.586691045761109,
"epoch": 1.8151228733459357,
"grad_norm": 1.2890625,
"learning_rate": 0.00046699877598217754,
"loss": 5.2516,
"mean_token_accuracy": 0.1782296732068062,
"num_tokens": 39847705.0,
"step": 21605
},
{
"entropy": 5.537769460678101,
"epoch": 1.8155429531611005,
"grad_norm": 1.28125,
"learning_rate": 0.00046698316091058946,
"loss": 5.3419,
"mean_token_accuracy": 0.1734850823879242,
"num_tokens": 39856700.0,
"step": 21610
},
{
"entropy": 5.569580602645874,
"epoch": 1.8159630329762655,
"grad_norm": 1.3046875,
"learning_rate": 0.00046696754243811845,
"loss": 5.2304,
"mean_token_accuracy": 0.17954822778701782,
"num_tokens": 39865647.0,
"step": 21615
},
{
"entropy": 5.636335277557373,
"epoch": 1.8163831127914305,
"grad_norm": 1.359375,
"learning_rate": 0.0004669519205650413,
"loss": 5.3669,
"mean_token_accuracy": 0.1728852078318596,
"num_tokens": 39874705.0,
"step": 21620
},
{
"entropy": 5.562269020080566,
"epoch": 1.8168031926065953,
"grad_norm": 1.25,
"learning_rate": 0.00046693629529163467,
"loss": 5.1758,
"mean_token_accuracy": 0.18285553008317948,
"num_tokens": 39883795.0,
"step": 21625
},
{
"entropy": 5.541683864593506,
"epoch": 1.81722327242176,
"grad_norm": 1.375,
"learning_rate": 0.0004669206666181755,
"loss": 5.2657,
"mean_token_accuracy": 0.18557592630386352,
"num_tokens": 39893165.0,
"step": 21630
},
{
"entropy": 5.480154943466187,
"epoch": 1.8176433522369249,
"grad_norm": 1.34375,
"learning_rate": 0.0004669050345449404,
"loss": 5.2991,
"mean_token_accuracy": 0.1712697207927704,
"num_tokens": 39902241.0,
"step": 21635
},
{
"entropy": 5.562261343002319,
"epoch": 1.8180634320520899,
"grad_norm": 1.3359375,
"learning_rate": 0.0004668893990722066,
"loss": 5.2601,
"mean_token_accuracy": 0.1723608687520027,
"num_tokens": 39911211.0,
"step": 21640
},
{
"entropy": 5.540521192550659,
"epoch": 1.8184835118672549,
"grad_norm": 1.25,
"learning_rate": 0.0004668737602002508,
"loss": 5.2744,
"mean_token_accuracy": 0.17217839658260345,
"num_tokens": 39920192.0,
"step": 21645
},
{
"entropy": 5.586435842514038,
"epoch": 1.8189035916824197,
"grad_norm": 1.21875,
"learning_rate": 0.00046685811792935016,
"loss": 5.298,
"mean_token_accuracy": 0.18180651664733888,
"num_tokens": 39929169.0,
"step": 21650
},
{
"entropy": 5.578384208679199,
"epoch": 1.8193236714975844,
"grad_norm": 1.25,
"learning_rate": 0.00046684247225978176,
"loss": 5.3082,
"mean_token_accuracy": 0.17389074563980103,
"num_tokens": 39939333.0,
"step": 21655
},
{
"entropy": 5.5443199634552,
"epoch": 1.8197437513127495,
"grad_norm": 1.5546875,
"learning_rate": 0.00046682682319182275,
"loss": 5.3012,
"mean_token_accuracy": 0.1689795657992363,
"num_tokens": 39948042.0,
"step": 21660
},
{
"entropy": 5.580575084686279,
"epoch": 1.8201638311279145,
"grad_norm": 1.34375,
"learning_rate": 0.00046681117072575035,
"loss": 5.2447,
"mean_token_accuracy": 0.1803833782672882,
"num_tokens": 39956847.0,
"step": 21665
},
{
"entropy": 5.7115052223205565,
"epoch": 1.8205839109430793,
"grad_norm": 3.25,
"learning_rate": 0.0004667955148618418,
"loss": 5.4863,
"mean_token_accuracy": 0.16371891498565674,
"num_tokens": 39966598.0,
"step": 21670
},
{
"entropy": 5.5138585567474365,
"epoch": 1.821003990758244,
"grad_norm": 1.3125,
"learning_rate": 0.0004667798556003745,
"loss": 5.1433,
"mean_token_accuracy": 0.17832039296627045,
"num_tokens": 39975236.0,
"step": 21675
},
{
"entropy": 5.537216472625732,
"epoch": 1.8214240705734088,
"grad_norm": 1.2578125,
"learning_rate": 0.0004667641929416258,
"loss": 5.2974,
"mean_token_accuracy": 0.17921656221151352,
"num_tokens": 39984582.0,
"step": 21680
},
{
"entropy": 5.569884777069092,
"epoch": 1.8218441503885738,
"grad_norm": 1.21875,
"learning_rate": 0.0004667485268858731,
"loss": 5.3052,
"mean_token_accuracy": 0.17619964182376863,
"num_tokens": 39993122.0,
"step": 21685
},
{
"entropy": 5.549847030639649,
"epoch": 1.8222642302037388,
"grad_norm": 1.3984375,
"learning_rate": 0.00046673285743339406,
"loss": 5.2584,
"mean_token_accuracy": 0.1777494266629219,
"num_tokens": 40002974.0,
"step": 21690
},
{
"entropy": 5.542484521865845,
"epoch": 1.8226843100189036,
"grad_norm": 1.3125,
"learning_rate": 0.00046671718458446616,
"loss": 5.3139,
"mean_token_accuracy": 0.17635403722524642,
"num_tokens": 40011790.0,
"step": 21695
},
{
"entropy": 5.64197301864624,
"epoch": 1.8231043898340684,
"grad_norm": 1.375,
"learning_rate": 0.0004667015083393671,
"loss": 5.3046,
"mean_token_accuracy": 0.17413092255592347,
"num_tokens": 40021327.0,
"step": 21700
},
{
"entropy": 5.618719530105591,
"epoch": 1.8235244696492332,
"grad_norm": 1.375,
"learning_rate": 0.0004666858286983744,
"loss": 5.3122,
"mean_token_accuracy": 0.1722617894411087,
"num_tokens": 40030471.0,
"step": 21705
},
{
"entropy": 5.636075210571289,
"epoch": 1.8239445494643982,
"grad_norm": 1.3984375,
"learning_rate": 0.0004666701456617661,
"loss": 5.3179,
"mean_token_accuracy": 0.17273977100849153,
"num_tokens": 40039305.0,
"step": 21710
},
{
"entropy": 5.575518178939819,
"epoch": 1.8243646292795632,
"grad_norm": 1.4765625,
"learning_rate": 0.00046665445922981975,
"loss": 5.2308,
"mean_token_accuracy": 0.18458376079797745,
"num_tokens": 40047389.0,
"step": 21715
},
{
"entropy": 5.592567300796508,
"epoch": 1.824784709094728,
"grad_norm": 1.3359375,
"learning_rate": 0.0004666387694028134,
"loss": 5.2963,
"mean_token_accuracy": 0.17612065076828004,
"num_tokens": 40057640.0,
"step": 21720
},
{
"entropy": 5.462196016311646,
"epoch": 1.8252047889098928,
"grad_norm": 1.328125,
"learning_rate": 0.0004666230761810249,
"loss": 5.2838,
"mean_token_accuracy": 0.17370318472385407,
"num_tokens": 40066770.0,
"step": 21725
},
{
"entropy": 5.495664405822754,
"epoch": 1.8256248687250578,
"grad_norm": 1.3046875,
"learning_rate": 0.0004666073795647323,
"loss": 5.2498,
"mean_token_accuracy": 0.17879889160394669,
"num_tokens": 40075902.0,
"step": 21730
},
{
"entropy": 5.5509308815002445,
"epoch": 1.8260449485402226,
"grad_norm": 1.2265625,
"learning_rate": 0.00046659167955421366,
"loss": 5.2917,
"mean_token_accuracy": 0.1705416351556778,
"num_tokens": 40084945.0,
"step": 21735
},
{
"entropy": 5.511363697052002,
"epoch": 1.8264650283553876,
"grad_norm": 1.2578125,
"learning_rate": 0.000466575976149747,
"loss": 5.1994,
"mean_token_accuracy": 0.18209475576877593,
"num_tokens": 40095104.0,
"step": 21740
},
{
"entropy": 5.572790336608887,
"epoch": 1.8268851081705524,
"grad_norm": 1.4375,
"learning_rate": 0.0004665602693516106,
"loss": 5.3469,
"mean_token_accuracy": 0.17504333555698395,
"num_tokens": 40105329.0,
"step": 21745
},
{
"entropy": 5.495387840270996,
"epoch": 1.8273051879857172,
"grad_norm": 1.2890625,
"learning_rate": 0.0004665445591600827,
"loss": 5.1619,
"mean_token_accuracy": 0.18247357308864592,
"num_tokens": 40114555.0,
"step": 21750
},
{
"entropy": 5.562460851669312,
"epoch": 1.8277252678008822,
"grad_norm": 1.2890625,
"learning_rate": 0.0004665288455754415,
"loss": 5.2213,
"mean_token_accuracy": 0.1824244573712349,
"num_tokens": 40123314.0,
"step": 21755
},
{
"entropy": 5.525253438949585,
"epoch": 1.8281453476160472,
"grad_norm": 1.1953125,
"learning_rate": 0.0004665131285979655,
"loss": 5.2627,
"mean_token_accuracy": 0.17575850039720536,
"num_tokens": 40132483.0,
"step": 21760
},
{
"entropy": 5.55298547744751,
"epoch": 1.828565427431212,
"grad_norm": 1.390625,
"learning_rate": 0.00046649740822793303,
"loss": 5.2853,
"mean_token_accuracy": 0.1744897410273552,
"num_tokens": 40141800.0,
"step": 21765
},
{
"entropy": 5.651994705200195,
"epoch": 1.8289855072463768,
"grad_norm": 1.6328125,
"learning_rate": 0.0004664816844656225,
"loss": 5.2851,
"mean_token_accuracy": 0.18408445715904237,
"num_tokens": 40149892.0,
"step": 21770
},
{
"entropy": 5.562488842010498,
"epoch": 1.8294055870615415,
"grad_norm": 1.234375,
"learning_rate": 0.00046646595731131263,
"loss": 5.2503,
"mean_token_accuracy": 0.17643408924341203,
"num_tokens": 40159376.0,
"step": 21775
},
{
"entropy": 5.516074466705322,
"epoch": 1.8298256668767066,
"grad_norm": 1.3515625,
"learning_rate": 0.0004664502267652819,
"loss": 5.2136,
"mean_token_accuracy": 0.1830064058303833,
"num_tokens": 40168497.0,
"step": 21780
},
{
"entropy": 5.587344694137573,
"epoch": 1.8302457466918716,
"grad_norm": 1.3671875,
"learning_rate": 0.00046643449282780894,
"loss": 5.3048,
"mean_token_accuracy": 0.17306350022554398,
"num_tokens": 40177432.0,
"step": 21785
},
{
"entropy": 5.570412015914917,
"epoch": 1.8306658265070364,
"grad_norm": 1.1953125,
"learning_rate": 0.0004664187554991725,
"loss": 5.1867,
"mean_token_accuracy": 0.18107624650001525,
"num_tokens": 40186582.0,
"step": 21790
},
{
"entropy": 5.524451637268067,
"epoch": 1.8310859063222011,
"grad_norm": 1.4609375,
"learning_rate": 0.0004664030147796514,
"loss": 5.239,
"mean_token_accuracy": 0.1814667671918869,
"num_tokens": 40196094.0,
"step": 21795
},
{
"entropy": 5.487248706817627,
"epoch": 1.8315059861373661,
"grad_norm": 1.34375,
"learning_rate": 0.0004663872706695244,
"loss": 5.3207,
"mean_token_accuracy": 0.1775876447558403,
"num_tokens": 40205239.0,
"step": 21800
},
{
"entropy": 5.573007202148437,
"epoch": 1.831926065952531,
"grad_norm": 1.4140625,
"learning_rate": 0.0004663715231690706,
"loss": 5.3783,
"mean_token_accuracy": 0.17704021483659743,
"num_tokens": 40213908.0,
"step": 21805
},
{
"entropy": 5.626622581481934,
"epoch": 1.832346145767696,
"grad_norm": 1.4453125,
"learning_rate": 0.00046635577227856873,
"loss": 5.3253,
"mean_token_accuracy": 0.1763758897781372,
"num_tokens": 40223370.0,
"step": 21810
},
{
"entropy": 5.61353907585144,
"epoch": 1.8327662255828607,
"grad_norm": 1.1640625,
"learning_rate": 0.0004663400179982978,
"loss": 5.4076,
"mean_token_accuracy": 0.17073974311351775,
"num_tokens": 40233934.0,
"step": 21815
},
{
"entropy": 5.627803707122803,
"epoch": 1.8331863053980255,
"grad_norm": 1.3828125,
"learning_rate": 0.00046632426032853705,
"loss": 5.2648,
"mean_token_accuracy": 0.17407817989587784,
"num_tokens": 40244335.0,
"step": 21820
},
{
"entropy": 5.523148536682129,
"epoch": 1.8336063852131905,
"grad_norm": 1.296875,
"learning_rate": 0.00046630849926956555,
"loss": 5.2274,
"mean_token_accuracy": 0.174884133040905,
"num_tokens": 40254354.0,
"step": 21825
},
{
"entropy": 5.533200979232788,
"epoch": 1.8340264650283555,
"grad_norm": 1.3203125,
"learning_rate": 0.00046629273482166244,
"loss": 5.2338,
"mean_token_accuracy": 0.17594105154275894,
"num_tokens": 40262748.0,
"step": 21830
},
{
"entropy": 5.6067948818206785,
"epoch": 1.8344465448435203,
"grad_norm": 1.578125,
"learning_rate": 0.00046627696698510706,
"loss": 5.3254,
"mean_token_accuracy": 0.1756878077983856,
"num_tokens": 40271818.0,
"step": 21835
},
{
"entropy": 5.567719411849976,
"epoch": 1.834866624658685,
"grad_norm": 1.28125,
"learning_rate": 0.0004662611957601788,
"loss": 5.3372,
"mean_token_accuracy": 0.17135182470083238,
"num_tokens": 40280552.0,
"step": 21840
},
{
"entropy": 5.582840538024902,
"epoch": 1.83528670447385,
"grad_norm": 1.5078125,
"learning_rate": 0.00046624542114715687,
"loss": 5.237,
"mean_token_accuracy": 0.18171630948781967,
"num_tokens": 40289368.0,
"step": 21845
},
{
"entropy": 5.690359497070313,
"epoch": 1.835706784289015,
"grad_norm": 1.25,
"learning_rate": 0.0004662296431463208,
"loss": 5.412,
"mean_token_accuracy": 0.1636722281575203,
"num_tokens": 40298884.0,
"step": 21850
},
{
"entropy": 5.632392024993896,
"epoch": 1.83612686410418,
"grad_norm": 1.28125,
"learning_rate": 0.00046621386175795,
"loss": 5.3541,
"mean_token_accuracy": 0.1686980739235878,
"num_tokens": 40307886.0,
"step": 21855
},
{
"entropy": 5.570592308044434,
"epoch": 1.8365469439193447,
"grad_norm": 1.34375,
"learning_rate": 0.00046619807698232413,
"loss": 5.2639,
"mean_token_accuracy": 0.17661840617656707,
"num_tokens": 40317688.0,
"step": 21860
},
{
"entropy": 5.606667232513428,
"epoch": 1.8369670237345095,
"grad_norm": 1.46875,
"learning_rate": 0.0004661822888197228,
"loss": 5.3051,
"mean_token_accuracy": 0.174074749648571,
"num_tokens": 40327630.0,
"step": 21865
},
{
"entropy": 5.560684251785278,
"epoch": 1.8373871035496743,
"grad_norm": 1.46875,
"learning_rate": 0.00046616649727042564,
"loss": 5.288,
"mean_token_accuracy": 0.1768094927072525,
"num_tokens": 40336613.0,
"step": 21870
},
{
"entropy": 5.575845956802368,
"epoch": 1.8378071833648393,
"grad_norm": 1.4453125,
"learning_rate": 0.00046615070233471244,
"loss": 5.3694,
"mean_token_accuracy": 0.1752162218093872,
"num_tokens": 40346582.0,
"step": 21875
},
{
"entropy": 5.665771961212158,
"epoch": 1.8382272631800043,
"grad_norm": 1.296875,
"learning_rate": 0.00046613490401286304,
"loss": 5.4,
"mean_token_accuracy": 0.1690703310072422,
"num_tokens": 40355960.0,
"step": 21880
},
{
"entropy": 5.6712233543396,
"epoch": 1.838647342995169,
"grad_norm": 1.34375,
"learning_rate": 0.00046611910230515716,
"loss": 5.2364,
"mean_token_accuracy": 0.174868805706501,
"num_tokens": 40366043.0,
"step": 21885
},
{
"entropy": 5.609152221679688,
"epoch": 1.8390674228103339,
"grad_norm": 1.328125,
"learning_rate": 0.0004661032972118748,
"loss": 5.3123,
"mean_token_accuracy": 0.1770208790898323,
"num_tokens": 40374919.0,
"step": 21890
},
{
"entropy": 5.531170177459717,
"epoch": 1.8394875026254989,
"grad_norm": 1.25,
"learning_rate": 0.00046608748873329587,
"loss": 5.2689,
"mean_token_accuracy": 0.1823518306016922,
"num_tokens": 40383415.0,
"step": 21895
},
{
"entropy": 5.657415151596069,
"epoch": 1.8399075824406639,
"grad_norm": 1.2734375,
"learning_rate": 0.0004660716768697005,
"loss": 5.3305,
"mean_token_accuracy": 0.16956553012132644,
"num_tokens": 40392252.0,
"step": 21900
},
{
"entropy": 5.520682668685913,
"epoch": 1.8403276622558287,
"grad_norm": 1.3671875,
"learning_rate": 0.0004660558616213689,
"loss": 5.1673,
"mean_token_accuracy": 0.19024722576141356,
"num_tokens": 40400717.0,
"step": 21905
},
{
"entropy": 5.504176998138428,
"epoch": 1.8407477420709935,
"grad_norm": 1.21875,
"learning_rate": 0.00046604004298858093,
"loss": 5.2115,
"mean_token_accuracy": 0.18332263380289077,
"num_tokens": 40409236.0,
"step": 21910
},
{
"entropy": 5.515361166000366,
"epoch": 1.8411678218861582,
"grad_norm": 1.703125,
"learning_rate": 0.0004660242209716171,
"loss": 5.2257,
"mean_token_accuracy": 0.18484469950199128,
"num_tokens": 40419073.0,
"step": 21915
},
{
"entropy": 5.6589983940124515,
"epoch": 1.8415879017013232,
"grad_norm": 1.453125,
"learning_rate": 0.0004660083955707575,
"loss": 5.3303,
"mean_token_accuracy": 0.1793600782752037,
"num_tokens": 40428427.0,
"step": 21920
},
{
"entropy": 5.619227170944214,
"epoch": 1.8420079815164883,
"grad_norm": 1.3125,
"learning_rate": 0.0004659925667862825,
"loss": 5.3111,
"mean_token_accuracy": 0.177922086417675,
"num_tokens": 40437350.0,
"step": 21925
},
{
"entropy": 5.528036451339721,
"epoch": 1.842428061331653,
"grad_norm": 1.3515625,
"learning_rate": 0.0004659767346184725,
"loss": 5.315,
"mean_token_accuracy": 0.18087562918663025,
"num_tokens": 40446059.0,
"step": 21930
},
{
"entropy": 5.584619808197021,
"epoch": 1.8428481411468178,
"grad_norm": 1.296875,
"learning_rate": 0.00046596089906760803,
"loss": 5.3051,
"mean_token_accuracy": 0.17775034308433532,
"num_tokens": 40454959.0,
"step": 21935
},
{
"entropy": 5.608698415756225,
"epoch": 1.8432682209619826,
"grad_norm": 1.265625,
"learning_rate": 0.0004659450601339696,
"loss": 5.3258,
"mean_token_accuracy": 0.17306884974241257,
"num_tokens": 40464202.0,
"step": 21940
},
{
"entropy": 5.587285661697388,
"epoch": 1.8436883007771476,
"grad_norm": 1.25,
"learning_rate": 0.0004659292178178377,
"loss": 5.2491,
"mean_token_accuracy": 0.18372715264558792,
"num_tokens": 40473331.0,
"step": 21945
},
{
"entropy": 5.541270542144775,
"epoch": 1.8441083805923126,
"grad_norm": 1.3359375,
"learning_rate": 0.000465913372119493,
"loss": 5.2311,
"mean_token_accuracy": 0.18020168393850328,
"num_tokens": 40482098.0,
"step": 21950
},
{
"entropy": 5.594487953186035,
"epoch": 1.8445284604074774,
"grad_norm": 1.3828125,
"learning_rate": 0.0004658975230392162,
"loss": 5.2695,
"mean_token_accuracy": 0.18725144863128662,
"num_tokens": 40491134.0,
"step": 21955
},
{
"entropy": 5.628622198104859,
"epoch": 1.8449485402226422,
"grad_norm": 1.46875,
"learning_rate": 0.0004658816705772882,
"loss": 5.3965,
"mean_token_accuracy": 0.17332600951194763,
"num_tokens": 40501488.0,
"step": 21960
},
{
"entropy": 5.4865885257720945,
"epoch": 1.8453686200378072,
"grad_norm": 1.328125,
"learning_rate": 0.0004658658147339896,
"loss": 5.1533,
"mean_token_accuracy": 0.18556201159954072,
"num_tokens": 40510506.0,
"step": 21965
},
{
"entropy": 5.604847478866577,
"epoch": 1.8457886998529722,
"grad_norm": 1.34375,
"learning_rate": 0.00046584995550960146,
"loss": 5.2743,
"mean_token_accuracy": 0.18389804363250734,
"num_tokens": 40520222.0,
"step": 21970
},
{
"entropy": 5.5164261817932125,
"epoch": 1.846208779668137,
"grad_norm": 1.265625,
"learning_rate": 0.00046583409290440453,
"loss": 5.2122,
"mean_token_accuracy": 0.18363425880670547,
"num_tokens": 40528824.0,
"step": 21975
},
{
"entropy": 5.476925992965699,
"epoch": 1.8466288594833018,
"grad_norm": 1.421875,
"learning_rate": 0.0004658182269186799,
"loss": 5.2784,
"mean_token_accuracy": 0.17037064731121063,
"num_tokens": 40538144.0,
"step": 21980
},
{
"entropy": 5.580480241775513,
"epoch": 1.8470489392984666,
"grad_norm": 1.359375,
"learning_rate": 0.0004658023575527087,
"loss": 5.332,
"mean_token_accuracy": 0.17281900197267533,
"num_tokens": 40547457.0,
"step": 21985
},
{
"entropy": 5.628903198242187,
"epoch": 1.8474690191136316,
"grad_norm": 1.40625,
"learning_rate": 0.000465786484806772,
"loss": 5.2025,
"mean_token_accuracy": 0.18342212736606597,
"num_tokens": 40556005.0,
"step": 21990
},
{
"entropy": 5.385280656814575,
"epoch": 1.8478890989287966,
"grad_norm": 1.2421875,
"learning_rate": 0.00046577060868115095,
"loss": 5.1701,
"mean_token_accuracy": 0.18683878481388091,
"num_tokens": 40565018.0,
"step": 21995
},
{
"entropy": 5.514132261276245,
"epoch": 1.8483091787439614,
"grad_norm": 1.3515625,
"learning_rate": 0.0004657547291761268,
"loss": 5.2354,
"mean_token_accuracy": 0.17024336606264115,
"num_tokens": 40574931.0,
"step": 22000
},
{
"entropy": 5.623615646362305,
"epoch": 1.8487292585591262,
"grad_norm": 1.390625,
"learning_rate": 0.00046573884629198077,
"loss": 5.2364,
"mean_token_accuracy": 0.17633750140666962,
"num_tokens": 40584496.0,
"step": 22005
},
{
"entropy": 5.629044485092163,
"epoch": 1.849149338374291,
"grad_norm": 1.484375,
"learning_rate": 0.0004657229600289944,
"loss": 5.3239,
"mean_token_accuracy": 0.17284266501665116,
"num_tokens": 40594363.0,
"step": 22010
},
{
"entropy": 5.581736373901367,
"epoch": 1.849569418189456,
"grad_norm": 1.296875,
"learning_rate": 0.0004657070703874489,
"loss": 5.3432,
"mean_token_accuracy": 0.17227566838264466,
"num_tokens": 40603001.0,
"step": 22015
},
{
"entropy": 5.525221729278565,
"epoch": 1.849989498004621,
"grad_norm": 1.25,
"learning_rate": 0.00046569117736762597,
"loss": 5.2706,
"mean_token_accuracy": 0.18272185027599336,
"num_tokens": 40612660.0,
"step": 22020
},
{
"entropy": 5.5399699211120605,
"epoch": 1.8504095778197858,
"grad_norm": 1.34375,
"learning_rate": 0.00046567528096980686,
"loss": 5.2054,
"mean_token_accuracy": 0.1787782445549965,
"num_tokens": 40622209.0,
"step": 22025
},
{
"entropy": 5.607202053070068,
"epoch": 1.8508296576349506,
"grad_norm": 1.3515625,
"learning_rate": 0.00046565938119427346,
"loss": 5.308,
"mean_token_accuracy": 0.16795604974031447,
"num_tokens": 40632011.0,
"step": 22030
},
{
"entropy": 5.481884098052978,
"epoch": 1.8512497374501156,
"grad_norm": 1.296875,
"learning_rate": 0.0004656434780413073,
"loss": 5.1856,
"mean_token_accuracy": 0.18091701716184616,
"num_tokens": 40641201.0,
"step": 22035
},
{
"entropy": 5.527099227905273,
"epoch": 1.8516698172652803,
"grad_norm": 1.2265625,
"learning_rate": 0.00046562757151119,
"loss": 5.2402,
"mean_token_accuracy": 0.18224609196186065,
"num_tokens": 40650752.0,
"step": 22040
},
{
"entropy": 5.570330381393433,
"epoch": 1.8520898970804454,
"grad_norm": 1.3828125,
"learning_rate": 0.0004656116616042035,
"loss": 5.2551,
"mean_token_accuracy": 0.17235937416553498,
"num_tokens": 40659975.0,
"step": 22045
},
{
"entropy": 5.56846399307251,
"epoch": 1.8525099768956101,
"grad_norm": 1.2421875,
"learning_rate": 0.00046559574832062955,
"loss": 5.26,
"mean_token_accuracy": 0.18384793698787688,
"num_tokens": 40668944.0,
"step": 22050
},
{
"entropy": 5.64082670211792,
"epoch": 1.852930056710775,
"grad_norm": 1.28125,
"learning_rate": 0.00046557983166075,
"loss": 5.4057,
"mean_token_accuracy": 0.17182381600141525,
"num_tokens": 40678333.0,
"step": 22055
},
{
"entropy": 5.493090963363647,
"epoch": 1.85335013652594,
"grad_norm": 1.4375,
"learning_rate": 0.00046556391162484696,
"loss": 5.1516,
"mean_token_accuracy": 0.18200179785490037,
"num_tokens": 40687781.0,
"step": 22060
},
{
"entropy": 5.5217766761779785,
"epoch": 1.853770216341105,
"grad_norm": 1.4375,
"learning_rate": 0.0004655479882132023,
"loss": 5.3189,
"mean_token_accuracy": 0.17790616005659105,
"num_tokens": 40697637.0,
"step": 22065
},
{
"entropy": 5.517022275924683,
"epoch": 1.8541902961562697,
"grad_norm": 1.3515625,
"learning_rate": 0.0004655320614260982,
"loss": 5.2043,
"mean_token_accuracy": 0.18008904606103898,
"num_tokens": 40707097.0,
"step": 22070
},
{
"entropy": 5.673284339904785,
"epoch": 1.8546103759714345,
"grad_norm": 1.4921875,
"learning_rate": 0.00046551613126381673,
"loss": 5.3622,
"mean_token_accuracy": 0.1729786217212677,
"num_tokens": 40716821.0,
"step": 22075
},
{
"entropy": 5.582649612426758,
"epoch": 1.8550304557865993,
"grad_norm": 1.3515625,
"learning_rate": 0.0004655001977266401,
"loss": 5.2567,
"mean_token_accuracy": 0.17789539694786072,
"num_tokens": 40726731.0,
"step": 22080
},
{
"entropy": 5.641197347640992,
"epoch": 1.8554505356017643,
"grad_norm": 1.3359375,
"learning_rate": 0.00046548426081485046,
"loss": 5.2295,
"mean_token_accuracy": 0.17835359424352645,
"num_tokens": 40736935.0,
"step": 22085
},
{
"entropy": 5.570897436141967,
"epoch": 1.8558706154169293,
"grad_norm": 1.3125,
"learning_rate": 0.00046546832052873026,
"loss": 5.3736,
"mean_token_accuracy": 0.17090578228235245,
"num_tokens": 40746643.0,
"step": 22090
},
{
"entropy": 5.704064273834229,
"epoch": 1.8562906952320941,
"grad_norm": 1.21875,
"learning_rate": 0.00046545237686856195,
"loss": 5.4079,
"mean_token_accuracy": 0.1667260468006134,
"num_tokens": 40755713.0,
"step": 22095
},
{
"entropy": 5.6872340679168705,
"epoch": 1.856710775047259,
"grad_norm": 1.2421875,
"learning_rate": 0.00046543642983462775,
"loss": 5.4184,
"mean_token_accuracy": 0.1730544626712799,
"num_tokens": 40764878.0,
"step": 22100
},
{
"entropy": 5.596008825302124,
"epoch": 1.857130854862424,
"grad_norm": 1.3671875,
"learning_rate": 0.00046542047942721025,
"loss": 5.2939,
"mean_token_accuracy": 0.1782037064433098,
"num_tokens": 40774101.0,
"step": 22105
},
{
"entropy": 5.586978578567505,
"epoch": 1.8575509346775887,
"grad_norm": 1.25,
"learning_rate": 0.000465404525646592,
"loss": 5.2751,
"mean_token_accuracy": 0.177348293364048,
"num_tokens": 40783126.0,
"step": 22110
},
{
"entropy": 5.503357076644898,
"epoch": 1.8579710144927537,
"grad_norm": 1.3203125,
"learning_rate": 0.0004653885684930557,
"loss": 5.2501,
"mean_token_accuracy": 0.18224793970584868,
"num_tokens": 40792508.0,
"step": 22115
},
{
"entropy": 5.563293695449829,
"epoch": 1.8583910943079185,
"grad_norm": 1.2578125,
"learning_rate": 0.0004653726079668839,
"loss": 5.2934,
"mean_token_accuracy": 0.17940240651369094,
"num_tokens": 40802252.0,
"step": 22120
},
{
"entropy": 5.514010381698609,
"epoch": 1.8588111741230833,
"grad_norm": 1.3125,
"learning_rate": 0.0004653566440683594,
"loss": 5.1395,
"mean_token_accuracy": 0.18662668913602828,
"num_tokens": 40811041.0,
"step": 22125
},
{
"entropy": 5.532136011123657,
"epoch": 1.8592312539382483,
"grad_norm": 1.5,
"learning_rate": 0.000465340676797765,
"loss": 5.1914,
"mean_token_accuracy": 0.1762915700674057,
"num_tokens": 40819976.0,
"step": 22130
},
{
"entropy": 5.507018375396728,
"epoch": 1.8596513337534133,
"grad_norm": 1.34375,
"learning_rate": 0.00046532470615538344,
"loss": 5.2304,
"mean_token_accuracy": 0.1810604751110077,
"num_tokens": 40828544.0,
"step": 22135
},
{
"entropy": 5.562652349472046,
"epoch": 1.860071413568578,
"grad_norm": 1.2265625,
"learning_rate": 0.00046530873214149776,
"loss": 5.3096,
"mean_token_accuracy": 0.18318777680397033,
"num_tokens": 40838386.0,
"step": 22140
},
{
"entropy": 5.6687541007995605,
"epoch": 1.8604914933837429,
"grad_norm": 1.34375,
"learning_rate": 0.0004652927547563908,
"loss": 5.2972,
"mean_token_accuracy": 0.1770688384771347,
"num_tokens": 40847047.0,
"step": 22145
},
{
"entropy": 5.575789451599121,
"epoch": 1.8609115731989077,
"grad_norm": 1.3125,
"learning_rate": 0.0004652767740003458,
"loss": 5.3297,
"mean_token_accuracy": 0.17693712711334228,
"num_tokens": 40856653.0,
"step": 22150
},
{
"entropy": 5.65937647819519,
"epoch": 1.8613316530140727,
"grad_norm": 1.4765625,
"learning_rate": 0.00046526078987364566,
"loss": 5.3902,
"mean_token_accuracy": 0.1717894196510315,
"num_tokens": 40865176.0,
"step": 22155
},
{
"entropy": 5.671345949172974,
"epoch": 1.8617517328292377,
"grad_norm": 1.3046875,
"learning_rate": 0.0004652448023765736,
"loss": 5.4235,
"mean_token_accuracy": 0.1769840493798256,
"num_tokens": 40874084.0,
"step": 22160
},
{
"entropy": 5.620461654663086,
"epoch": 1.8621718126444025,
"grad_norm": 1.328125,
"learning_rate": 0.0004652288115094129,
"loss": 5.3182,
"mean_token_accuracy": 0.1787420317530632,
"num_tokens": 40883704.0,
"step": 22165
},
{
"entropy": 5.566469192504883,
"epoch": 1.8625918924595672,
"grad_norm": 1.421875,
"learning_rate": 0.0004652128172724466,
"loss": 5.353,
"mean_token_accuracy": 0.17113328129053115,
"num_tokens": 40893232.0,
"step": 22170
},
{
"entropy": 5.52234959602356,
"epoch": 1.8630119722747323,
"grad_norm": 1.265625,
"learning_rate": 0.00046519681966595834,
"loss": 5.2398,
"mean_token_accuracy": 0.1860972002148628,
"num_tokens": 40902242.0,
"step": 22175
},
{
"entropy": 5.570546817779541,
"epoch": 1.863432052089897,
"grad_norm": 1.2734375,
"learning_rate": 0.0004651808186902313,
"loss": 5.2446,
"mean_token_accuracy": 0.17345247715711593,
"num_tokens": 40912349.0,
"step": 22180
},
{
"entropy": 5.553358364105224,
"epoch": 1.863852131905062,
"grad_norm": 1.3984375,
"learning_rate": 0.000465164814345549,
"loss": 5.2661,
"mean_token_accuracy": 0.18283737152814866,
"num_tokens": 40922206.0,
"step": 22185
},
{
"entropy": 5.57627854347229,
"epoch": 1.8642722117202268,
"grad_norm": 1.2578125,
"learning_rate": 0.00046514880663219493,
"loss": 5.2331,
"mean_token_accuracy": 0.1829102337360382,
"num_tokens": 40931145.0,
"step": 22190
},
{
"entropy": 5.511320686340332,
"epoch": 1.8646922915353916,
"grad_norm": 1.265625,
"learning_rate": 0.0004651327955504526,
"loss": 5.1507,
"mean_token_accuracy": 0.18987528681755067,
"num_tokens": 40939917.0,
"step": 22195
},
{
"entropy": 5.500788640975952,
"epoch": 1.8651123713505566,
"grad_norm": 1.3125,
"learning_rate": 0.0004651167811006058,
"loss": 5.2224,
"mean_token_accuracy": 0.1800301715731621,
"num_tokens": 40947972.0,
"step": 22200
},
{
"entropy": 5.435527515411377,
"epoch": 1.8655324511657216,
"grad_norm": 1.28125,
"learning_rate": 0.000465100763282938,
"loss": 5.1615,
"mean_token_accuracy": 0.1882360875606537,
"num_tokens": 40956999.0,
"step": 22205
},
{
"entropy": 5.512074947357178,
"epoch": 1.8659525309808864,
"grad_norm": 1.203125,
"learning_rate": 0.0004650847420977332,
"loss": 5.1485,
"mean_token_accuracy": 0.1899052932858467,
"num_tokens": 40965917.0,
"step": 22210
},
{
"entropy": 5.531978416442871,
"epoch": 1.8663726107960512,
"grad_norm": 1.25,
"learning_rate": 0.00046506871754527495,
"loss": 5.2423,
"mean_token_accuracy": 0.17560838162899017,
"num_tokens": 40976545.0,
"step": 22215
},
{
"entropy": 5.589142990112305,
"epoch": 1.866792690611216,
"grad_norm": 1.359375,
"learning_rate": 0.00046505268962584735,
"loss": 5.2723,
"mean_token_accuracy": 0.18418483436107635,
"num_tokens": 40985890.0,
"step": 22220
},
{
"entropy": 5.616770219802857,
"epoch": 1.867212770426381,
"grad_norm": 1.3671875,
"learning_rate": 0.0004650366583397342,
"loss": 5.2945,
"mean_token_accuracy": 0.178303125500679,
"num_tokens": 40995255.0,
"step": 22225
},
{
"entropy": 5.592275190353393,
"epoch": 1.867632850241546,
"grad_norm": 1.3203125,
"learning_rate": 0.0004650206236872194,
"loss": 5.3353,
"mean_token_accuracy": 0.17213941216468812,
"num_tokens": 41004419.0,
"step": 22230
},
{
"entropy": 5.435424184799194,
"epoch": 1.8680529300567108,
"grad_norm": 1.328125,
"learning_rate": 0.0004650045856685872,
"loss": 5.0964,
"mean_token_accuracy": 0.19888545274734498,
"num_tokens": 41013179.0,
"step": 22235
},
{
"entropy": 5.537402486801147,
"epoch": 1.8684730098718756,
"grad_norm": 1.1953125,
"learning_rate": 0.00046498854428412157,
"loss": 5.1822,
"mean_token_accuracy": 0.18085305392742157,
"num_tokens": 41022307.0,
"step": 22240
},
{
"entropy": 5.531399250030518,
"epoch": 1.8688930896870404,
"grad_norm": 1.28125,
"learning_rate": 0.00046497249953410675,
"loss": 5.3154,
"mean_token_accuracy": 0.17709229290485382,
"num_tokens": 41032331.0,
"step": 22245
},
{
"entropy": 5.595411157608032,
"epoch": 1.8693131695022054,
"grad_norm": 1.296875,
"learning_rate": 0.0004649564514188269,
"loss": 5.3489,
"mean_token_accuracy": 0.17014861702919007,
"num_tokens": 41041895.0,
"step": 22250
},
{
"entropy": 5.541268682479858,
"epoch": 1.8697332493173704,
"grad_norm": 1.2734375,
"learning_rate": 0.0004649403999385662,
"loss": 5.1706,
"mean_token_accuracy": 0.1846340537071228,
"num_tokens": 41051643.0,
"step": 22255
},
{
"entropy": 5.51833438873291,
"epoch": 1.8701533291325352,
"grad_norm": 1.3046875,
"learning_rate": 0.0004649243450936092,
"loss": 5.2034,
"mean_token_accuracy": 0.18533384501934053,
"num_tokens": 41060478.0,
"step": 22260
},
{
"entropy": 5.54597225189209,
"epoch": 1.8705734089477,
"grad_norm": 1.28125,
"learning_rate": 0.0004649082868842403,
"loss": 5.2636,
"mean_token_accuracy": 0.1818034380674362,
"num_tokens": 41069389.0,
"step": 22265
},
{
"entropy": 5.4579092502594,
"epoch": 1.870993488762865,
"grad_norm": 1.4296875,
"learning_rate": 0.00046489222531074376,
"loss": 5.2134,
"mean_token_accuracy": 0.1858450397849083,
"num_tokens": 41078529.0,
"step": 22270
},
{
"entropy": 5.58187346458435,
"epoch": 1.87141356857803,
"grad_norm": 1.3125,
"learning_rate": 0.00046487616037340436,
"loss": 5.3091,
"mean_token_accuracy": 0.1759010672569275,
"num_tokens": 41087593.0,
"step": 22275
},
{
"entropy": 5.619280433654785,
"epoch": 1.8718336483931948,
"grad_norm": 1.1640625,
"learning_rate": 0.0004648600920725065,
"loss": 5.2833,
"mean_token_accuracy": 0.17433829307556153,
"num_tokens": 41098317.0,
"step": 22280
},
{
"entropy": 5.553951120376587,
"epoch": 1.8722537282083596,
"grad_norm": 1.34375,
"learning_rate": 0.00046484402040833486,
"loss": 5.2893,
"mean_token_accuracy": 0.1779267519712448,
"num_tokens": 41108659.0,
"step": 22285
},
{
"entropy": 5.603100824356079,
"epoch": 1.8726738080235243,
"grad_norm": 1.421875,
"learning_rate": 0.00046482794538117413,
"loss": 5.3766,
"mean_token_accuracy": 0.1740705132484436,
"num_tokens": 41117504.0,
"step": 22290
},
{
"entropy": 5.6185633659362795,
"epoch": 1.8730938878386894,
"grad_norm": 1.2578125,
"learning_rate": 0.00046481186699130913,
"loss": 5.2499,
"mean_token_accuracy": 0.1861429214477539,
"num_tokens": 41126249.0,
"step": 22295
},
{
"entropy": 5.434183025360108,
"epoch": 1.8735139676538544,
"grad_norm": 1.4765625,
"learning_rate": 0.0004647957852390247,
"loss": 5.1167,
"mean_token_accuracy": 0.18358826786279678,
"num_tokens": 41134956.0,
"step": 22300
},
{
"entropy": 5.472942972183228,
"epoch": 1.8739340474690191,
"grad_norm": 1.3046875,
"learning_rate": 0.00046477970012460555,
"loss": 5.2354,
"mean_token_accuracy": 0.1792644202709198,
"num_tokens": 41144340.0,
"step": 22305
},
{
"entropy": 5.538014078140259,
"epoch": 1.874354127284184,
"grad_norm": 1.4140625,
"learning_rate": 0.0004647636116483367,
"loss": 5.2502,
"mean_token_accuracy": 0.1767558440566063,
"num_tokens": 41152937.0,
"step": 22310
},
{
"entropy": 5.660906744003296,
"epoch": 1.8747742070993487,
"grad_norm": 1.296875,
"learning_rate": 0.00046474751981050334,
"loss": 5.4039,
"mean_token_accuracy": 0.1673845887184143,
"num_tokens": 41162361.0,
"step": 22315
},
{
"entropy": 5.645029926300049,
"epoch": 1.8751942869145137,
"grad_norm": 1.21875,
"learning_rate": 0.00046473142461139034,
"loss": 5.3775,
"mean_token_accuracy": 0.16849132478237153,
"num_tokens": 41171979.0,
"step": 22320
},
{
"entropy": 5.481945276260376,
"epoch": 1.8756143667296787,
"grad_norm": 1.2890625,
"learning_rate": 0.0004647153260512828,
"loss": 5.2235,
"mean_token_accuracy": 0.1834861844778061,
"num_tokens": 41182145.0,
"step": 22325
},
{
"entropy": 5.558873987197876,
"epoch": 1.8760344465448435,
"grad_norm": 1.34375,
"learning_rate": 0.0004646992241304659,
"loss": 5.2549,
"mean_token_accuracy": 0.1798875406384468,
"num_tokens": 41191522.0,
"step": 22330
},
{
"entropy": 5.618067264556885,
"epoch": 1.8764545263600083,
"grad_norm": 1.296875,
"learning_rate": 0.000464683118849225,
"loss": 5.3533,
"mean_token_accuracy": 0.17255983650684356,
"num_tokens": 41201052.0,
"step": 22335
},
{
"entropy": 5.523107671737671,
"epoch": 1.8768746061751733,
"grad_norm": 1.28125,
"learning_rate": 0.0004646670102078453,
"loss": 5.2166,
"mean_token_accuracy": 0.17975925356149675,
"num_tokens": 41210211.0,
"step": 22340
},
{
"entropy": 5.5728506565094,
"epoch": 1.8772946859903383,
"grad_norm": 1.2734375,
"learning_rate": 0.0004646508982066122,
"loss": 5.3771,
"mean_token_accuracy": 0.16956944912672042,
"num_tokens": 41219778.0,
"step": 22345
},
{
"entropy": 5.585431814193726,
"epoch": 1.8777147658055031,
"grad_norm": 1.28125,
"learning_rate": 0.00046463478284581114,
"loss": 5.3191,
"mean_token_accuracy": 0.1793970361351967,
"num_tokens": 41229550.0,
"step": 22350
},
{
"entropy": 5.58350133895874,
"epoch": 1.878134845620668,
"grad_norm": 1.328125,
"learning_rate": 0.0004646186641257275,
"loss": 5.1927,
"mean_token_accuracy": 0.18735552132129668,
"num_tokens": 41238130.0,
"step": 22355
},
{
"entropy": 5.506263017654419,
"epoch": 1.8785549254358327,
"grad_norm": 1.3125,
"learning_rate": 0.0004646025420466468,
"loss": 5.2106,
"mean_token_accuracy": 0.18112540990114212,
"num_tokens": 41247324.0,
"step": 22360
},
{
"entropy": 5.539952611923217,
"epoch": 1.8789750052509977,
"grad_norm": 1.3125,
"learning_rate": 0.00046458641660885474,
"loss": 5.2959,
"mean_token_accuracy": 0.17860177755355836,
"num_tokens": 41256131.0,
"step": 22365
},
{
"entropy": 5.606580924987793,
"epoch": 1.8793950850661627,
"grad_norm": 1.28125,
"learning_rate": 0.00046457028781263693,
"loss": 5.309,
"mean_token_accuracy": 0.1753476530313492,
"num_tokens": 41265225.0,
"step": 22370
},
{
"entropy": 5.653231859207153,
"epoch": 1.8798151648813275,
"grad_norm": 1.3046875,
"learning_rate": 0.00046455415565827907,
"loss": 5.3096,
"mean_token_accuracy": 0.17372387796640396,
"num_tokens": 41274023.0,
"step": 22375
},
{
"entropy": 5.59863314628601,
"epoch": 1.8802352446964923,
"grad_norm": 1.3671875,
"learning_rate": 0.000464538020146067,
"loss": 5.3253,
"mean_token_accuracy": 0.17805732786655426,
"num_tokens": 41283030.0,
"step": 22380
},
{
"entropy": 5.645105266571045,
"epoch": 1.880655324511657,
"grad_norm": 1.2734375,
"learning_rate": 0.0004645218812762864,
"loss": 5.3924,
"mean_token_accuracy": 0.17092460989952088,
"num_tokens": 41292654.0,
"step": 22385
},
{
"entropy": 5.580280685424805,
"epoch": 1.881075404326822,
"grad_norm": 1.1796875,
"learning_rate": 0.0004645057390492234,
"loss": 5.1424,
"mean_token_accuracy": 0.18847417533397676,
"num_tokens": 41301838.0,
"step": 22390
},
{
"entropy": 5.529407024383545,
"epoch": 1.881495484141987,
"grad_norm": 1.28125,
"learning_rate": 0.0004644895934651638,
"loss": 5.2377,
"mean_token_accuracy": 0.18382182568311692,
"num_tokens": 41311104.0,
"step": 22395
},
{
"entropy": 5.598920583724976,
"epoch": 1.8819155639571519,
"grad_norm": 1.2109375,
"learning_rate": 0.00046447344452439356,
"loss": 5.3397,
"mean_token_accuracy": 0.17356346696615219,
"num_tokens": 41320213.0,
"step": 22400
},
{
"entropy": 5.557065010070801,
"epoch": 1.8823356437723167,
"grad_norm": 1.2109375,
"learning_rate": 0.0004644572922271988,
"loss": 5.2174,
"mean_token_accuracy": 0.1747543677687645,
"num_tokens": 41330027.0,
"step": 22405
},
{
"entropy": 5.61353063583374,
"epoch": 1.8827557235874817,
"grad_norm": 1.34375,
"learning_rate": 0.00046444113657386567,
"loss": 5.3386,
"mean_token_accuracy": 0.17396150529384613,
"num_tokens": 41339481.0,
"step": 22410
},
{
"entropy": 5.6271473407745365,
"epoch": 1.8831758034026465,
"grad_norm": 1.3828125,
"learning_rate": 0.00046442497756468037,
"loss": 5.3438,
"mean_token_accuracy": 0.17416535913944245,
"num_tokens": 41348679.0,
"step": 22415
},
{
"entropy": 5.566278743743896,
"epoch": 1.8835958832178115,
"grad_norm": 1.296875,
"learning_rate": 0.00046440881519992924,
"loss": 5.1934,
"mean_token_accuracy": 0.18635202646255494,
"num_tokens": 41358736.0,
"step": 22420
},
{
"entropy": 5.554736852645874,
"epoch": 1.8840159630329762,
"grad_norm": 1.28125,
"learning_rate": 0.0004643926494798983,
"loss": 5.3649,
"mean_token_accuracy": 0.16926676034927368,
"num_tokens": 41368284.0,
"step": 22425
},
{
"entropy": 5.532454109191894,
"epoch": 1.884436042848141,
"grad_norm": 1.3203125,
"learning_rate": 0.00046437648040487426,
"loss": 5.2535,
"mean_token_accuracy": 0.17568075507879258,
"num_tokens": 41377789.0,
"step": 22430
},
{
"entropy": 5.528148365020752,
"epoch": 1.884856122663306,
"grad_norm": 1.265625,
"learning_rate": 0.00046436030797514325,
"loss": 5.2466,
"mean_token_accuracy": 0.17455026954412461,
"num_tokens": 41386909.0,
"step": 22435
},
{
"entropy": 5.586466598510742,
"epoch": 1.885276202478471,
"grad_norm": 1.1875,
"learning_rate": 0.0004643441321909919,
"loss": 5.2917,
"mean_token_accuracy": 0.17790543884038926,
"num_tokens": 41396693.0,
"step": 22440
},
{
"entropy": 5.614886951446533,
"epoch": 1.8856962822936358,
"grad_norm": 1.40625,
"learning_rate": 0.00046432795305270674,
"loss": 5.3741,
"mean_token_accuracy": 0.1692201390862465,
"num_tokens": 41407193.0,
"step": 22445
},
{
"entropy": 5.585022258758545,
"epoch": 1.8861163621088006,
"grad_norm": 1.140625,
"learning_rate": 0.00046431177056057446,
"loss": 5.3069,
"mean_token_accuracy": 0.18110948652029038,
"num_tokens": 41416567.0,
"step": 22450
},
{
"entropy": 5.474988889694214,
"epoch": 1.8865364419239654,
"grad_norm": 1.421875,
"learning_rate": 0.00046429558471488164,
"loss": 5.1823,
"mean_token_accuracy": 0.1850621894001961,
"num_tokens": 41425328.0,
"step": 22455
},
{
"entropy": 5.636248636245727,
"epoch": 1.8869565217391304,
"grad_norm": 1.171875,
"learning_rate": 0.000464279395515915,
"loss": 5.3275,
"mean_token_accuracy": 0.177673077583313,
"num_tokens": 41435229.0,
"step": 22460
},
{
"entropy": 5.575135803222656,
"epoch": 1.8873766015542954,
"grad_norm": 1.2421875,
"learning_rate": 0.00046426320296396136,
"loss": 5.2575,
"mean_token_accuracy": 0.175504831969738,
"num_tokens": 41445471.0,
"step": 22465
},
{
"entropy": 5.521603965759278,
"epoch": 1.8877966813694602,
"grad_norm": 1.484375,
"learning_rate": 0.00046424700705930745,
"loss": 5.1839,
"mean_token_accuracy": 0.18650398403406143,
"num_tokens": 41454654.0,
"step": 22470
},
{
"entropy": 5.549269580841065,
"epoch": 1.888216761184625,
"grad_norm": 1.3203125,
"learning_rate": 0.0004642308078022403,
"loss": 5.1987,
"mean_token_accuracy": 0.1773221641778946,
"num_tokens": 41463341.0,
"step": 22475
},
{
"entropy": 5.500970268249512,
"epoch": 1.88863684099979,
"grad_norm": 1.359375,
"learning_rate": 0.00046421460519304684,
"loss": 5.2433,
"mean_token_accuracy": 0.17996943444013597,
"num_tokens": 41472677.0,
"step": 22480
},
{
"entropy": 5.573593950271606,
"epoch": 1.8890569208149548,
"grad_norm": 1.25,
"learning_rate": 0.000464198399232014,
"loss": 5.3772,
"mean_token_accuracy": 0.17365273535251619,
"num_tokens": 41482867.0,
"step": 22485
},
{
"entropy": 5.6978404998779295,
"epoch": 1.8894770006301198,
"grad_norm": 1.3046875,
"learning_rate": 0.0004641821899194291,
"loss": 5.3248,
"mean_token_accuracy": 0.17080484330654144,
"num_tokens": 41493432.0,
"step": 22490
},
{
"entropy": 5.676365518569947,
"epoch": 1.8898970804452846,
"grad_norm": 1.265625,
"learning_rate": 0.00046416597725557903,
"loss": 5.3675,
"mean_token_accuracy": 0.17210190147161483,
"num_tokens": 41503807.0,
"step": 22495
},
{
"entropy": 5.562070655822754,
"epoch": 1.8903171602604494,
"grad_norm": 1.296875,
"learning_rate": 0.000464149761240751,
"loss": 5.2482,
"mean_token_accuracy": 0.18205028623342515,
"num_tokens": 41512524.0,
"step": 22500
},
{
"entropy": 5.618170547485351,
"epoch": 1.8907372400756144,
"grad_norm": 1.375,
"learning_rate": 0.00046413354187523244,
"loss": 5.365,
"mean_token_accuracy": 0.17258133441209794,
"num_tokens": 41521915.0,
"step": 22505
},
{
"entropy": 5.57930359840393,
"epoch": 1.8911573198907794,
"grad_norm": 1.21875,
"learning_rate": 0.0004641173191593105,
"loss": 5.2976,
"mean_token_accuracy": 0.1764902427792549,
"num_tokens": 41530293.0,
"step": 22510
},
{
"entropy": 5.611503171920776,
"epoch": 1.8915773997059442,
"grad_norm": 1.3125,
"learning_rate": 0.00046410109309327275,
"loss": 5.3551,
"mean_token_accuracy": 0.17041792273521422,
"num_tokens": 41538660.0,
"step": 22515
},
{
"entropy": 5.581430292129516,
"epoch": 1.891997479521109,
"grad_norm": 1.2890625,
"learning_rate": 0.00046408486367740647,
"loss": 5.3034,
"mean_token_accuracy": 0.1853417456150055,
"num_tokens": 41547952.0,
"step": 22520
},
{
"entropy": 5.566416215896607,
"epoch": 1.8924175593362738,
"grad_norm": 1.203125,
"learning_rate": 0.0004640686309119992,
"loss": 5.2356,
"mean_token_accuracy": 0.18202127665281295,
"num_tokens": 41557093.0,
"step": 22525
},
{
"entropy": 5.513507223129272,
"epoch": 1.8928376391514388,
"grad_norm": 1.21875,
"learning_rate": 0.00046405239479733844,
"loss": 5.2542,
"mean_token_accuracy": 0.18118621706962584,
"num_tokens": 41565836.0,
"step": 22530
},
{
"entropy": 5.538067817687988,
"epoch": 1.8932577189666038,
"grad_norm": 1.3203125,
"learning_rate": 0.0004640361553337119,
"loss": 5.2868,
"mean_token_accuracy": 0.18781596422195435,
"num_tokens": 41575365.0,
"step": 22535
},
{
"entropy": 5.544848108291626,
"epoch": 1.8936777987817686,
"grad_norm": 1.3046875,
"learning_rate": 0.00046401991252140715,
"loss": 5.2605,
"mean_token_accuracy": 0.17988629639148712,
"num_tokens": 41583690.0,
"step": 22540
},
{
"entropy": 5.631093978881836,
"epoch": 1.8940978785969333,
"grad_norm": 1.2265625,
"learning_rate": 0.000464003666360712,
"loss": 5.2473,
"mean_token_accuracy": 0.17940072417259217,
"num_tokens": 41593536.0,
"step": 22545
},
{
"entropy": 5.556439781188965,
"epoch": 1.8945179584120981,
"grad_norm": 1.390625,
"learning_rate": 0.0004639874168519143,
"loss": 5.2396,
"mean_token_accuracy": 0.18124915212392806,
"num_tokens": 41602543.0,
"step": 22550
},
{
"entropy": 5.537446022033691,
"epoch": 1.8949380382272631,
"grad_norm": 1.359375,
"learning_rate": 0.0004639711639953017,
"loss": 5.307,
"mean_token_accuracy": 0.1749298617243767,
"num_tokens": 41611634.0,
"step": 22555
},
{
"entropy": 5.516814231872559,
"epoch": 1.8953581180424282,
"grad_norm": 1.265625,
"learning_rate": 0.0004639549077911623,
"loss": 5.2653,
"mean_token_accuracy": 0.17714737206697465,
"num_tokens": 41621400.0,
"step": 22560
},
{
"entropy": 5.57026481628418,
"epoch": 1.895778197857593,
"grad_norm": 1.34375,
"learning_rate": 0.00046393864823978406,
"loss": 5.2401,
"mean_token_accuracy": 0.18082942366600036,
"num_tokens": 41631070.0,
"step": 22565
},
{
"entropy": 5.63809118270874,
"epoch": 1.8961982776727577,
"grad_norm": 1.28125,
"learning_rate": 0.0004639223853414549,
"loss": 5.3453,
"mean_token_accuracy": 0.17696365267038344,
"num_tokens": 41641442.0,
"step": 22570
},
{
"entropy": 5.606258487701416,
"epoch": 1.8966183574879227,
"grad_norm": 1.2890625,
"learning_rate": 0.000463906119096463,
"loss": 5.345,
"mean_token_accuracy": 0.17257939726114274,
"num_tokens": 41651616.0,
"step": 22575
},
{
"entropy": 5.608662748336792,
"epoch": 1.8970384373030877,
"grad_norm": 1.296875,
"learning_rate": 0.0004638898495050963,
"loss": 5.2537,
"mean_token_accuracy": 0.17851004898548126,
"num_tokens": 41660704.0,
"step": 22580
},
{
"entropy": 5.48230562210083,
"epoch": 1.8974585171182525,
"grad_norm": 1.2734375,
"learning_rate": 0.0004638735765676434,
"loss": 5.3113,
"mean_token_accuracy": 0.17346308678388594,
"num_tokens": 41669824.0,
"step": 22585
},
{
"entropy": 5.538960075378418,
"epoch": 1.8978785969334173,
"grad_norm": 1.25,
"learning_rate": 0.0004638573002843922,
"loss": 5.2357,
"mean_token_accuracy": 0.189484603703022,
"num_tokens": 41680082.0,
"step": 22590
},
{
"entropy": 5.562324333190918,
"epoch": 1.898298676748582,
"grad_norm": 1.59375,
"learning_rate": 0.0004638410206556312,
"loss": 5.1908,
"mean_token_accuracy": 0.18119184523820878,
"num_tokens": 41689282.0,
"step": 22595
},
{
"entropy": 5.571649885177612,
"epoch": 1.8987187565637471,
"grad_norm": 1.2421875,
"learning_rate": 0.0004638247376816489,
"loss": 5.332,
"mean_token_accuracy": 0.17145951092243195,
"num_tokens": 41699059.0,
"step": 22600
},
{
"entropy": 5.697116231918335,
"epoch": 1.8991388363789121,
"grad_norm": 1.2421875,
"learning_rate": 0.0004638084513627335,
"loss": 5.4341,
"mean_token_accuracy": 0.16782100647687911,
"num_tokens": 41708674.0,
"step": 22605
},
{
"entropy": 5.635008859634399,
"epoch": 1.899558916194077,
"grad_norm": 1.4453125,
"learning_rate": 0.00046379216169917356,
"loss": 5.3113,
"mean_token_accuracy": 0.17720879167318343,
"num_tokens": 41718418.0,
"step": 22610
},
{
"entropy": 5.573693513870239,
"epoch": 1.8999789960092417,
"grad_norm": 1.328125,
"learning_rate": 0.0004637758686912577,
"loss": 5.3352,
"mean_token_accuracy": 0.1739557534456253,
"num_tokens": 41728229.0,
"step": 22615
},
{
"entropy": 5.564685392379761,
"epoch": 1.9003990758244065,
"grad_norm": 1.2734375,
"learning_rate": 0.00046375957233927456,
"loss": 5.3012,
"mean_token_accuracy": 0.17882733047008514,
"num_tokens": 41737074.0,
"step": 22620
},
{
"entropy": 5.601039218902588,
"epoch": 1.9008191556395715,
"grad_norm": 1.25,
"learning_rate": 0.00046374327264351277,
"loss": 5.1863,
"mean_token_accuracy": 0.18088234812021256,
"num_tokens": 41745823.0,
"step": 22625
},
{
"entropy": 5.480035972595215,
"epoch": 1.9012392354547365,
"grad_norm": 1.3125,
"learning_rate": 0.00046372696960426116,
"loss": 5.2779,
"mean_token_accuracy": 0.180365127325058,
"num_tokens": 41754591.0,
"step": 22630
},
{
"entropy": 5.54322190284729,
"epoch": 1.9016593152699013,
"grad_norm": 1.5859375,
"learning_rate": 0.00046371066322180846,
"loss": 5.2764,
"mean_token_accuracy": 0.17605713307857512,
"num_tokens": 41763585.0,
"step": 22635
},
{
"entropy": 5.627299880981445,
"epoch": 1.902079395085066,
"grad_norm": 1.3203125,
"learning_rate": 0.00046369435349644344,
"loss": 5.2984,
"mean_token_accuracy": 0.17723716497421266,
"num_tokens": 41772712.0,
"step": 22640
},
{
"entropy": 5.588305234909058,
"epoch": 1.902499474900231,
"grad_norm": 1.375,
"learning_rate": 0.00046367804042845515,
"loss": 5.1966,
"mean_token_accuracy": 0.1873901903629303,
"num_tokens": 41781516.0,
"step": 22645
},
{
"entropy": 5.554796981811523,
"epoch": 1.902919554715396,
"grad_norm": 1.2421875,
"learning_rate": 0.00046366172401813253,
"loss": 5.2808,
"mean_token_accuracy": 0.17495377361774445,
"num_tokens": 41790731.0,
"step": 22650
},
{
"entropy": 5.577728605270385,
"epoch": 1.9033396345305609,
"grad_norm": 1.25,
"learning_rate": 0.0004636454042657647,
"loss": 5.2884,
"mean_token_accuracy": 0.17949878424406052,
"num_tokens": 41799654.0,
"step": 22655
},
{
"entropy": 5.488429546356201,
"epoch": 1.9037597143457257,
"grad_norm": 1.1875,
"learning_rate": 0.00046362908117164055,
"loss": 5.1628,
"mean_token_accuracy": 0.18480223864316941,
"num_tokens": 41809408.0,
"step": 22660
},
{
"entropy": 5.521701526641846,
"epoch": 1.9041797941608904,
"grad_norm": 1.21875,
"learning_rate": 0.0004636127547360494,
"loss": 5.3496,
"mean_token_accuracy": 0.17237123399972915,
"num_tokens": 41818868.0,
"step": 22665
},
{
"entropy": 5.576029348373413,
"epoch": 1.9045998739760555,
"grad_norm": 1.2578125,
"learning_rate": 0.0004635964249592804,
"loss": 5.2525,
"mean_token_accuracy": 0.18107630610466002,
"num_tokens": 41827156.0,
"step": 22670
},
{
"entropy": 5.64979100227356,
"epoch": 1.9050199537912205,
"grad_norm": 1.3359375,
"learning_rate": 0.0004635800918416229,
"loss": 5.362,
"mean_token_accuracy": 0.16546206325292587,
"num_tokens": 41837025.0,
"step": 22675
},
{
"entropy": 5.649572420120239,
"epoch": 1.9054400336063853,
"grad_norm": 1.28125,
"learning_rate": 0.00046356375538336616,
"loss": 5.278,
"mean_token_accuracy": 0.18009049147367479,
"num_tokens": 41846196.0,
"step": 22680
},
{
"entropy": 5.456643199920654,
"epoch": 1.90586011342155,
"grad_norm": 2.546875,
"learning_rate": 0.00046354741558479956,
"loss": 5.2553,
"mean_token_accuracy": 0.17622273415327072,
"num_tokens": 41855030.0,
"step": 22685
},
{
"entropy": 5.493377017974853,
"epoch": 1.9062801932367148,
"grad_norm": 1.1953125,
"learning_rate": 0.0004635310724462126,
"loss": 5.1425,
"mean_token_accuracy": 0.18287856876850128,
"num_tokens": 41863740.0,
"step": 22690
},
{
"entropy": 5.552933311462402,
"epoch": 1.9067002730518798,
"grad_norm": 1.2890625,
"learning_rate": 0.0004635147259678948,
"loss": 5.2806,
"mean_token_accuracy": 0.17719440758228303,
"num_tokens": 41873376.0,
"step": 22695
},
{
"entropy": 5.637425708770752,
"epoch": 1.9071203528670448,
"grad_norm": 1.4140625,
"learning_rate": 0.00046349837615013563,
"loss": 5.3832,
"mean_token_accuracy": 0.1679088518023491,
"num_tokens": 41882491.0,
"step": 22700
},
{
"entropy": 5.555142211914062,
"epoch": 1.9075404326822096,
"grad_norm": 1.3125,
"learning_rate": 0.0004634820229932248,
"loss": 5.2501,
"mean_token_accuracy": 0.1784417062997818,
"num_tokens": 41891357.0,
"step": 22705
},
{
"entropy": 5.590807676315308,
"epoch": 1.9079605124973744,
"grad_norm": 1.4296875,
"learning_rate": 0.00046346566649745205,
"loss": 5.3155,
"mean_token_accuracy": 0.1760914534330368,
"num_tokens": 41899874.0,
"step": 22710
},
{
"entropy": 5.537205505371094,
"epoch": 1.9083805923125394,
"grad_norm": 1.4140625,
"learning_rate": 0.000463449306663107,
"loss": 5.2856,
"mean_token_accuracy": 0.18188251107931136,
"num_tokens": 41909673.0,
"step": 22715
},
{
"entropy": 5.609598159790039,
"epoch": 1.9088006721277042,
"grad_norm": 1.296875,
"learning_rate": 0.0004634329434904796,
"loss": 5.3604,
"mean_token_accuracy": 0.17319812774658203,
"num_tokens": 41919126.0,
"step": 22720
},
{
"entropy": 5.51084885597229,
"epoch": 1.9092207519428692,
"grad_norm": 1.2890625,
"learning_rate": 0.0004634165769798596,
"loss": 5.1818,
"mean_token_accuracy": 0.18380433768033982,
"num_tokens": 41927751.0,
"step": 22725
},
{
"entropy": 5.5393534183502195,
"epoch": 1.909640831758034,
"grad_norm": 1.421875,
"learning_rate": 0.0004634002071315369,
"loss": 5.272,
"mean_token_accuracy": 0.18170064091682434,
"num_tokens": 41937290.0,
"step": 22730
},
{
"entropy": 5.534055423736572,
"epoch": 1.9100609115731988,
"grad_norm": 2.28125,
"learning_rate": 0.00046338383394580157,
"loss": 5.2234,
"mean_token_accuracy": 0.18273229897022247,
"num_tokens": 41947186.0,
"step": 22735
},
{
"entropy": 5.513985204696655,
"epoch": 1.9104809913883638,
"grad_norm": 1.265625,
"learning_rate": 0.00046336745742294366,
"loss": 5.2248,
"mean_token_accuracy": 0.1748688653111458,
"num_tokens": 41956197.0,
"step": 22740
},
{
"entropy": 5.5728717803955075,
"epoch": 1.9109010712035288,
"grad_norm": 1.234375,
"learning_rate": 0.00046335107756325316,
"loss": 5.2236,
"mean_token_accuracy": 0.1744801178574562,
"num_tokens": 41965881.0,
"step": 22745
},
{
"entropy": 5.616703462600708,
"epoch": 1.9113211510186936,
"grad_norm": 1.2890625,
"learning_rate": 0.0004633346943670204,
"loss": 5.2847,
"mean_token_accuracy": 0.1695892408490181,
"num_tokens": 41975031.0,
"step": 22750
},
{
"entropy": 5.564615392684937,
"epoch": 1.9117412308338584,
"grad_norm": 1.328125,
"learning_rate": 0.0004633183078345355,
"loss": 5.2408,
"mean_token_accuracy": 0.17534803748130798,
"num_tokens": 41984187.0,
"step": 22755
},
{
"entropy": 5.578935289382935,
"epoch": 1.9121613106490232,
"grad_norm": 1.3046875,
"learning_rate": 0.00046330191796608867,
"loss": 5.3617,
"mean_token_accuracy": 0.16856226176023484,
"num_tokens": 41993185.0,
"step": 22760
},
{
"entropy": 5.608171606063843,
"epoch": 1.9125813904641882,
"grad_norm": 1.21875,
"learning_rate": 0.0004632855247619704,
"loss": 5.2995,
"mean_token_accuracy": 0.18448079228401185,
"num_tokens": 42002521.0,
"step": 22765
},
{
"entropy": 5.624209642410278,
"epoch": 1.9130014702793532,
"grad_norm": 1.578125,
"learning_rate": 0.000463269128222471,
"loss": 5.419,
"mean_token_accuracy": 0.17294985055923462,
"num_tokens": 42011444.0,
"step": 22770
},
{
"entropy": 5.546602201461792,
"epoch": 1.913421550094518,
"grad_norm": 1.3828125,
"learning_rate": 0.0004632527283478809,
"loss": 5.2959,
"mean_token_accuracy": 0.17635179907083512,
"num_tokens": 42020916.0,
"step": 22775
},
{
"entropy": 5.632197761535645,
"epoch": 1.9138416299096828,
"grad_norm": 1.2734375,
"learning_rate": 0.00046323632513849063,
"loss": 5.3224,
"mean_token_accuracy": 0.17547445893287658,
"num_tokens": 42029467.0,
"step": 22780
},
{
"entropy": 5.467329978942871,
"epoch": 1.9142617097248478,
"grad_norm": 1.3828125,
"learning_rate": 0.0004632199185945908,
"loss": 5.0814,
"mean_token_accuracy": 0.18911270648241044,
"num_tokens": 42037435.0,
"step": 22785
},
{
"entropy": 5.6035833835601805,
"epoch": 1.9146817895400126,
"grad_norm": 1.6796875,
"learning_rate": 0.0004632035087164721,
"loss": 5.323,
"mean_token_accuracy": 0.17639073580503464,
"num_tokens": 42046943.0,
"step": 22790
},
{
"entropy": 5.524356889724731,
"epoch": 1.9151018693551776,
"grad_norm": 1.2578125,
"learning_rate": 0.0004631870955044251,
"loss": 5.198,
"mean_token_accuracy": 0.18404520452022552,
"num_tokens": 42055804.0,
"step": 22795
},
{
"entropy": 5.541313982009887,
"epoch": 1.9155219491703424,
"grad_norm": 1.265625,
"learning_rate": 0.00046317067895874063,
"loss": 5.1974,
"mean_token_accuracy": 0.18954436331987382,
"num_tokens": 42064655.0,
"step": 22800
},
{
"entropy": 5.578824901580811,
"epoch": 1.9159420289855071,
"grad_norm": 1.234375,
"learning_rate": 0.00046315425907970947,
"loss": 5.2583,
"mean_token_accuracy": 0.17812697291374208,
"num_tokens": 42073663.0,
"step": 22805
},
{
"entropy": 5.608169984817505,
"epoch": 1.9163621088006721,
"grad_norm": 1.2421875,
"learning_rate": 0.0004631378358676225,
"loss": 5.3252,
"mean_token_accuracy": 0.17986929714679717,
"num_tokens": 42083931.0,
"step": 22810
},
{
"entropy": 5.630350828170776,
"epoch": 1.9167821886158372,
"grad_norm": 1.25,
"learning_rate": 0.0004631214093227706,
"loss": 5.2855,
"mean_token_accuracy": 0.17463573366403579,
"num_tokens": 42093782.0,
"step": 22815
},
{
"entropy": 5.545824670791626,
"epoch": 1.917202268431002,
"grad_norm": 1.2578125,
"learning_rate": 0.0004631049794454448,
"loss": 5.274,
"mean_token_accuracy": 0.18353514969348908,
"num_tokens": 42103392.0,
"step": 22820
},
{
"entropy": 5.566473913192749,
"epoch": 1.9176223482461667,
"grad_norm": 1.2734375,
"learning_rate": 0.0004630885462359362,
"loss": 5.1972,
"mean_token_accuracy": 0.18071317225694655,
"num_tokens": 42112051.0,
"step": 22825
},
{
"entropy": 5.503907489776611,
"epoch": 1.9180424280613315,
"grad_norm": 1.3828125,
"learning_rate": 0.0004630721096945358,
"loss": 5.2329,
"mean_token_accuracy": 0.1836079403758049,
"num_tokens": 42120156.0,
"step": 22830
},
{
"entropy": 5.614422369003296,
"epoch": 1.9184625078764965,
"grad_norm": 1.40625,
"learning_rate": 0.0004630556698215349,
"loss": 5.3326,
"mean_token_accuracy": 0.18454811125993728,
"num_tokens": 42129564.0,
"step": 22835
},
{
"entropy": 5.621020698547364,
"epoch": 1.9188825876916615,
"grad_norm": 1.375,
"learning_rate": 0.00046303922661722466,
"loss": 5.3852,
"mean_token_accuracy": 0.1710334151983261,
"num_tokens": 42138144.0,
"step": 22840
},
{
"entropy": 5.560887956619263,
"epoch": 1.9193026675068263,
"grad_norm": 1.359375,
"learning_rate": 0.00046302278008189627,
"loss": 5.2235,
"mean_token_accuracy": 0.1777787208557129,
"num_tokens": 42147701.0,
"step": 22845
},
{
"entropy": 5.489139127731323,
"epoch": 1.919722747321991,
"grad_norm": 1.1953125,
"learning_rate": 0.0004630063302158412,
"loss": 5.1869,
"mean_token_accuracy": 0.1873548299074173,
"num_tokens": 42156772.0,
"step": 22850
},
{
"entropy": 5.483901643753052,
"epoch": 1.920142827137156,
"grad_norm": 1.2578125,
"learning_rate": 0.00046298987701935066,
"loss": 5.1339,
"mean_token_accuracy": 0.18444730788469316,
"num_tokens": 42165227.0,
"step": 22855
},
{
"entropy": 5.5027961254119875,
"epoch": 1.920562906952321,
"grad_norm": 1.3203125,
"learning_rate": 0.0004629734204927164,
"loss": 5.1758,
"mean_token_accuracy": 0.18389471173286437,
"num_tokens": 42174800.0,
"step": 22860
},
{
"entropy": 5.592266321182251,
"epoch": 1.920982986767486,
"grad_norm": 1.3125,
"learning_rate": 0.0004629569606362298,
"loss": 5.2532,
"mean_token_accuracy": 0.17747886031866072,
"num_tokens": 42184301.0,
"step": 22865
},
{
"entropy": 5.572080326080322,
"epoch": 1.9214030665826507,
"grad_norm": 1.3046875,
"learning_rate": 0.0004629404974501823,
"loss": 5.2451,
"mean_token_accuracy": 0.17935437709093094,
"num_tokens": 42193266.0,
"step": 22870
},
{
"entropy": 5.4917596817016605,
"epoch": 1.9218231463978155,
"grad_norm": 1.4375,
"learning_rate": 0.0004629240309348658,
"loss": 5.195,
"mean_token_accuracy": 0.17495142966508864,
"num_tokens": 42202051.0,
"step": 22875
},
{
"entropy": 5.502635431289673,
"epoch": 1.9222432262129805,
"grad_norm": 1.3203125,
"learning_rate": 0.0004629075610905717,
"loss": 5.0974,
"mean_token_accuracy": 0.19193665981292723,
"num_tokens": 42210716.0,
"step": 22880
},
{
"entropy": 5.477285480499267,
"epoch": 1.9226633060281455,
"grad_norm": 1.1796875,
"learning_rate": 0.000462891087917592,
"loss": 5.1838,
"mean_token_accuracy": 0.1799885168671608,
"num_tokens": 42219930.0,
"step": 22885
},
{
"entropy": 5.561040019989013,
"epoch": 1.9230833858433103,
"grad_norm": 1.3828125,
"learning_rate": 0.00046287461141621844,
"loss": 5.2383,
"mean_token_accuracy": 0.18622140735387802,
"num_tokens": 42228864.0,
"step": 22890
},
{
"entropy": 5.54845495223999,
"epoch": 1.923503465658475,
"grad_norm": 1.3046875,
"learning_rate": 0.0004628581315867429,
"loss": 5.2972,
"mean_token_accuracy": 0.17803694009780885,
"num_tokens": 42238030.0,
"step": 22895
},
{
"entropy": 5.639510869979858,
"epoch": 1.9239235454736399,
"grad_norm": 1.453125,
"learning_rate": 0.00046284164842945723,
"loss": 5.2747,
"mean_token_accuracy": 0.17670065313577651,
"num_tokens": 42247818.0,
"step": 22900
},
{
"entropy": 5.578675270080566,
"epoch": 1.9243436252888049,
"grad_norm": 1.390625,
"learning_rate": 0.0004628251619446536,
"loss": 5.2275,
"mean_token_accuracy": 0.1789063736796379,
"num_tokens": 42256772.0,
"step": 22905
},
{
"entropy": 5.510556554794311,
"epoch": 1.9247637051039699,
"grad_norm": 1.3046875,
"learning_rate": 0.00046280867213262385,
"loss": 5.3041,
"mean_token_accuracy": 0.16972119808197023,
"num_tokens": 42265620.0,
"step": 22910
},
{
"entropy": 5.651352739334106,
"epoch": 1.9251837849191347,
"grad_norm": 1.3125,
"learning_rate": 0.0004627921789936602,
"loss": 5.3404,
"mean_token_accuracy": 0.17744974344968795,
"num_tokens": 42274998.0,
"step": 22915
},
{
"entropy": 5.646574974060059,
"epoch": 1.9256038647342995,
"grad_norm": 1.1953125,
"learning_rate": 0.00046277568252805476,
"loss": 5.284,
"mean_token_accuracy": 0.18072347491979598,
"num_tokens": 42284849.0,
"step": 22920
},
{
"entropy": 5.531821537017822,
"epoch": 1.9260239445494642,
"grad_norm": 1.359375,
"learning_rate": 0.0004627591827360998,
"loss": 5.2655,
"mean_token_accuracy": 0.1810305044054985,
"num_tokens": 42294133.0,
"step": 22925
},
{
"entropy": 5.528905248641967,
"epoch": 1.9264440243646292,
"grad_norm": 1.25,
"learning_rate": 0.0004627426796180876,
"loss": 5.2572,
"mean_token_accuracy": 0.17881035208702087,
"num_tokens": 42302765.0,
"step": 22930
},
{
"entropy": 5.588286113739014,
"epoch": 1.9268641041797943,
"grad_norm": 1.4140625,
"learning_rate": 0.00046272617317431056,
"loss": 5.2148,
"mean_token_accuracy": 0.17467682361602782,
"num_tokens": 42311829.0,
"step": 22935
},
{
"entropy": 5.629926109313965,
"epoch": 1.927284183994959,
"grad_norm": 1.25,
"learning_rate": 0.00046270966340506087,
"loss": 5.3278,
"mean_token_accuracy": 0.1830289915204048,
"num_tokens": 42321294.0,
"step": 22940
},
{
"entropy": 5.575673294067383,
"epoch": 1.9277042638101238,
"grad_norm": 1.3359375,
"learning_rate": 0.00046269315031063137,
"loss": 5.1622,
"mean_token_accuracy": 0.1837637633085251,
"num_tokens": 42329379.0,
"step": 22945
},
{
"entropy": 5.575409269332885,
"epoch": 1.9281243436252888,
"grad_norm": 1.25,
"learning_rate": 0.00046267663389131425,
"loss": 5.3279,
"mean_token_accuracy": 0.16886840760707855,
"num_tokens": 42339867.0,
"step": 22950
},
{
"entropy": 5.578000688552857,
"epoch": 1.9285444234404538,
"grad_norm": 1.5078125,
"learning_rate": 0.00046266011414740213,
"loss": 5.3496,
"mean_token_accuracy": 0.17497621178627015,
"num_tokens": 42350174.0,
"step": 22955
},
{
"entropy": 5.552455091476441,
"epoch": 1.9289645032556186,
"grad_norm": 1.3828125,
"learning_rate": 0.0004626435910791878,
"loss": 5.2141,
"mean_token_accuracy": 0.1810801714658737,
"num_tokens": 42359214.0,
"step": 22960
},
{
"entropy": 5.564496994018555,
"epoch": 1.9293845830707834,
"grad_norm": 1.390625,
"learning_rate": 0.00046262706468696386,
"loss": 5.2899,
"mean_token_accuracy": 0.1764672428369522,
"num_tokens": 42367965.0,
"step": 22965
},
{
"entropy": 5.615366458892822,
"epoch": 1.9298046628859482,
"grad_norm": 1.46875,
"learning_rate": 0.0004626105349710231,
"loss": 5.2991,
"mean_token_accuracy": 0.173054276406765,
"num_tokens": 42377233.0,
"step": 22970
},
{
"entropy": 5.750267505645752,
"epoch": 1.9302247427011132,
"grad_norm": 1.265625,
"learning_rate": 0.0004625940019316584,
"loss": 5.3203,
"mean_token_accuracy": 0.17039854824543,
"num_tokens": 42386060.0,
"step": 22975
},
{
"entropy": 5.575665044784546,
"epoch": 1.9306448225162782,
"grad_norm": 1.3671875,
"learning_rate": 0.00046257746556916236,
"loss": 5.3134,
"mean_token_accuracy": 0.17894930243492127,
"num_tokens": 42395659.0,
"step": 22980
},
{
"entropy": 5.593395519256592,
"epoch": 1.931064902331443,
"grad_norm": 1.296875,
"learning_rate": 0.00046256092588382825,
"loss": 5.3089,
"mean_token_accuracy": 0.1758568286895752,
"num_tokens": 42403531.0,
"step": 22985
},
{
"entropy": 5.596629667282104,
"epoch": 1.9314849821466078,
"grad_norm": 1.2734375,
"learning_rate": 0.00046254438287594884,
"loss": 5.2539,
"mean_token_accuracy": 0.18161848336458206,
"num_tokens": 42412364.0,
"step": 22990
},
{
"entropy": 5.533616590499878,
"epoch": 1.9319050619617726,
"grad_norm": 1.3828125,
"learning_rate": 0.00046252783654581733,
"loss": 5.2532,
"mean_token_accuracy": 0.17354945093393326,
"num_tokens": 42422276.0,
"step": 22995
},
{
"entropy": 5.621906805038452,
"epoch": 1.9323251417769376,
"grad_norm": 1.3828125,
"learning_rate": 0.0004625112868937267,
"loss": 5.2763,
"mean_token_accuracy": 0.18066140562295913,
"num_tokens": 42430853.0,
"step": 23000
},
{
"entropy": 5.507568359375,
"epoch": 1.9327452215921026,
"grad_norm": 1.1875,
"learning_rate": 0.0004624947339199702,
"loss": 5.1618,
"mean_token_accuracy": 0.181491519510746,
"num_tokens": 42439034.0,
"step": 23005
},
{
"entropy": 5.583389854431152,
"epoch": 1.9331653014072674,
"grad_norm": 1.1796875,
"learning_rate": 0.000462478177624841,
"loss": 5.3344,
"mean_token_accuracy": 0.17821798473596573,
"num_tokens": 42448494.0,
"step": 23010
},
{
"entropy": 5.62125039100647,
"epoch": 1.9335853812224322,
"grad_norm": 1.1875,
"learning_rate": 0.00046246161800863244,
"loss": 5.2478,
"mean_token_accuracy": 0.1788298413157463,
"num_tokens": 42457188.0,
"step": 23015
},
{
"entropy": 5.566138172149659,
"epoch": 1.9340054610375972,
"grad_norm": 1.2265625,
"learning_rate": 0.0004624450550716379,
"loss": 5.3213,
"mean_token_accuracy": 0.17829720973968505,
"num_tokens": 42466321.0,
"step": 23020
},
{
"entropy": 5.5836546421051025,
"epoch": 1.934425540852762,
"grad_norm": 1.171875,
"learning_rate": 0.0004624284888141507,
"loss": 5.2612,
"mean_token_accuracy": 0.17595684081315993,
"num_tokens": 42475879.0,
"step": 23025
},
{
"entropy": 5.634458112716675,
"epoch": 1.934845620667927,
"grad_norm": 1.2890625,
"learning_rate": 0.0004624119192364643,
"loss": 5.3969,
"mean_token_accuracy": 0.17100559324026107,
"num_tokens": 42484988.0,
"step": 23030
},
{
"entropy": 5.5727685451507565,
"epoch": 1.9352657004830918,
"grad_norm": 1.2890625,
"learning_rate": 0.00046239534633887223,
"loss": 5.2141,
"mean_token_accuracy": 0.17606755048036576,
"num_tokens": 42493764.0,
"step": 23035
},
{
"entropy": 5.64964599609375,
"epoch": 1.9356857802982566,
"grad_norm": 1.2890625,
"learning_rate": 0.0004623787701216682,
"loss": 5.4362,
"mean_token_accuracy": 0.17573302090168,
"num_tokens": 42503312.0,
"step": 23040
},
{
"entropy": 5.577664566040039,
"epoch": 1.9361058601134216,
"grad_norm": 1.234375,
"learning_rate": 0.00046236219058514566,
"loss": 5.2767,
"mean_token_accuracy": 0.1744777515530586,
"num_tokens": 42512303.0,
"step": 23045
},
{
"entropy": 5.4790503025054935,
"epoch": 1.9365259399285866,
"grad_norm": 1.125,
"learning_rate": 0.0004623456077295984,
"loss": 5.1452,
"mean_token_accuracy": 0.18964465856552123,
"num_tokens": 42520928.0,
"step": 23050
},
{
"entropy": 5.500149250030518,
"epoch": 1.9369460197437514,
"grad_norm": 1.28125,
"learning_rate": 0.0004623290215553201,
"loss": 5.217,
"mean_token_accuracy": 0.18687326908111573,
"num_tokens": 42529945.0,
"step": 23055
},
{
"entropy": 5.5279261589050295,
"epoch": 1.9373660995589161,
"grad_norm": 1.3984375,
"learning_rate": 0.0004623124320626048,
"loss": 5.2598,
"mean_token_accuracy": 0.18028688579797744,
"num_tokens": 42539078.0,
"step": 23060
},
{
"entropy": 5.576948356628418,
"epoch": 1.937786179374081,
"grad_norm": 1.3515625,
"learning_rate": 0.0004622958392517461,
"loss": 5.2134,
"mean_token_accuracy": 0.18364020735025405,
"num_tokens": 42547842.0,
"step": 23065
},
{
"entropy": 5.555529451370239,
"epoch": 1.938206259189246,
"grad_norm": 1.40625,
"learning_rate": 0.0004622792431230381,
"loss": 5.1664,
"mean_token_accuracy": 0.18663084357976914,
"num_tokens": 42556574.0,
"step": 23070
},
{
"entropy": 5.5518426418304445,
"epoch": 1.938626339004411,
"grad_norm": 1.4375,
"learning_rate": 0.00046226264367677476,
"loss": 5.2732,
"mean_token_accuracy": 0.17023178189992905,
"num_tokens": 42565906.0,
"step": 23075
},
{
"entropy": 5.566661691665649,
"epoch": 1.9390464188195757,
"grad_norm": 1.28125,
"learning_rate": 0.0004622460409132501,
"loss": 5.236,
"mean_token_accuracy": 0.18457957804203035,
"num_tokens": 42574929.0,
"step": 23080
},
{
"entropy": 5.618380928039551,
"epoch": 1.9394664986347405,
"grad_norm": 1.2890625,
"learning_rate": 0.0004622294348327582,
"loss": 5.289,
"mean_token_accuracy": 0.17092613279819488,
"num_tokens": 42585185.0,
"step": 23085
},
{
"entropy": 5.524164485931396,
"epoch": 1.9398865784499055,
"grad_norm": 1.3203125,
"learning_rate": 0.00046221282543559334,
"loss": 5.2999,
"mean_token_accuracy": 0.17540085613727568,
"num_tokens": 42594272.0,
"step": 23090
},
{
"entropy": 5.504875946044922,
"epoch": 1.9403066582650703,
"grad_norm": 1.3359375,
"learning_rate": 0.00046219621272204967,
"loss": 5.2143,
"mean_token_accuracy": 0.1792544975876808,
"num_tokens": 42603410.0,
"step": 23095
},
{
"entropy": 5.556964588165283,
"epoch": 1.9407267380802353,
"grad_norm": 1.2578125,
"learning_rate": 0.00046217959669242145,
"loss": 5.4036,
"mean_token_accuracy": 0.16618091911077498,
"num_tokens": 42613879.0,
"step": 23100
},
{
"entropy": 5.583468198776245,
"epoch": 1.9411468178954001,
"grad_norm": 1.2734375,
"learning_rate": 0.000462162977347003,
"loss": 5.2039,
"mean_token_accuracy": 0.18091069161891937,
"num_tokens": 42623323.0,
"step": 23105
},
{
"entropy": 5.636310768127442,
"epoch": 1.941566897710565,
"grad_norm": 1.234375,
"learning_rate": 0.00046214635468608885,
"loss": 5.265,
"mean_token_accuracy": 0.1810379669070244,
"num_tokens": 42632365.0,
"step": 23110
},
{
"entropy": 5.554380321502686,
"epoch": 1.94198697752573,
"grad_norm": 1.1875,
"learning_rate": 0.00046212972870997336,
"loss": 5.2256,
"mean_token_accuracy": 0.18930767923593522,
"num_tokens": 42641872.0,
"step": 23115
},
{
"entropy": 5.553452634811402,
"epoch": 1.942407057340895,
"grad_norm": 1.15625,
"learning_rate": 0.0004621130994189511,
"loss": 5.3119,
"mean_token_accuracy": 0.18180701732635499,
"num_tokens": 42652031.0,
"step": 23120
},
{
"entropy": 5.517439985275269,
"epoch": 1.9428271371560597,
"grad_norm": 1.265625,
"learning_rate": 0.0004620964668133166,
"loss": 5.2446,
"mean_token_accuracy": 0.17820723354816437,
"num_tokens": 42661040.0,
"step": 23125
},
{
"entropy": 5.5945065975189205,
"epoch": 1.9432472169712245,
"grad_norm": 1.234375,
"learning_rate": 0.0004620798308933646,
"loss": 5.2697,
"mean_token_accuracy": 0.17377900779247285,
"num_tokens": 42670559.0,
"step": 23130
},
{
"entropy": 5.6079041957855225,
"epoch": 1.9436672967863893,
"grad_norm": 1.21875,
"learning_rate": 0.0004620631916593897,
"loss": 5.2744,
"mean_token_accuracy": 0.1778078034520149,
"num_tokens": 42679883.0,
"step": 23135
},
{
"entropy": 5.667489004135132,
"epoch": 1.9440873766015543,
"grad_norm": 1.1171875,
"learning_rate": 0.0004620465491116867,
"loss": 5.3962,
"mean_token_accuracy": 0.16636696457862854,
"num_tokens": 42689746.0,
"step": 23140
},
{
"entropy": 5.6248866558074955,
"epoch": 1.9445074564167193,
"grad_norm": 1.21875,
"learning_rate": 0.00046202990325055034,
"loss": 5.3102,
"mean_token_accuracy": 0.1707261472940445,
"num_tokens": 42699685.0,
"step": 23145
},
{
"entropy": 5.536729764938355,
"epoch": 1.944927536231884,
"grad_norm": 1.2109375,
"learning_rate": 0.0004620132540762756,
"loss": 5.1763,
"mean_token_accuracy": 0.1770971119403839,
"num_tokens": 42708873.0,
"step": 23150
},
{
"entropy": 5.508062744140625,
"epoch": 1.9453476160470489,
"grad_norm": 1.4609375,
"learning_rate": 0.00046199660158915734,
"loss": 5.2328,
"mean_token_accuracy": 0.1747881919145584,
"num_tokens": 42717807.0,
"step": 23155
},
{
"entropy": 5.51996636390686,
"epoch": 1.9457676958622139,
"grad_norm": 1.3203125,
"learning_rate": 0.00046197994578949056,
"loss": 5.2955,
"mean_token_accuracy": 0.17850422114133835,
"num_tokens": 42726674.0,
"step": 23160
},
{
"entropy": 5.605596446990967,
"epoch": 1.9461877756773787,
"grad_norm": 1.4296875,
"learning_rate": 0.0004619632866775704,
"loss": 5.3616,
"mean_token_accuracy": 0.18147125244140624,
"num_tokens": 42735621.0,
"step": 23165
},
{
"entropy": 5.541454076766968,
"epoch": 1.9466078554925437,
"grad_norm": 1.296875,
"learning_rate": 0.0004619466242536918,
"loss": 5.2547,
"mean_token_accuracy": 0.17949382066726685,
"num_tokens": 42744945.0,
"step": 23170
},
{
"entropy": 5.5860639095306395,
"epoch": 1.9470279353077085,
"grad_norm": 1.296875,
"learning_rate": 0.0004619299585181501,
"loss": 5.3531,
"mean_token_accuracy": 0.17689019143581391,
"num_tokens": 42754906.0,
"step": 23175
},
{
"entropy": 5.640099143981933,
"epoch": 1.9474480151228732,
"grad_norm": 1.390625,
"learning_rate": 0.00046191328947124027,
"loss": 5.25,
"mean_token_accuracy": 0.17939385771751404,
"num_tokens": 42764673.0,
"step": 23180
},
{
"entropy": 5.454021310806274,
"epoch": 1.9478680949380383,
"grad_norm": 1.3359375,
"learning_rate": 0.00046189661711325784,
"loss": 5.2441,
"mean_token_accuracy": 0.18985147029161453,
"num_tokens": 42774528.0,
"step": 23185
},
{
"entropy": 5.488689374923706,
"epoch": 1.9482881747532033,
"grad_norm": 1.3125,
"learning_rate": 0.00046187994144449815,
"loss": 5.1613,
"mean_token_accuracy": 0.18611329793930054,
"num_tokens": 42783813.0,
"step": 23190
},
{
"entropy": 5.5707391738891605,
"epoch": 1.948708254568368,
"grad_norm": 1.359375,
"learning_rate": 0.0004618632624652565,
"loss": 5.2434,
"mean_token_accuracy": 0.17330222129821776,
"num_tokens": 42793483.0,
"step": 23195
},
{
"entropy": 5.552345180511475,
"epoch": 1.9491283343835328,
"grad_norm": 1.34375,
"learning_rate": 0.0004618465801758283,
"loss": 5.3134,
"mean_token_accuracy": 0.17470603734254836,
"num_tokens": 42803177.0,
"step": 23200
},
{
"entropy": 5.588451957702636,
"epoch": 1.9495484141986976,
"grad_norm": 1.28125,
"learning_rate": 0.00046182989457650925,
"loss": 5.3165,
"mean_token_accuracy": 0.17706115841865538,
"num_tokens": 42812395.0,
"step": 23205
},
{
"entropy": 5.577643632888794,
"epoch": 1.9499684940138626,
"grad_norm": 1.3203125,
"learning_rate": 0.00046181320566759476,
"loss": 5.2848,
"mean_token_accuracy": 0.18093040585517883,
"num_tokens": 42821495.0,
"step": 23210
},
{
"entropy": 5.585937976837158,
"epoch": 1.9503885738290276,
"grad_norm": 1.375,
"learning_rate": 0.00046179651344938055,
"loss": 5.2538,
"mean_token_accuracy": 0.17803705632686614,
"num_tokens": 42832219.0,
"step": 23215
},
{
"entropy": 5.524721097946167,
"epoch": 1.9508086536441924,
"grad_norm": 1.3828125,
"learning_rate": 0.00046177981792216234,
"loss": 5.1873,
"mean_token_accuracy": 0.18072481602430343,
"num_tokens": 42841368.0,
"step": 23220
},
{
"entropy": 5.499928712844849,
"epoch": 1.9512287334593572,
"grad_norm": 1.171875,
"learning_rate": 0.00046176311908623574,
"loss": 5.2436,
"mean_token_accuracy": 0.18720989674329758,
"num_tokens": 42850512.0,
"step": 23225
},
{
"entropy": 5.5761292457580565,
"epoch": 1.951648813274522,
"grad_norm": 1.3046875,
"learning_rate": 0.0004617464169418967,
"loss": 5.283,
"mean_token_accuracy": 0.18036168068647385,
"num_tokens": 42860749.0,
"step": 23230
},
{
"entropy": 5.550933122634888,
"epoch": 1.952068893089687,
"grad_norm": 1.1953125,
"learning_rate": 0.00046172971148944106,
"loss": 5.2403,
"mean_token_accuracy": 0.18047946840524673,
"num_tokens": 42869880.0,
"step": 23235
},
{
"entropy": 5.517881441116333,
"epoch": 1.952488972904852,
"grad_norm": 1.3046875,
"learning_rate": 0.00046171300272916465,
"loss": 5.2423,
"mean_token_accuracy": 0.18660469204187394,
"num_tokens": 42879001.0,
"step": 23240
},
{
"entropy": 5.471151638031006,
"epoch": 1.9529090527200168,
"grad_norm": 1.3359375,
"learning_rate": 0.00046169629066136357,
"loss": 5.1603,
"mean_token_accuracy": 0.1858905389904976,
"num_tokens": 42888036.0,
"step": 23245
},
{
"entropy": 5.562799787521362,
"epoch": 1.9533291325351816,
"grad_norm": 1.4140625,
"learning_rate": 0.00046167957528633387,
"loss": 5.271,
"mean_token_accuracy": 0.17963351905345917,
"num_tokens": 42897460.0,
"step": 23250
},
{
"entropy": 5.559833526611328,
"epoch": 1.9537492123503466,
"grad_norm": 1.2578125,
"learning_rate": 0.00046166285660437164,
"loss": 5.281,
"mean_token_accuracy": 0.181427900493145,
"num_tokens": 42907010.0,
"step": 23255
},
{
"entropy": 5.577475738525391,
"epoch": 1.9541692921655116,
"grad_norm": 1.3828125,
"learning_rate": 0.000461646134615773,
"loss": 5.2437,
"mean_token_accuracy": 0.17573917806148528,
"num_tokens": 42915684.0,
"step": 23260
},
{
"entropy": 5.555012941360474,
"epoch": 1.9545893719806764,
"grad_norm": 1.4296875,
"learning_rate": 0.00046162940932083414,
"loss": 5.2444,
"mean_token_accuracy": 0.18114184141159057,
"num_tokens": 42924903.0,
"step": 23265
},
{
"entropy": 5.537017297744751,
"epoch": 1.9550094517958412,
"grad_norm": 1.234375,
"learning_rate": 0.00046161268071985144,
"loss": 5.2532,
"mean_token_accuracy": 0.177630053460598,
"num_tokens": 42935234.0,
"step": 23270
},
{
"entropy": 5.405535125732422,
"epoch": 1.955429531611006,
"grad_norm": 1.4140625,
"learning_rate": 0.0004615959488131212,
"loss": 5.1856,
"mean_token_accuracy": 0.1869486778974533,
"num_tokens": 42944093.0,
"step": 23275
},
{
"entropy": 5.5357222080230715,
"epoch": 1.955849611426171,
"grad_norm": 1.25,
"learning_rate": 0.0004615792136009398,
"loss": 5.2067,
"mean_token_accuracy": 0.17876064479351045,
"num_tokens": 42953504.0,
"step": 23280
},
{
"entropy": 5.583680486679077,
"epoch": 1.956269691241336,
"grad_norm": 1.34375,
"learning_rate": 0.00046156247508360375,
"loss": 5.2223,
"mean_token_accuracy": 0.18174861669540404,
"num_tokens": 42962205.0,
"step": 23285
},
{
"entropy": 5.491861724853516,
"epoch": 1.9566897710565008,
"grad_norm": 1.375,
"learning_rate": 0.0004615457332614095,
"loss": 5.1932,
"mean_token_accuracy": 0.18289777636528015,
"num_tokens": 42971240.0,
"step": 23290
},
{
"entropy": 5.6076373100280765,
"epoch": 1.9571098508716656,
"grad_norm": 1.125,
"learning_rate": 0.00046152898813465353,
"loss": 5.3373,
"mean_token_accuracy": 0.17631391435861588,
"num_tokens": 42981573.0,
"step": 23295
},
{
"entropy": 5.577195310592652,
"epoch": 1.9575299306868303,
"grad_norm": 1.2421875,
"learning_rate": 0.0004615122397036327,
"loss": 5.2601,
"mean_token_accuracy": 0.1730024203658104,
"num_tokens": 42991383.0,
"step": 23300
},
{
"entropy": 5.519239854812622,
"epoch": 1.9579500105019954,
"grad_norm": 1.1796875,
"learning_rate": 0.00046149548796864355,
"loss": 5.2092,
"mean_token_accuracy": 0.17875238209962846,
"num_tokens": 43000029.0,
"step": 23305
},
{
"entropy": 5.564729261398315,
"epoch": 1.9583700903171604,
"grad_norm": 1.28125,
"learning_rate": 0.00046147873292998285,
"loss": 5.294,
"mean_token_accuracy": 0.1774600401520729,
"num_tokens": 43008880.0,
"step": 23310
},
{
"entropy": 5.507511472702026,
"epoch": 1.9587901701323251,
"grad_norm": 1.25,
"learning_rate": 0.0004614619745879475,
"loss": 5.2463,
"mean_token_accuracy": 0.18070561736822127,
"num_tokens": 43017417.0,
"step": 23315
},
{
"entropy": 5.507794284820557,
"epoch": 1.95921024994749,
"grad_norm": 1.3359375,
"learning_rate": 0.0004614452129428342,
"loss": 5.1643,
"mean_token_accuracy": 0.186017145216465,
"num_tokens": 43025738.0,
"step": 23320
},
{
"entropy": 5.596672487258911,
"epoch": 1.959630329762655,
"grad_norm": 1.3828125,
"learning_rate": 0.0004614284479949399,
"loss": 5.2772,
"mean_token_accuracy": 0.18011400997638702,
"num_tokens": 43035485.0,
"step": 23325
},
{
"entropy": 5.60905966758728,
"epoch": 1.96005040957782,
"grad_norm": 1.28125,
"learning_rate": 0.0004614116797445617,
"loss": 5.3051,
"mean_token_accuracy": 0.1812348783016205,
"num_tokens": 43044627.0,
"step": 23330
},
{
"entropy": 5.495553207397461,
"epoch": 1.9604704893929847,
"grad_norm": 1.296875,
"learning_rate": 0.00046139490819199666,
"loss": 5.2232,
"mean_token_accuracy": 0.17964406907558442,
"num_tokens": 43053790.0,
"step": 23335
},
{
"entropy": 5.509322214126587,
"epoch": 1.9608905692081495,
"grad_norm": 1.2734375,
"learning_rate": 0.0004613781333375417,
"loss": 5.208,
"mean_token_accuracy": 0.19156275391578675,
"num_tokens": 43063511.0,
"step": 23340
},
{
"entropy": 5.444080877304077,
"epoch": 1.9613106490233143,
"grad_norm": 1.328125,
"learning_rate": 0.0004613613551814941,
"loss": 5.171,
"mean_token_accuracy": 0.18634732216596603,
"num_tokens": 43072349.0,
"step": 23345
},
{
"entropy": 5.609320974349975,
"epoch": 1.9617307288384793,
"grad_norm": 1.296875,
"learning_rate": 0.0004613445737241511,
"loss": 5.2739,
"mean_token_accuracy": 0.1769397720694542,
"num_tokens": 43081552.0,
"step": 23350
},
{
"entropy": 5.626247262954712,
"epoch": 1.9621508086536443,
"grad_norm": 1.1796875,
"learning_rate": 0.00046132778896581,
"loss": 5.285,
"mean_token_accuracy": 0.18145032376050949,
"num_tokens": 43092321.0,
"step": 23355
},
{
"entropy": 5.535122680664062,
"epoch": 1.9625708884688091,
"grad_norm": 1.28125,
"learning_rate": 0.0004613110009067679,
"loss": 5.2639,
"mean_token_accuracy": 0.17941856980323792,
"num_tokens": 43102326.0,
"step": 23360
},
{
"entropy": 5.5370419979095455,
"epoch": 1.962990968283974,
"grad_norm": 1.234375,
"learning_rate": 0.00046129420954732237,
"loss": 5.3018,
"mean_token_accuracy": 0.1739087775349617,
"num_tokens": 43110895.0,
"step": 23365
},
{
"entropy": 5.5126872062683105,
"epoch": 1.9634110480991387,
"grad_norm": 1.21875,
"learning_rate": 0.0004612774148877709,
"loss": 5.1474,
"mean_token_accuracy": 0.18664878159761428,
"num_tokens": 43119948.0,
"step": 23370
},
{
"entropy": 5.631476259231567,
"epoch": 1.9638311279143037,
"grad_norm": 1.3515625,
"learning_rate": 0.000461260616928411,
"loss": 5.3368,
"mean_token_accuracy": 0.18249114006757736,
"num_tokens": 43129876.0,
"step": 23375
},
{
"entropy": 5.569365978240967,
"epoch": 1.9642512077294687,
"grad_norm": 1.25,
"learning_rate": 0.00046124381566954006,
"loss": 5.2741,
"mean_token_accuracy": 0.18068390488624572,
"num_tokens": 43138831.0,
"step": 23380
},
{
"entropy": 5.587001371383667,
"epoch": 1.9646712875446335,
"grad_norm": 1.3359375,
"learning_rate": 0.00046122701111145587,
"loss": 5.262,
"mean_token_accuracy": 0.1764961764216423,
"num_tokens": 43147338.0,
"step": 23385
},
{
"entropy": 5.517053031921387,
"epoch": 1.9650913673597983,
"grad_norm": 1.15625,
"learning_rate": 0.0004612102032544561,
"loss": 5.2205,
"mean_token_accuracy": 0.17913646399974822,
"num_tokens": 43158587.0,
"step": 23390
},
{
"entropy": 5.558087730407715,
"epoch": 1.9655114471749633,
"grad_norm": 1.3828125,
"learning_rate": 0.00046119339209883846,
"loss": 5.1978,
"mean_token_accuracy": 0.18679269105196,
"num_tokens": 43167610.0,
"step": 23395
},
{
"entropy": 5.41092357635498,
"epoch": 1.965931526990128,
"grad_norm": 1.5859375,
"learning_rate": 0.0004611765776449007,
"loss": 5.1862,
"mean_token_accuracy": 0.1797805055975914,
"num_tokens": 43176374.0,
"step": 23400
},
{
"entropy": 5.513606309890747,
"epoch": 1.966351606805293,
"grad_norm": 1.3125,
"learning_rate": 0.00046115975989294083,
"loss": 5.3466,
"mean_token_accuracy": 0.17182406634092331,
"num_tokens": 43187038.0,
"step": 23405
},
{
"entropy": 5.709100437164307,
"epoch": 1.9667716866204579,
"grad_norm": 1.4140625,
"learning_rate": 0.0004611429388432566,
"loss": 5.3426,
"mean_token_accuracy": 0.17381678372621537,
"num_tokens": 43197868.0,
"step": 23410
},
{
"entropy": 5.61912727355957,
"epoch": 1.9671917664356227,
"grad_norm": 1.328125,
"learning_rate": 0.00046112611449614603,
"loss": 5.3039,
"mean_token_accuracy": 0.1759103298187256,
"num_tokens": 43207675.0,
"step": 23415
},
{
"entropy": 5.56065001487732,
"epoch": 1.9676118462507877,
"grad_norm": 1.3359375,
"learning_rate": 0.0004611092868519072,
"loss": 5.2861,
"mean_token_accuracy": 0.17672661542892457,
"num_tokens": 43217154.0,
"step": 23420
},
{
"entropy": 5.542627668380737,
"epoch": 1.9680319260659527,
"grad_norm": 1.375,
"learning_rate": 0.0004610924559108383,
"loss": 5.2817,
"mean_token_accuracy": 0.18128888458013534,
"num_tokens": 43226912.0,
"step": 23425
},
{
"entropy": 5.575988626480102,
"epoch": 1.9684520058811175,
"grad_norm": 1.21875,
"learning_rate": 0.0004610756216732372,
"loss": 5.2799,
"mean_token_accuracy": 0.179672272503376,
"num_tokens": 43236711.0,
"step": 23430
},
{
"entropy": 5.59436936378479,
"epoch": 1.9688720856962822,
"grad_norm": 1.3359375,
"learning_rate": 0.00046105878413940237,
"loss": 5.2758,
"mean_token_accuracy": 0.18565800338983535,
"num_tokens": 43247005.0,
"step": 23435
},
{
"entropy": 5.381244802474976,
"epoch": 1.969292165511447,
"grad_norm": 1.28125,
"learning_rate": 0.000461041943309632,
"loss": 5.1112,
"mean_token_accuracy": 0.18909744173288345,
"num_tokens": 43255868.0,
"step": 23440
},
{
"entropy": 5.5130963802337645,
"epoch": 1.969712245326612,
"grad_norm": 1.25,
"learning_rate": 0.0004610250991842244,
"loss": 5.2366,
"mean_token_accuracy": 0.1798836976289749,
"num_tokens": 43265727.0,
"step": 23445
},
{
"entropy": 5.596400308609009,
"epoch": 1.970132325141777,
"grad_norm": 1.3828125,
"learning_rate": 0.00046100825176347796,
"loss": 5.2831,
"mean_token_accuracy": 0.1786339282989502,
"num_tokens": 43274530.0,
"step": 23450
},
{
"entropy": 5.447563409805298,
"epoch": 1.9705524049569418,
"grad_norm": 1.3203125,
"learning_rate": 0.000460991401047691,
"loss": 5.1713,
"mean_token_accuracy": 0.17677199393510817,
"num_tokens": 43285130.0,
"step": 23455
},
{
"entropy": 5.534085607528686,
"epoch": 1.9709724847721066,
"grad_norm": 1.3046875,
"learning_rate": 0.0004609745470371622,
"loss": 5.2533,
"mean_token_accuracy": 0.1815151169896126,
"num_tokens": 43293574.0,
"step": 23460
},
{
"entropy": 5.474059581756592,
"epoch": 1.9713925645872716,
"grad_norm": 1.265625,
"learning_rate": 0.0004609576897321902,
"loss": 5.0841,
"mean_token_accuracy": 0.18597435653209687,
"num_tokens": 43301989.0,
"step": 23465
},
{
"entropy": 5.593522310256958,
"epoch": 1.9718126444024364,
"grad_norm": 1.3046875,
"learning_rate": 0.00046094082913307336,
"loss": 5.2717,
"mean_token_accuracy": 0.17969490885734557,
"num_tokens": 43310934.0,
"step": 23470
},
{
"entropy": 5.559579706192016,
"epoch": 1.9722327242176014,
"grad_norm": 1.328125,
"learning_rate": 0.0004609239652401104,
"loss": 5.2053,
"mean_token_accuracy": 0.18085471391677857,
"num_tokens": 43320703.0,
"step": 23475
},
{
"entropy": 5.496149778366089,
"epoch": 1.9726528040327662,
"grad_norm": 1.25,
"learning_rate": 0.00046090709805360027,
"loss": 5.1785,
"mean_token_accuracy": 0.1868622049689293,
"num_tokens": 43329444.0,
"step": 23480
},
{
"entropy": 5.544989204406738,
"epoch": 1.973072883847931,
"grad_norm": 1.3359375,
"learning_rate": 0.0004608902275738416,
"loss": 5.2777,
"mean_token_accuracy": 0.18707772791385652,
"num_tokens": 43337853.0,
"step": 23485
},
{
"entropy": 5.60792179107666,
"epoch": 1.973492963663096,
"grad_norm": 1.3359375,
"learning_rate": 0.0004608733538011333,
"loss": 5.3394,
"mean_token_accuracy": 0.17572330981492995,
"num_tokens": 43347901.0,
"step": 23490
},
{
"entropy": 5.503194141387939,
"epoch": 1.973913043478261,
"grad_norm": 1.2265625,
"learning_rate": 0.0004608564767357741,
"loss": 5.1996,
"mean_token_accuracy": 0.180150805413723,
"num_tokens": 43357358.0,
"step": 23495
},
{
"entropy": 5.548695707321167,
"epoch": 1.9743331232934258,
"grad_norm": 1.234375,
"learning_rate": 0.0004608395963780632,
"loss": 5.3027,
"mean_token_accuracy": 0.1780621290206909,
"num_tokens": 43366749.0,
"step": 23500
},
{
"entropy": 5.518323850631714,
"epoch": 1.9747532031085906,
"grad_norm": 1.375,
"learning_rate": 0.0004608227127282996,
"loss": 5.2541,
"mean_token_accuracy": 0.17648564130067826,
"num_tokens": 43375243.0,
"step": 23505
},
{
"entropy": 5.601040697097778,
"epoch": 1.9751732829237554,
"grad_norm": 1.28125,
"learning_rate": 0.0004608058257867823,
"loss": 5.224,
"mean_token_accuracy": 0.17953041940927505,
"num_tokens": 43383470.0,
"step": 23510
},
{
"entropy": 5.598457956314087,
"epoch": 1.9755933627389204,
"grad_norm": 1.203125,
"learning_rate": 0.0004607889355538105,
"loss": 5.3277,
"mean_token_accuracy": 0.17315026819705964,
"num_tokens": 43393527.0,
"step": 23515
},
{
"entropy": 5.5307587623596195,
"epoch": 1.9760134425540854,
"grad_norm": 1.34375,
"learning_rate": 0.00046077204202968325,
"loss": 5.1993,
"mean_token_accuracy": 0.18025186210870742,
"num_tokens": 43402390.0,
"step": 23520
},
{
"entropy": 5.488351678848266,
"epoch": 1.9764335223692502,
"grad_norm": 1.359375,
"learning_rate": 0.00046075514521470005,
"loss": 5.1978,
"mean_token_accuracy": 0.17698446810245513,
"num_tokens": 43411479.0,
"step": 23525
},
{
"entropy": 5.496196269989014,
"epoch": 1.976853602184415,
"grad_norm": 1.3125,
"learning_rate": 0.00046073824510916005,
"loss": 5.1366,
"mean_token_accuracy": 0.18785955011844635,
"num_tokens": 43420402.0,
"step": 23530
},
{
"entropy": 5.523839664459229,
"epoch": 1.9772736819995798,
"grad_norm": 1.359375,
"learning_rate": 0.00046072134171336267,
"loss": 5.2577,
"mean_token_accuracy": 0.17183273434638976,
"num_tokens": 43429011.0,
"step": 23535
},
{
"entropy": 5.511778926849365,
"epoch": 1.9776937618147448,
"grad_norm": 1.234375,
"learning_rate": 0.0004607044350276074,
"loss": 5.1658,
"mean_token_accuracy": 0.18932248800992965,
"num_tokens": 43438548.0,
"step": 23540
},
{
"entropy": 5.558135509490967,
"epoch": 1.9781138416299098,
"grad_norm": 1.3359375,
"learning_rate": 0.00046068752505219366,
"loss": 5.2699,
"mean_token_accuracy": 0.18133103102445602,
"num_tokens": 43448332.0,
"step": 23545
},
{
"entropy": 5.5836029052734375,
"epoch": 1.9785339214450746,
"grad_norm": 1.328125,
"learning_rate": 0.000460670611787421,
"loss": 5.3268,
"mean_token_accuracy": 0.17510336637496948,
"num_tokens": 43457726.0,
"step": 23550
},
{
"entropy": 5.507944631576538,
"epoch": 1.9789540012602393,
"grad_norm": 1.21875,
"learning_rate": 0.0004606536952335891,
"loss": 5.2476,
"mean_token_accuracy": 0.18158409744501114,
"num_tokens": 43466617.0,
"step": 23555
},
{
"entropy": 5.486865711212158,
"epoch": 1.9793740810754044,
"grad_norm": 1.203125,
"learning_rate": 0.00046063677539099756,
"loss": 5.237,
"mean_token_accuracy": 0.17666604220867158,
"num_tokens": 43476044.0,
"step": 23560
},
{
"entropy": 5.53165340423584,
"epoch": 1.9797941608905694,
"grad_norm": 1.4375,
"learning_rate": 0.00046061985225994616,
"loss": 5.2038,
"mean_token_accuracy": 0.18216431885957718,
"num_tokens": 43485488.0,
"step": 23565
},
{
"entropy": 5.589370965957642,
"epoch": 1.9802142407057342,
"grad_norm": 1.1484375,
"learning_rate": 0.00046060292584073465,
"loss": 5.2293,
"mean_token_accuracy": 0.1830082431435585,
"num_tokens": 43494423.0,
"step": 23570
},
{
"entropy": 5.500078439712524,
"epoch": 1.980634320520899,
"grad_norm": 1.296875,
"learning_rate": 0.00046058599613366287,
"loss": 5.1179,
"mean_token_accuracy": 0.18929339498281478,
"num_tokens": 43502874.0,
"step": 23575
},
{
"entropy": 5.566606569290161,
"epoch": 1.9810544003360637,
"grad_norm": 1.421875,
"learning_rate": 0.0004605690631390308,
"loss": 5.3524,
"mean_token_accuracy": 0.1767713323235512,
"num_tokens": 43512222.0,
"step": 23580
},
{
"entropy": 5.568696212768555,
"epoch": 1.9814744801512287,
"grad_norm": 1.34375,
"learning_rate": 0.0004605521268571382,
"loss": 5.2807,
"mean_token_accuracy": 0.17911923676729202,
"num_tokens": 43521577.0,
"step": 23585
},
{
"entropy": 5.622945070266724,
"epoch": 1.9818945599663937,
"grad_norm": 1.53125,
"learning_rate": 0.00046053518728828534,
"loss": 5.2631,
"mean_token_accuracy": 0.18404691517353058,
"num_tokens": 43529763.0,
"step": 23590
},
{
"entropy": 5.5807476997375485,
"epoch": 1.9823146397815585,
"grad_norm": 1.5546875,
"learning_rate": 0.0004605182444327721,
"loss": 5.2877,
"mean_token_accuracy": 0.17415323853492737,
"num_tokens": 43538663.0,
"step": 23595
},
{
"entropy": 5.452884578704834,
"epoch": 1.9827347195967233,
"grad_norm": 1.3046875,
"learning_rate": 0.0004605012982908987,
"loss": 5.0965,
"mean_token_accuracy": 0.19306595772504806,
"num_tokens": 43547302.0,
"step": 23600
},
{
"entropy": 5.495189428329468,
"epoch": 1.983154799411888,
"grad_norm": 1.28125,
"learning_rate": 0.00046048434886296536,
"loss": 5.3009,
"mean_token_accuracy": 0.1724704310297966,
"num_tokens": 43557222.0,
"step": 23605
},
{
"entropy": 5.5371397018432615,
"epoch": 1.9835748792270531,
"grad_norm": 1.265625,
"learning_rate": 0.0004604673961492722,
"loss": 5.1988,
"mean_token_accuracy": 0.18466370403766633,
"num_tokens": 43566210.0,
"step": 23610
},
{
"entropy": 5.546426439285279,
"epoch": 1.9839949590422181,
"grad_norm": 1.3125,
"learning_rate": 0.00046045044015011975,
"loss": 5.1715,
"mean_token_accuracy": 0.18057484179735184,
"num_tokens": 43576275.0,
"step": 23615
},
{
"entropy": 5.50583062171936,
"epoch": 1.984415038857383,
"grad_norm": 1.328125,
"learning_rate": 0.0004604334808658081,
"loss": 5.2986,
"mean_token_accuracy": 0.17860207259654998,
"num_tokens": 43585480.0,
"step": 23620
},
{
"entropy": 5.5586997985839846,
"epoch": 1.9848351186725477,
"grad_norm": 1.7890625,
"learning_rate": 0.00046041651829663787,
"loss": 5.3196,
"mean_token_accuracy": 0.17786380052566528,
"num_tokens": 43593911.0,
"step": 23625
},
{
"entropy": 5.572823762893677,
"epoch": 1.9852551984877127,
"grad_norm": 1.2734375,
"learning_rate": 0.00046039955244290957,
"loss": 5.2485,
"mean_token_accuracy": 0.1797892764210701,
"num_tokens": 43604029.0,
"step": 23630
},
{
"entropy": 5.581953811645508,
"epoch": 1.9856752783028777,
"grad_norm": 1.3125,
"learning_rate": 0.00046038258330492363,
"loss": 5.2857,
"mean_token_accuracy": 0.18273638635873796,
"num_tokens": 43613248.0,
"step": 23635
},
{
"entropy": 5.553310298919678,
"epoch": 1.9860953581180425,
"grad_norm": 1.265625,
"learning_rate": 0.0004603656108829806,
"loss": 5.2327,
"mean_token_accuracy": 0.18504705727100373,
"num_tokens": 43623232.0,
"step": 23640
},
{
"entropy": 5.582189273834229,
"epoch": 1.9865154379332073,
"grad_norm": 1.40625,
"learning_rate": 0.00046034863517738136,
"loss": 5.2854,
"mean_token_accuracy": 0.17108808904886247,
"num_tokens": 43632999.0,
"step": 23645
},
{
"entropy": 5.525351572036743,
"epoch": 1.986935517748372,
"grad_norm": 1.3125,
"learning_rate": 0.00046033165618842637,
"loss": 5.2611,
"mean_token_accuracy": 0.1772943764925003,
"num_tokens": 43641492.0,
"step": 23650
},
{
"entropy": 5.644568061828613,
"epoch": 1.987355597563537,
"grad_norm": 1.2578125,
"learning_rate": 0.00046031467391641657,
"loss": 5.2464,
"mean_token_accuracy": 0.17870327681303025,
"num_tokens": 43650999.0,
"step": 23655
},
{
"entropy": 5.56144905090332,
"epoch": 1.987775677378702,
"grad_norm": 1.265625,
"learning_rate": 0.0004602976883616527,
"loss": 5.3056,
"mean_token_accuracy": 0.1710178107023239,
"num_tokens": 43660777.0,
"step": 23660
},
{
"entropy": 5.446328592300415,
"epoch": 1.9881957571938669,
"grad_norm": 1.4140625,
"learning_rate": 0.00046028069952443575,
"loss": 5.2281,
"mean_token_accuracy": 0.18412451893091203,
"num_tokens": 43670404.0,
"step": 23665
},
{
"entropy": 5.524843692779541,
"epoch": 1.9886158370090317,
"grad_norm": 1.4140625,
"learning_rate": 0.00046026370740506663,
"loss": 5.1717,
"mean_token_accuracy": 0.1866544798016548,
"num_tokens": 43679183.0,
"step": 23670
},
{
"entropy": 5.503794479370117,
"epoch": 1.9890359168241964,
"grad_norm": 1.3671875,
"learning_rate": 0.0004602467120038463,
"loss": 5.189,
"mean_token_accuracy": 0.18368219584226608,
"num_tokens": 43688080.0,
"step": 23675
},
{
"entropy": 5.512891244888306,
"epoch": 1.9894559966393615,
"grad_norm": 1.3046875,
"learning_rate": 0.00046022971332107586,
"loss": 5.1568,
"mean_token_accuracy": 0.1884944662451744,
"num_tokens": 43697271.0,
"step": 23680
},
{
"entropy": 5.487449502944946,
"epoch": 1.9898760764545265,
"grad_norm": 1.296875,
"learning_rate": 0.00046021271135705637,
"loss": 5.187,
"mean_token_accuracy": 0.18660755902528764,
"num_tokens": 43705541.0,
"step": 23685
},
{
"entropy": 5.545259857177735,
"epoch": 1.9902961562696913,
"grad_norm": 1.4140625,
"learning_rate": 0.0004601957061120891,
"loss": 5.324,
"mean_token_accuracy": 0.17005800753831862,
"num_tokens": 43713701.0,
"step": 23690
},
{
"entropy": 5.496766996383667,
"epoch": 1.990716236084856,
"grad_norm": 1.140625,
"learning_rate": 0.0004601786975864753,
"loss": 5.2497,
"mean_token_accuracy": 0.18931407034397124,
"num_tokens": 43723050.0,
"step": 23695
},
{
"entropy": 5.533319473266602,
"epoch": 1.991136315900021,
"grad_norm": 1.265625,
"learning_rate": 0.0004601616857805162,
"loss": 5.2436,
"mean_token_accuracy": 0.18444208353757857,
"num_tokens": 43733029.0,
"step": 23700
},
{
"entropy": 5.521509599685669,
"epoch": 1.9915563957151858,
"grad_norm": 1.2421875,
"learning_rate": 0.0004601446706945132,
"loss": 5.2072,
"mean_token_accuracy": 0.1786271870136261,
"num_tokens": 43741818.0,
"step": 23705
},
{
"entropy": 5.55954213142395,
"epoch": 1.9919764755303508,
"grad_norm": 1.34375,
"learning_rate": 0.00046012765232876767,
"loss": 5.2388,
"mean_token_accuracy": 0.18061397671699525,
"num_tokens": 43750755.0,
"step": 23710
},
{
"entropy": 5.471874046325683,
"epoch": 1.9923965553455156,
"grad_norm": 1.3515625,
"learning_rate": 0.0004601106306835811,
"loss": 5.1472,
"mean_token_accuracy": 0.18681754022836686,
"num_tokens": 43759135.0,
"step": 23715
},
{
"entropy": 5.48162431716919,
"epoch": 1.9928166351606804,
"grad_norm": 1.2890625,
"learning_rate": 0.0004600936057592551,
"loss": 5.0895,
"mean_token_accuracy": 0.19534640461206437,
"num_tokens": 43767629.0,
"step": 23720
},
{
"entropy": 5.516308116912842,
"epoch": 1.9932367149758454,
"grad_norm": 1.265625,
"learning_rate": 0.00046007657755609113,
"loss": 5.2767,
"mean_token_accuracy": 0.17762725800275803,
"num_tokens": 43776561.0,
"step": 23725
},
{
"entropy": 5.620246505737304,
"epoch": 1.9936567947910104,
"grad_norm": 1.2265625,
"learning_rate": 0.0004600595460743908,
"loss": 5.3561,
"mean_token_accuracy": 0.17195882350206376,
"num_tokens": 43786569.0,
"step": 23730
},
{
"entropy": 5.55762734413147,
"epoch": 1.9940768746061752,
"grad_norm": 1.34375,
"learning_rate": 0.000460042511314456,
"loss": 5.3001,
"mean_token_accuracy": 0.17150032818317412,
"num_tokens": 43795621.0,
"step": 23735
},
{
"entropy": 5.610937213897705,
"epoch": 1.99449695442134,
"grad_norm": 1.296875,
"learning_rate": 0.00046002547327658847,
"loss": 5.2928,
"mean_token_accuracy": 0.18292475789785384,
"num_tokens": 43804728.0,
"step": 23740
},
{
"entropy": 5.494181442260742,
"epoch": 1.9949170342365048,
"grad_norm": 1.390625,
"learning_rate": 0.0004600084319610898,
"loss": 5.2004,
"mean_token_accuracy": 0.18475257158279418,
"num_tokens": 43813495.0,
"step": 23745
},
{
"entropy": 5.460838413238525,
"epoch": 1.9953371140516698,
"grad_norm": 1.2578125,
"learning_rate": 0.0004599913873682621,
"loss": 5.1327,
"mean_token_accuracy": 0.18193830251693727,
"num_tokens": 43823791.0,
"step": 23750
},
{
"entropy": 5.505008554458618,
"epoch": 1.9957571938668348,
"grad_norm": 1.25,
"learning_rate": 0.00045997433949840724,
"loss": 5.2101,
"mean_token_accuracy": 0.1830847218632698,
"num_tokens": 43833904.0,
"step": 23755
},
{
"entropy": 5.601008939743042,
"epoch": 1.9961772736819996,
"grad_norm": 1.46875,
"learning_rate": 0.00045995728835182716,
"loss": 5.2879,
"mean_token_accuracy": 0.18085382878780365,
"num_tokens": 43843430.0,
"step": 23760
},
{
"entropy": 5.58181529045105,
"epoch": 1.9965973534971644,
"grad_norm": 1.3125,
"learning_rate": 0.00045994023392882395,
"loss": 5.2553,
"mean_token_accuracy": 0.18672484606504441,
"num_tokens": 43851405.0,
"step": 23765
},
{
"entropy": 5.523421669006348,
"epoch": 1.9970174333123294,
"grad_norm": 1.3359375,
"learning_rate": 0.00045992317622969977,
"loss": 5.338,
"mean_token_accuracy": 0.1749192401766777,
"num_tokens": 43860034.0,
"step": 23770
},
{
"entropy": 5.503814315795898,
"epoch": 1.9974375131274942,
"grad_norm": 1.4453125,
"learning_rate": 0.00045990611525475675,
"loss": 5.2494,
"mean_token_accuracy": 0.17984959930181504,
"num_tokens": 43869371.0,
"step": 23775
},
{
"entropy": 5.563949346542358,
"epoch": 1.9978575929426592,
"grad_norm": 1.3984375,
"learning_rate": 0.0004598890510042971,
"loss": 5.2979,
"mean_token_accuracy": 0.1790134772658348,
"num_tokens": 43878462.0,
"step": 23780
},
{
"entropy": 5.600993633270264,
"epoch": 1.998277672757824,
"grad_norm": 1.2734375,
"learning_rate": 0.000459871983478623,
"loss": 5.218,
"mean_token_accuracy": 0.17783397436141968,
"num_tokens": 43887435.0,
"step": 23785
},
{
"entropy": 5.5280242443084715,
"epoch": 1.9986977525729888,
"grad_norm": 1.6953125,
"learning_rate": 0.00045985491267803703,
"loss": 5.2788,
"mean_token_accuracy": 0.18128907680511475,
"num_tokens": 43896720.0,
"step": 23790
},
{
"entropy": 5.491922998428345,
"epoch": 1.9991178323881538,
"grad_norm": 1.578125,
"learning_rate": 0.00045983783860284146,
"loss": 5.2788,
"mean_token_accuracy": 0.17401405721902846,
"num_tokens": 43906403.0,
"step": 23795
},
{
"entropy": 5.617269992828369,
"epoch": 1.9995379122033188,
"grad_norm": 1.328125,
"learning_rate": 0.00045982076125333874,
"loss": 5.3166,
"mean_token_accuracy": 0.1729935258626938,
"num_tokens": 43915059.0,
"step": 23800
},
{
"entropy": 5.683682775497436,
"epoch": 1.9999579920184836,
"grad_norm": 1.375,
"learning_rate": 0.00045980368062983147,
"loss": 5.3257,
"mean_token_accuracy": 0.1737267106771469,
"num_tokens": 43925598.0,
"step": 23805
},
{
"entropy": 5.539677884843615,
"epoch": 2.000336063852132,
"grad_norm": 1.3359375,
"learning_rate": 0.0004597865967326221,
"loss": 5.147,
"mean_token_accuracy": 0.1816287057267295,
"num_tokens": 43934471.0,
"step": 23810
},
{
"entropy": 5.509443950653076,
"epoch": 2.0007561436672967,
"grad_norm": 1.4140625,
"learning_rate": 0.00045976950956201325,
"loss": 5.2393,
"mean_token_accuracy": 0.17548753321170807,
"num_tokens": 43944451.0,
"step": 23815
},
{
"entropy": 5.6157149314880375,
"epoch": 2.0011762234824615,
"grad_norm": 1.34375,
"learning_rate": 0.0004597524191183078,
"loss": 5.1858,
"mean_token_accuracy": 0.1888144552707672,
"num_tokens": 43953892.0,
"step": 23820
},
{
"entropy": 5.58864049911499,
"epoch": 2.0015963032976267,
"grad_norm": 1.2265625,
"learning_rate": 0.0004597353254018082,
"loss": 5.2729,
"mean_token_accuracy": 0.1762720078229904,
"num_tokens": 43963155.0,
"step": 23825
},
{
"entropy": 5.529189825057983,
"epoch": 2.0020163831127915,
"grad_norm": 1.359375,
"learning_rate": 0.0004597182284128177,
"loss": 5.1273,
"mean_token_accuracy": 0.1903434917330742,
"num_tokens": 43972468.0,
"step": 23830
},
{
"entropy": 5.662293243408203,
"epoch": 2.0024364629279563,
"grad_norm": 1.296875,
"learning_rate": 0.0004597011281516387,
"loss": 5.3684,
"mean_token_accuracy": 0.17243614196777343,
"num_tokens": 43982709.0,
"step": 23835
},
{
"entropy": 5.5285533428192135,
"epoch": 2.002856542743121,
"grad_norm": 1.2109375,
"learning_rate": 0.00045968402461857435,
"loss": 5.1527,
"mean_token_accuracy": 0.1910782963037491,
"num_tokens": 43992607.0,
"step": 23840
},
{
"entropy": 5.488125467300415,
"epoch": 2.003276622558286,
"grad_norm": 1.203125,
"learning_rate": 0.00045966691781392763,
"loss": 5.1287,
"mean_token_accuracy": 0.1822839632630348,
"num_tokens": 44001265.0,
"step": 23845
},
{
"entropy": 5.5621116161346436,
"epoch": 2.003696702373451,
"grad_norm": 1.3046875,
"learning_rate": 0.00045964980773800156,
"loss": 5.3353,
"mean_token_accuracy": 0.1766609862446785,
"num_tokens": 44010440.0,
"step": 23850
},
{
"entropy": 5.57194242477417,
"epoch": 2.004116782188616,
"grad_norm": 1.2890625,
"learning_rate": 0.0004596326943910993,
"loss": 5.1618,
"mean_token_accuracy": 0.18098720908164978,
"num_tokens": 44020237.0,
"step": 23855
},
{
"entropy": 5.537049674987793,
"epoch": 2.0045368620037807,
"grad_norm": 1.2578125,
"learning_rate": 0.00045961557777352376,
"loss": 5.2545,
"mean_token_accuracy": 0.17600449174642563,
"num_tokens": 44028976.0,
"step": 23860
},
{
"entropy": 5.562431144714355,
"epoch": 2.0049569418189455,
"grad_norm": 1.265625,
"learning_rate": 0.00045959845788557844,
"loss": 5.2331,
"mean_token_accuracy": 0.18209305554628372,
"num_tokens": 44038186.0,
"step": 23865
},
{
"entropy": 5.582473087310791,
"epoch": 2.0053770216341107,
"grad_norm": 1.3203125,
"learning_rate": 0.0004595813347275665,
"loss": 5.1801,
"mean_token_accuracy": 0.17700323611497878,
"num_tokens": 44047780.0,
"step": 23870
},
{
"entropy": 5.519444274902344,
"epoch": 2.0057971014492755,
"grad_norm": 1.40625,
"learning_rate": 0.0004595642082997912,
"loss": 5.1509,
"mean_token_accuracy": 0.18667900562286377,
"num_tokens": 44056678.0,
"step": 23875
},
{
"entropy": 5.481790065765381,
"epoch": 2.0062171812644403,
"grad_norm": 1.2734375,
"learning_rate": 0.000459547078602556,
"loss": 5.183,
"mean_token_accuracy": 0.1797319084405899,
"num_tokens": 44066428.0,
"step": 23880
},
{
"entropy": 5.525395202636719,
"epoch": 2.006637261079605,
"grad_norm": 1.2890625,
"learning_rate": 0.00045952994563616434,
"loss": 5.1714,
"mean_token_accuracy": 0.18301728814840318,
"num_tokens": 44075285.0,
"step": 23885
},
{
"entropy": 5.555371475219727,
"epoch": 2.00705734089477,
"grad_norm": 1.515625,
"learning_rate": 0.0004595128094009197,
"loss": 5.1823,
"mean_token_accuracy": 0.1789303481578827,
"num_tokens": 44084333.0,
"step": 23890
},
{
"entropy": 5.535375690460205,
"epoch": 2.007477420709935,
"grad_norm": 1.2890625,
"learning_rate": 0.0004594956698971256,
"loss": 5.1915,
"mean_token_accuracy": 0.1770729437470436,
"num_tokens": 44093504.0,
"step": 23895
},
{
"entropy": 5.544629383087158,
"epoch": 2.0078975005251,
"grad_norm": 1.21875,
"learning_rate": 0.0004594785271250858,
"loss": 5.2156,
"mean_token_accuracy": 0.17561513483524321,
"num_tokens": 44102887.0,
"step": 23900
},
{
"entropy": 5.505322599411011,
"epoch": 2.0083175803402646,
"grad_norm": 1.46875,
"learning_rate": 0.0004594613810851039,
"loss": 5.1887,
"mean_token_accuracy": 0.1797704353928566,
"num_tokens": 44113074.0,
"step": 23905
},
{
"entropy": 5.491182327270508,
"epoch": 2.0087376601554294,
"grad_norm": 1.4453125,
"learning_rate": 0.00045944423177748353,
"loss": 5.2044,
"mean_token_accuracy": 0.1880872830748558,
"num_tokens": 44122557.0,
"step": 23910
},
{
"entropy": 5.5398579120635985,
"epoch": 2.009157739970594,
"grad_norm": 1.2421875,
"learning_rate": 0.00045942707920252864,
"loss": 5.2387,
"mean_token_accuracy": 0.17462565749883652,
"num_tokens": 44130198.0,
"step": 23915
},
{
"entropy": 5.598012161254883,
"epoch": 2.0095778197857594,
"grad_norm": 1.390625,
"learning_rate": 0.000459409923360543,
"loss": 5.1639,
"mean_token_accuracy": 0.19009662419557571,
"num_tokens": 44139267.0,
"step": 23920
},
{
"entropy": 5.5575823307037355,
"epoch": 2.0099978996009242,
"grad_norm": 1.15625,
"learning_rate": 0.0004593927642518305,
"loss": 5.2349,
"mean_token_accuracy": 0.1753944844007492,
"num_tokens": 44149620.0,
"step": 23925
},
{
"entropy": 5.455867528915405,
"epoch": 2.010417979416089,
"grad_norm": 1.2421875,
"learning_rate": 0.0004593756018766951,
"loss": 5.0999,
"mean_token_accuracy": 0.18190970569849013,
"num_tokens": 44158678.0,
"step": 23930
},
{
"entropy": 5.462258434295654,
"epoch": 2.010838059231254,
"grad_norm": 1.3125,
"learning_rate": 0.00045935843623544093,
"loss": 5.0803,
"mean_token_accuracy": 0.18610764145851136,
"num_tokens": 44167376.0,
"step": 23935
},
{
"entropy": 5.561698961257934,
"epoch": 2.011258139046419,
"grad_norm": 1.234375,
"learning_rate": 0.0004593412673283719,
"loss": 5.2036,
"mean_token_accuracy": 0.18106955736875535,
"num_tokens": 44176001.0,
"step": 23940
},
{
"entropy": 5.617484045028687,
"epoch": 2.011678218861584,
"grad_norm": 1.2578125,
"learning_rate": 0.00045932409515579226,
"loss": 5.24,
"mean_token_accuracy": 0.1776907339692116,
"num_tokens": 44185132.0,
"step": 23945
},
{
"entropy": 5.500185537338257,
"epoch": 2.0120982986767486,
"grad_norm": 1.296875,
"learning_rate": 0.00045930691971800627,
"loss": 5.2007,
"mean_token_accuracy": 0.18316864222288132,
"num_tokens": 44193256.0,
"step": 23950
},
{
"entropy": 5.621542119979859,
"epoch": 2.0125183784919134,
"grad_norm": 1.4609375,
"learning_rate": 0.00045928974101531805,
"loss": 5.2773,
"mean_token_accuracy": 0.17884039282798767,
"num_tokens": 44202884.0,
"step": 23955
},
{
"entropy": 5.615364694595337,
"epoch": 2.012938458307078,
"grad_norm": 1.2265625,
"learning_rate": 0.0004592725590480319,
"loss": 5.2623,
"mean_token_accuracy": 0.17376187741756438,
"num_tokens": 44212826.0,
"step": 23960
},
{
"entropy": 5.549553871154785,
"epoch": 2.0133585381222434,
"grad_norm": 1.421875,
"learning_rate": 0.0004592553738164524,
"loss": 5.2229,
"mean_token_accuracy": 0.17238864451646804,
"num_tokens": 44222369.0,
"step": 23965
},
{
"entropy": 5.503224897384643,
"epoch": 2.013778617937408,
"grad_norm": 1.46875,
"learning_rate": 0.0004592381853208837,
"loss": 5.1298,
"mean_token_accuracy": 0.18299996107816696,
"num_tokens": 44230964.0,
"step": 23970
},
{
"entropy": 5.571483898162842,
"epoch": 2.014198697752573,
"grad_norm": 1.3125,
"learning_rate": 0.0004592209935616304,
"loss": 5.214,
"mean_token_accuracy": 0.18138472288846968,
"num_tokens": 44240199.0,
"step": 23975
},
{
"entropy": 5.563494634628296,
"epoch": 2.0146187775677378,
"grad_norm": 1.515625,
"learning_rate": 0.0004592037985389971,
"loss": 5.1698,
"mean_token_accuracy": 0.1851072758436203,
"num_tokens": 44249857.0,
"step": 23980
},
{
"entropy": 5.48805832862854,
"epoch": 2.0150388573829026,
"grad_norm": 1.3984375,
"learning_rate": 0.0004591866002532885,
"loss": 5.1619,
"mean_token_accuracy": 0.18001382797956467,
"num_tokens": 44258364.0,
"step": 23985
},
{
"entropy": 5.449322748184204,
"epoch": 2.015458937198068,
"grad_norm": 1.3203125,
"learning_rate": 0.00045916939870480896,
"loss": 5.0948,
"mean_token_accuracy": 0.18665988743305206,
"num_tokens": 44267473.0,
"step": 23990
},
{
"entropy": 5.540905570983886,
"epoch": 2.0158790170132326,
"grad_norm": 1.328125,
"learning_rate": 0.00045915219389386336,
"loss": 5.1238,
"mean_token_accuracy": 0.18890341967344285,
"num_tokens": 44276665.0,
"step": 23995
},
{
"entropy": 5.5050639629364015,
"epoch": 2.0162990968283974,
"grad_norm": 1.2578125,
"learning_rate": 0.0004591349858207565,
"loss": 5.1866,
"mean_token_accuracy": 0.18196779787540435,
"num_tokens": 44285928.0,
"step": 24000
},
{
"epoch": 2.0162990968283974,
"eval_entropy": 5.33277647185443,
"eval_loss": 5.301941394805908,
"eval_mean_token_accuracy": 0.18649984510414888,
"eval_num_tokens": 44285928.0,
"eval_runtime": 27.4266,
"eval_samples_per_second": 1362.398,
"eval_steps_per_second": 170.309,
"step": 24000
}
],
"logging_steps": 5,
"max_steps": 119020,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 3000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.4819220631552e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}