Files
swe-latn-100mb-after-ppt-sh…/checkpoint-3000/trainer_state.json
ModelHub XC 380cd5d987 初始化项目,由ModelHub XC社区提供模型
Model: fpadovani/swe-latn-100mb-after-ppt-shuff-dyck-100mb-ckpt500_seed3407
Source: Original Platform
2026-06-30 03:57:20 +08:00

6046 lines
164 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2520478890989288,
"eval_steps": 3000,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 4.791903686523438,
"epoch": 0.0004200798151648813,
"grad_norm": 14.6875,
"learning_rate": 2e-06,
"loss": 14.3853,
"mean_token_accuracy": 0.0001464128843508661,
"num_tokens": 8348.0,
"step": 5
},
{
"entropy": 4.770248889923096,
"epoch": 0.0008401596303297626,
"grad_norm": 13.5,
"learning_rate": 4.5e-06,
"loss": 14.3427,
"mean_token_accuracy": 0.00011325028026476502,
"num_tokens": 17465.0,
"step": 10
},
{
"entropy": 4.853783750534058,
"epoch": 0.001260239445494644,
"grad_norm": 19.875,
"learning_rate": 7e-06,
"loss": 14.1261,
"mean_token_accuracy": 0.00010341261513531208,
"num_tokens": 26627.0,
"step": 15
},
{
"entropy": 5.086610746383667,
"epoch": 0.0016803192606595252,
"grad_norm": 28.375,
"learning_rate": 9.5e-06,
"loss": 13.5157,
"mean_token_accuracy": 0.0,
"num_tokens": 36069.0,
"step": 20
},
{
"entropy": 7.3999251365661625,
"epoch": 0.002100399075824407,
"grad_norm": 12.0625,
"learning_rate": 1.2e-05,
"loss": 11.7927,
"mean_token_accuracy": 0.0,
"num_tokens": 44967.0,
"step": 25
},
{
"entropy": 10.45841064453125,
"epoch": 0.002520478890989288,
"grad_norm": 3.015625,
"learning_rate": 1.4500000000000002e-05,
"loss": 10.7852,
"mean_token_accuracy": 9.009009227156639e-05,
"num_tokens": 55132.0,
"step": 30
},
{
"entropy": 10.697011375427246,
"epoch": 0.0029405587061541692,
"grad_norm": 2.75,
"learning_rate": 1.7000000000000003e-05,
"loss": 10.5553,
"mean_token_accuracy": 0.006346415734151378,
"num_tokens": 65141.0,
"step": 35
},
{
"entropy": 10.697507858276367,
"epoch": 0.0033606385213190504,
"grad_norm": 2.40625,
"learning_rate": 1.95e-05,
"loss": 10.2781,
"mean_token_accuracy": 0.039602359384298326,
"num_tokens": 74007.0,
"step": 40
},
{
"entropy": 10.668922233581544,
"epoch": 0.003780718336483932,
"grad_norm": 2.296875,
"learning_rate": 2.2e-05,
"loss": 9.9863,
"mean_token_accuracy": 0.04357385709881782,
"num_tokens": 83736.0,
"step": 45
},
{
"entropy": 10.582208251953125,
"epoch": 0.004200798151648814,
"grad_norm": 2.09375,
"learning_rate": 2.4500000000000003e-05,
"loss": 9.7817,
"mean_token_accuracy": 0.044408387318253514,
"num_tokens": 92525.0,
"step": 50
},
{
"entropy": 10.561583709716796,
"epoch": 0.004620877966813695,
"grad_norm": 2.109375,
"learning_rate": 2.7e-05,
"loss": 9.6616,
"mean_token_accuracy": 0.042681990377604964,
"num_tokens": 102015.0,
"step": 55
},
{
"entropy": 10.598667430877686,
"epoch": 0.005040957781978576,
"grad_norm": 1.8671875,
"learning_rate": 2.95e-05,
"loss": 9.6152,
"mean_token_accuracy": 0.03954915180802345,
"num_tokens": 110887.0,
"step": 60
},
{
"entropy": 10.602967929840087,
"epoch": 0.005461037597143457,
"grad_norm": 2.0,
"learning_rate": 3.2e-05,
"loss": 9.5219,
"mean_token_accuracy": 0.04232911877334118,
"num_tokens": 120442.0,
"step": 65
},
{
"entropy": 10.583982849121094,
"epoch": 0.0058811174123083385,
"grad_norm": 2.0625,
"learning_rate": 3.4500000000000005e-05,
"loss": 9.4106,
"mean_token_accuracy": 0.041194649040699007,
"num_tokens": 129297.0,
"step": 70
},
{
"entropy": 10.561800956726074,
"epoch": 0.00630119722747322,
"grad_norm": 1.921875,
"learning_rate": 3.7e-05,
"loss": 9.3205,
"mean_token_accuracy": 0.04238409399986267,
"num_tokens": 138305.0,
"step": 75
},
{
"entropy": 10.55169849395752,
"epoch": 0.006721277042638101,
"grad_norm": 2.03125,
"learning_rate": 3.95e-05,
"loss": 9.2587,
"mean_token_accuracy": 0.04415187537670136,
"num_tokens": 147640.0,
"step": 80
},
{
"entropy": 10.53596887588501,
"epoch": 0.007141356857802983,
"grad_norm": 1.9140625,
"learning_rate": 4.2000000000000004e-05,
"loss": 9.156,
"mean_token_accuracy": 0.05057476349174976,
"num_tokens": 157633.0,
"step": 85
},
{
"entropy": 10.498776817321778,
"epoch": 0.007561436672967864,
"grad_norm": 1.6796875,
"learning_rate": 4.45e-05,
"loss": 9.1055,
"mean_token_accuracy": 0.043933939374983313,
"num_tokens": 167984.0,
"step": 90
},
{
"entropy": 10.410062026977538,
"epoch": 0.007981516488132745,
"grad_norm": 1.84375,
"learning_rate": 4.7000000000000004e-05,
"loss": 8.9796,
"mean_token_accuracy": 0.058869444206357,
"num_tokens": 176984.0,
"step": 95
},
{
"entropy": 10.331822967529297,
"epoch": 0.008401596303297627,
"grad_norm": 2.265625,
"learning_rate": 4.9500000000000004e-05,
"loss": 8.8274,
"mean_token_accuracy": 0.05482011772692204,
"num_tokens": 185931.0,
"step": 100
},
{
"entropy": 10.239299774169922,
"epoch": 0.008821676118462508,
"grad_norm": 2.265625,
"learning_rate": 5.2e-05,
"loss": 8.7979,
"mean_token_accuracy": 0.05091267079114914,
"num_tokens": 195065.0,
"step": 105
},
{
"entropy": 10.274462223052979,
"epoch": 0.00924175593362739,
"grad_norm": 1.609375,
"learning_rate": 5.45e-05,
"loss": 8.653,
"mean_token_accuracy": 0.053716998919844626,
"num_tokens": 203687.0,
"step": 110
},
{
"entropy": 10.133169841766357,
"epoch": 0.00966183574879227,
"grad_norm": 1.4453125,
"learning_rate": 5.7e-05,
"loss": 8.5503,
"mean_token_accuracy": 0.057077201455831526,
"num_tokens": 212847.0,
"step": 115
},
{
"entropy": 10.040187549591064,
"epoch": 0.010081915563957152,
"grad_norm": 1.6796875,
"learning_rate": 5.9499999999999996e-05,
"loss": 8.4092,
"mean_token_accuracy": 0.05596162416040897,
"num_tokens": 222593.0,
"step": 120
},
{
"entropy": 9.901103496551514,
"epoch": 0.010501995379122032,
"grad_norm": 1.4140625,
"learning_rate": 6.2e-05,
"loss": 8.2565,
"mean_token_accuracy": 0.05481334701180458,
"num_tokens": 231174.0,
"step": 125
},
{
"entropy": 9.70338020324707,
"epoch": 0.010922075194286915,
"grad_norm": 1.234375,
"learning_rate": 6.450000000000001e-05,
"loss": 8.1158,
"mean_token_accuracy": 0.058756759762763976,
"num_tokens": 239833.0,
"step": 130
},
{
"entropy": 9.529671382904052,
"epoch": 0.011342155009451797,
"grad_norm": 1.1640625,
"learning_rate": 6.7e-05,
"loss": 8.1187,
"mean_token_accuracy": 0.05617605000734329,
"num_tokens": 248794.0,
"step": 135
},
{
"entropy": 9.257420444488526,
"epoch": 0.011762234824616677,
"grad_norm": 1.09375,
"learning_rate": 6.950000000000001e-05,
"loss": 8.0561,
"mean_token_accuracy": 0.056496378034353256,
"num_tokens": 257123.0,
"step": 140
},
{
"entropy": 9.085350036621094,
"epoch": 0.012182314639781559,
"grad_norm": 1.0,
"learning_rate": 7.2e-05,
"loss": 7.7656,
"mean_token_accuracy": 0.05573978051543236,
"num_tokens": 266088.0,
"step": 145
},
{
"entropy": 8.806443977355958,
"epoch": 0.01260239445494644,
"grad_norm": 0.81640625,
"learning_rate": 7.45e-05,
"loss": 7.9249,
"mean_token_accuracy": 0.05259395204484463,
"num_tokens": 276074.0,
"step": 150
},
{
"entropy": 8.64165735244751,
"epoch": 0.013022474270111321,
"grad_norm": 0.82421875,
"learning_rate": 7.7e-05,
"loss": 7.8106,
"mean_token_accuracy": 0.0555482916533947,
"num_tokens": 285280.0,
"step": 155
},
{
"entropy": 8.47070608139038,
"epoch": 0.013442554085276202,
"grad_norm": 0.80859375,
"learning_rate": 7.950000000000001e-05,
"loss": 7.8078,
"mean_token_accuracy": 0.05328563526272774,
"num_tokens": 296115.0,
"step": 160
},
{
"entropy": 8.336323738098145,
"epoch": 0.013862633900441084,
"grad_norm": 0.83984375,
"learning_rate": 8.2e-05,
"loss": 7.6742,
"mean_token_accuracy": 0.055481255799531934,
"num_tokens": 305483.0,
"step": 165
},
{
"entropy": 8.235133361816406,
"epoch": 0.014282713715605966,
"grad_norm": 0.82421875,
"learning_rate": 8.450000000000001e-05,
"loss": 7.6978,
"mean_token_accuracy": 0.05532576628029347,
"num_tokens": 314000.0,
"step": 170
},
{
"entropy": 8.127510929107666,
"epoch": 0.014702793530770846,
"grad_norm": 0.96484375,
"learning_rate": 8.7e-05,
"loss": 7.6891,
"mean_token_accuracy": 0.05845912620425224,
"num_tokens": 323667.0,
"step": 175
},
{
"entropy": 8.085121822357177,
"epoch": 0.015122873345935728,
"grad_norm": 1.3359375,
"learning_rate": 8.95e-05,
"loss": 7.6917,
"mean_token_accuracy": 0.05741722546517849,
"num_tokens": 332695.0,
"step": 180
},
{
"entropy": 8.051828241348266,
"epoch": 0.015542953161100609,
"grad_norm": 1.1640625,
"learning_rate": 9.2e-05,
"loss": 7.5384,
"mean_token_accuracy": 0.059001700952649117,
"num_tokens": 342428.0,
"step": 185
},
{
"entropy": 8.029651880264282,
"epoch": 0.01596303297626549,
"grad_norm": 0.76953125,
"learning_rate": 9.45e-05,
"loss": 7.6438,
"mean_token_accuracy": 0.05830682367086411,
"num_tokens": 353587.0,
"step": 190
},
{
"entropy": 8.02134084701538,
"epoch": 0.01638311279143037,
"grad_norm": 1.2265625,
"learning_rate": 9.7e-05,
"loss": 7.5846,
"mean_token_accuracy": 0.06251729987561702,
"num_tokens": 362997.0,
"step": 195
},
{
"entropy": 7.912688684463501,
"epoch": 0.016803192606595255,
"grad_norm": 1.0703125,
"learning_rate": 9.95e-05,
"loss": 7.6088,
"mean_token_accuracy": 0.06746274717152119,
"num_tokens": 372346.0,
"step": 200
},
{
"entropy": 8.052432775497437,
"epoch": 0.017223272421760135,
"grad_norm": 1.234375,
"learning_rate": 0.000102,
"loss": 7.4944,
"mean_token_accuracy": 0.06457214206457138,
"num_tokens": 381575.0,
"step": 205
},
{
"entropy": 7.941053867340088,
"epoch": 0.017643352236925015,
"grad_norm": 1.1171875,
"learning_rate": 0.00010449999999999999,
"loss": 7.561,
"mean_token_accuracy": 0.06631600968539715,
"num_tokens": 390706.0,
"step": 210
},
{
"entropy": 7.876999855041504,
"epoch": 0.018063432052089896,
"grad_norm": 0.9140625,
"learning_rate": 0.000107,
"loss": 7.5529,
"mean_token_accuracy": 0.0665743712335825,
"num_tokens": 400000.0,
"step": 215
},
{
"entropy": 7.916263389587402,
"epoch": 0.01848351186725478,
"grad_norm": 0.9765625,
"learning_rate": 0.0001095,
"loss": 7.5184,
"mean_token_accuracy": 0.06940894797444344,
"num_tokens": 409447.0,
"step": 220
},
{
"entropy": 7.911146783828736,
"epoch": 0.01890359168241966,
"grad_norm": 1.0859375,
"learning_rate": 0.000112,
"loss": 7.4837,
"mean_token_accuracy": 0.06915329694747925,
"num_tokens": 418417.0,
"step": 225
},
{
"entropy": 7.839998531341553,
"epoch": 0.01932367149758454,
"grad_norm": 1.2265625,
"learning_rate": 0.0001145,
"loss": 7.4502,
"mean_token_accuracy": 0.06963084377348423,
"num_tokens": 427619.0,
"step": 230
},
{
"entropy": 7.8366899490356445,
"epoch": 0.019743751312749424,
"grad_norm": 1.15625,
"learning_rate": 0.00011700000000000001,
"loss": 7.5047,
"mean_token_accuracy": 0.06452232897281647,
"num_tokens": 437931.0,
"step": 235
},
{
"entropy": 7.8866432189941404,
"epoch": 0.020163831127914304,
"grad_norm": 1.0234375,
"learning_rate": 0.00011949999999999999,
"loss": 7.546,
"mean_token_accuracy": 0.06933673955500126,
"num_tokens": 447595.0,
"step": 240
},
{
"entropy": 7.882558917999267,
"epoch": 0.020583910943079185,
"grad_norm": 1.03125,
"learning_rate": 0.000122,
"loss": 7.4174,
"mean_token_accuracy": 0.06523367092013359,
"num_tokens": 457062.0,
"step": 245
},
{
"entropy": 7.839541244506836,
"epoch": 0.021003990758244065,
"grad_norm": 1.6796875,
"learning_rate": 0.0001245,
"loss": 7.5086,
"mean_token_accuracy": 0.06696470156311989,
"num_tokens": 466191.0,
"step": 250
},
{
"entropy": 7.8005016326904295,
"epoch": 0.02142407057340895,
"grad_norm": 1.2265625,
"learning_rate": 0.000127,
"loss": 7.451,
"mean_token_accuracy": 0.06885578371584415,
"num_tokens": 475693.0,
"step": 255
},
{
"entropy": 7.889404010772705,
"epoch": 0.02184415038857383,
"grad_norm": 1.0625,
"learning_rate": 0.0001295,
"loss": 7.4804,
"mean_token_accuracy": 0.06973730027675629,
"num_tokens": 485173.0,
"step": 260
},
{
"entropy": 7.815570974349976,
"epoch": 0.02226423020373871,
"grad_norm": 1.1640625,
"learning_rate": 0.000132,
"loss": 7.3829,
"mean_token_accuracy": 0.07314893454313279,
"num_tokens": 493985.0,
"step": 265
},
{
"entropy": 7.816596174240113,
"epoch": 0.022684310018903593,
"grad_norm": 1.5546875,
"learning_rate": 0.00013450000000000002,
"loss": 7.4224,
"mean_token_accuracy": 0.07099288031458854,
"num_tokens": 502837.0,
"step": 270
},
{
"entropy": 7.785638856887817,
"epoch": 0.023104389834068473,
"grad_norm": 1.1796875,
"learning_rate": 0.00013700000000000002,
"loss": 7.3589,
"mean_token_accuracy": 0.07525993324816227,
"num_tokens": 511503.0,
"step": 275
},
{
"entropy": 7.709826803207397,
"epoch": 0.023524469649233354,
"grad_norm": 1.0,
"learning_rate": 0.0001395,
"loss": 7.5429,
"mean_token_accuracy": 0.07307531610131264,
"num_tokens": 521499.0,
"step": 280
},
{
"entropy": 7.790460062026978,
"epoch": 0.023944549464398234,
"grad_norm": 1.234375,
"learning_rate": 0.00014199999999999998,
"loss": 7.3077,
"mean_token_accuracy": 0.07697631418704987,
"num_tokens": 530067.0,
"step": 285
},
{
"entropy": 7.686764717102051,
"epoch": 0.024364629279563118,
"grad_norm": 1.03125,
"learning_rate": 0.0001445,
"loss": 7.3034,
"mean_token_accuracy": 0.0773412711918354,
"num_tokens": 538559.0,
"step": 290
},
{
"entropy": 7.772013425827026,
"epoch": 0.024784709094728,
"grad_norm": 1.15625,
"learning_rate": 0.000147,
"loss": 7.4929,
"mean_token_accuracy": 0.07045175209641456,
"num_tokens": 547288.0,
"step": 295
},
{
"entropy": 7.7699915885925295,
"epoch": 0.02520478890989288,
"grad_norm": 0.91796875,
"learning_rate": 0.0001495,
"loss": 7.3747,
"mean_token_accuracy": 0.07021207436919212,
"num_tokens": 557269.0,
"step": 300
},
{
"entropy": 7.7625007152557375,
"epoch": 0.025624868725057762,
"grad_norm": 1.0625,
"learning_rate": 0.000152,
"loss": 7.3902,
"mean_token_accuracy": 0.07074716240167618,
"num_tokens": 567280.0,
"step": 305
},
{
"entropy": 7.732419967651367,
"epoch": 0.026044948540222643,
"grad_norm": 1.296875,
"learning_rate": 0.00015450000000000001,
"loss": 7.2546,
"mean_token_accuracy": 0.07402584217488765,
"num_tokens": 576609.0,
"step": 310
},
{
"entropy": 7.543032264709472,
"epoch": 0.026465028355387523,
"grad_norm": 1.3203125,
"learning_rate": 0.000157,
"loss": 7.1466,
"mean_token_accuracy": 0.08402636647224426,
"num_tokens": 586053.0,
"step": 315
},
{
"entropy": 7.557563781738281,
"epoch": 0.026885108170552403,
"grad_norm": 1.5703125,
"learning_rate": 0.0001595,
"loss": 7.353,
"mean_token_accuracy": 0.07976382523775101,
"num_tokens": 594649.0,
"step": 320
},
{
"entropy": 7.672474765777588,
"epoch": 0.027305187985717287,
"grad_norm": 1.3828125,
"learning_rate": 0.000162,
"loss": 7.299,
"mean_token_accuracy": 0.0721965666860342,
"num_tokens": 603445.0,
"step": 325
},
{
"entropy": 7.649927425384521,
"epoch": 0.027725267800882167,
"grad_norm": 1.21875,
"learning_rate": 0.00016450000000000001,
"loss": 7.3816,
"mean_token_accuracy": 0.07200038619339466,
"num_tokens": 613611.0,
"step": 330
},
{
"entropy": 7.817860317230225,
"epoch": 0.028145347616047048,
"grad_norm": 1.8828125,
"learning_rate": 0.00016700000000000002,
"loss": 7.5608,
"mean_token_accuracy": 0.07428344339132309,
"num_tokens": 623024.0,
"step": 335
},
{
"entropy": 7.615134525299072,
"epoch": 0.02856542743121193,
"grad_norm": 1.140625,
"learning_rate": 0.00016950000000000003,
"loss": 7.241,
"mean_token_accuracy": 0.08163552284240723,
"num_tokens": 631624.0,
"step": 340
},
{
"entropy": 7.6425103664398195,
"epoch": 0.028985507246376812,
"grad_norm": 1.25,
"learning_rate": 0.00017199999999999998,
"loss": 7.1878,
"mean_token_accuracy": 0.08015607595443726,
"num_tokens": 640473.0,
"step": 345
},
{
"entropy": 7.680424308776855,
"epoch": 0.029405587061541692,
"grad_norm": 1.171875,
"learning_rate": 0.00017449999999999999,
"loss": 7.3678,
"mean_token_accuracy": 0.07545013800263405,
"num_tokens": 649692.0,
"step": 350
},
{
"entropy": 7.654160070419311,
"epoch": 0.029825666876706573,
"grad_norm": 1.640625,
"learning_rate": 0.000177,
"loss": 7.319,
"mean_token_accuracy": 0.07563115619122981,
"num_tokens": 658236.0,
"step": 355
},
{
"entropy": 7.521349573135376,
"epoch": 0.030245746691871456,
"grad_norm": 0.96875,
"learning_rate": 0.0001795,
"loss": 7.1117,
"mean_token_accuracy": 0.08184256851673126,
"num_tokens": 667175.0,
"step": 360
},
{
"entropy": 7.665167379379272,
"epoch": 0.030665826507036337,
"grad_norm": 1.015625,
"learning_rate": 0.000182,
"loss": 7.3978,
"mean_token_accuracy": 0.07722308114171028,
"num_tokens": 676456.0,
"step": 365
},
{
"entropy": 7.747646379470825,
"epoch": 0.031085906322201217,
"grad_norm": 1.0078125,
"learning_rate": 0.0001845,
"loss": 7.3621,
"mean_token_accuracy": 0.07609389498829841,
"num_tokens": 686881.0,
"step": 370
},
{
"entropy": 7.486468362808227,
"epoch": 0.0315059861373661,
"grad_norm": 0.921875,
"learning_rate": 0.000187,
"loss": 7.1472,
"mean_token_accuracy": 0.08116972967982292,
"num_tokens": 696045.0,
"step": 375
},
{
"entropy": 7.507930612564087,
"epoch": 0.03192606595253098,
"grad_norm": 1.3046875,
"learning_rate": 0.0001895,
"loss": 7.1262,
"mean_token_accuracy": 0.08457142487168312,
"num_tokens": 704729.0,
"step": 380
},
{
"entropy": 7.4088475704193115,
"epoch": 0.032346145767695865,
"grad_norm": 0.92578125,
"learning_rate": 0.000192,
"loss": 7.1832,
"mean_token_accuracy": 0.07680457159876823,
"num_tokens": 714331.0,
"step": 385
},
{
"entropy": 7.597209930419922,
"epoch": 0.03276622558286074,
"grad_norm": 1.25,
"learning_rate": 0.0001945,
"loss": 7.139,
"mean_token_accuracy": 0.08005330711603165,
"num_tokens": 722788.0,
"step": 390
},
{
"entropy": 7.544012260437012,
"epoch": 0.033186305398025626,
"grad_norm": 1.28125,
"learning_rate": 0.00019700000000000002,
"loss": 7.2038,
"mean_token_accuracy": 0.08137390315532685,
"num_tokens": 731417.0,
"step": 395
},
{
"entropy": 7.463483619689941,
"epoch": 0.03360638521319051,
"grad_norm": 1.0703125,
"learning_rate": 0.00019950000000000002,
"loss": 7.1774,
"mean_token_accuracy": 0.08162600994110107,
"num_tokens": 741034.0,
"step": 400
},
{
"entropy": 7.472280406951905,
"epoch": 0.034026465028355386,
"grad_norm": 1.15625,
"learning_rate": 0.000202,
"loss": 7.1863,
"mean_token_accuracy": 0.08051239103078842,
"num_tokens": 749596.0,
"step": 405
},
{
"entropy": 7.539561700820923,
"epoch": 0.03444654484352027,
"grad_norm": 0.97265625,
"learning_rate": 0.00020449999999999998,
"loss": 7.1293,
"mean_token_accuracy": 0.08054611459374428,
"num_tokens": 758931.0,
"step": 410
},
{
"entropy": 7.323099613189697,
"epoch": 0.03486662465868515,
"grad_norm": 0.953125,
"learning_rate": 0.000207,
"loss": 7.0321,
"mean_token_accuracy": 0.08574115931987762,
"num_tokens": 767534.0,
"step": 415
},
{
"entropy": 7.436507081985473,
"epoch": 0.03528670447385003,
"grad_norm": 1.390625,
"learning_rate": 0.0002095,
"loss": 7.1087,
"mean_token_accuracy": 0.0788518838584423,
"num_tokens": 776456.0,
"step": 420
},
{
"entropy": 7.4387647151947025,
"epoch": 0.035706784289014915,
"grad_norm": 1.03125,
"learning_rate": 0.000212,
"loss": 7.1659,
"mean_token_accuracy": 0.07921701893210412,
"num_tokens": 786172.0,
"step": 425
},
{
"entropy": 7.382072401046753,
"epoch": 0.03612686410417979,
"grad_norm": 1.1171875,
"learning_rate": 0.0002145,
"loss": 7.045,
"mean_token_accuracy": 0.08349293395876885,
"num_tokens": 795081.0,
"step": 430
},
{
"entropy": 7.354331922531128,
"epoch": 0.036546943919344675,
"grad_norm": 1.390625,
"learning_rate": 0.00021700000000000002,
"loss": 7.0961,
"mean_token_accuracy": 0.07894284576177597,
"num_tokens": 804259.0,
"step": 435
},
{
"entropy": 7.416359519958496,
"epoch": 0.03696702373450956,
"grad_norm": 1.25,
"learning_rate": 0.0002195,
"loss": 7.1173,
"mean_token_accuracy": 0.08117417171597481,
"num_tokens": 813463.0,
"step": 440
},
{
"entropy": 7.3510298252105715,
"epoch": 0.037387103549674436,
"grad_norm": 1.171875,
"learning_rate": 0.000222,
"loss": 7.0365,
"mean_token_accuracy": 0.08668158128857613,
"num_tokens": 823029.0,
"step": 445
},
{
"entropy": 7.395490074157715,
"epoch": 0.03780718336483932,
"grad_norm": 1.1484375,
"learning_rate": 0.0002245,
"loss": 7.0882,
"mean_token_accuracy": 0.07999400310218334,
"num_tokens": 832902.0,
"step": 450
},
{
"entropy": 7.301269912719727,
"epoch": 0.0382272631800042,
"grad_norm": 0.99609375,
"learning_rate": 0.00022700000000000002,
"loss": 7.0386,
"mean_token_accuracy": 0.08621552512049675,
"num_tokens": 842162.0,
"step": 455
},
{
"entropy": 7.3866761207580565,
"epoch": 0.03864734299516908,
"grad_norm": 1.1171875,
"learning_rate": 0.00022950000000000002,
"loss": 7.0749,
"mean_token_accuracy": 0.08230168521404266,
"num_tokens": 852328.0,
"step": 460
},
{
"entropy": 7.298245000839233,
"epoch": 0.039067422810333964,
"grad_norm": 1.03125,
"learning_rate": 0.00023200000000000003,
"loss": 7.0332,
"mean_token_accuracy": 0.08720984831452369,
"num_tokens": 860929.0,
"step": 465
},
{
"entropy": 7.339401197433472,
"epoch": 0.03948750262549885,
"grad_norm": 1.3046875,
"learning_rate": 0.00023449999999999998,
"loss": 7.1147,
"mean_token_accuracy": 0.07953860089182854,
"num_tokens": 869144.0,
"step": 470
},
{
"entropy": 7.407509994506836,
"epoch": 0.039907582440663725,
"grad_norm": 1.1328125,
"learning_rate": 0.000237,
"loss": 7.0553,
"mean_token_accuracy": 0.08522386401891709,
"num_tokens": 877447.0,
"step": 475
},
{
"entropy": 7.326080799102783,
"epoch": 0.04032766225582861,
"grad_norm": 1.1875,
"learning_rate": 0.0002395,
"loss": 7.0219,
"mean_token_accuracy": 0.07903883457183838,
"num_tokens": 887020.0,
"step": 480
},
{
"entropy": 7.214662790298462,
"epoch": 0.040747742070993485,
"grad_norm": 1.1953125,
"learning_rate": 0.000242,
"loss": 7.0658,
"mean_token_accuracy": 0.08200340047478676,
"num_tokens": 895937.0,
"step": 485
},
{
"entropy": 7.282938385009766,
"epoch": 0.04116782188615837,
"grad_norm": 1.109375,
"learning_rate": 0.0002445,
"loss": 7.0681,
"mean_token_accuracy": 0.07768266201019287,
"num_tokens": 905446.0,
"step": 490
},
{
"entropy": 7.278123235702514,
"epoch": 0.04158790170132325,
"grad_norm": 1.234375,
"learning_rate": 0.000247,
"loss": 6.9754,
"mean_token_accuracy": 0.08815655037760735,
"num_tokens": 914547.0,
"step": 495
},
{
"entropy": 7.246780204772949,
"epoch": 0.04200798151648813,
"grad_norm": 1.1171875,
"learning_rate": 0.0002495,
"loss": 6.971,
"mean_token_accuracy": 0.08642620444297791,
"num_tokens": 922900.0,
"step": 500
},
{
"entropy": 7.250076103210449,
"epoch": 0.042428061331653014,
"grad_norm": 1.046875,
"learning_rate": 0.000252,
"loss": 6.9895,
"mean_token_accuracy": 0.09132884815335274,
"num_tokens": 930876.0,
"step": 505
},
{
"entropy": 7.281206130981445,
"epoch": 0.0428481411468179,
"grad_norm": 1.0703125,
"learning_rate": 0.0002545,
"loss": 7.0298,
"mean_token_accuracy": 0.08785640895366668,
"num_tokens": 939871.0,
"step": 510
},
{
"entropy": 7.217110443115234,
"epoch": 0.043268220961982774,
"grad_norm": 1.1640625,
"learning_rate": 0.000257,
"loss": 7.0255,
"mean_token_accuracy": 0.08548255637288094,
"num_tokens": 948673.0,
"step": 515
},
{
"entropy": 7.166579723358154,
"epoch": 0.04368830077714766,
"grad_norm": 1.296875,
"learning_rate": 0.0002595,
"loss": 6.9755,
"mean_token_accuracy": 0.08237149193882942,
"num_tokens": 957603.0,
"step": 520
},
{
"entropy": 7.258489608764648,
"epoch": 0.04410838059231254,
"grad_norm": 1.1015625,
"learning_rate": 0.000262,
"loss": 7.0438,
"mean_token_accuracy": 0.08337677642703056,
"num_tokens": 967731.0,
"step": 525
},
{
"entropy": 7.215053987503052,
"epoch": 0.04452846040747742,
"grad_norm": 1.34375,
"learning_rate": 0.00026450000000000003,
"loss": 7.0538,
"mean_token_accuracy": 0.08425389714539051,
"num_tokens": 977427.0,
"step": 530
},
{
"entropy": 7.323162364959717,
"epoch": 0.0449485402226423,
"grad_norm": 1.4296875,
"learning_rate": 0.00026700000000000004,
"loss": 7.0242,
"mean_token_accuracy": 0.0810987412929535,
"num_tokens": 986758.0,
"step": 535
},
{
"entropy": 7.260653781890869,
"epoch": 0.045368620037807186,
"grad_norm": 1.265625,
"learning_rate": 0.00026950000000000005,
"loss": 7.0153,
"mean_token_accuracy": 0.0921817146241665,
"num_tokens": 996377.0,
"step": 540
},
{
"entropy": 7.14100399017334,
"epoch": 0.04578869985297206,
"grad_norm": 1.171875,
"learning_rate": 0.00027200000000000005,
"loss": 7.0719,
"mean_token_accuracy": 0.07598314173519612,
"num_tokens": 1006483.0,
"step": 545
},
{
"entropy": 7.1814124584198,
"epoch": 0.04620877966813695,
"grad_norm": 0.99609375,
"learning_rate": 0.0002745,
"loss": 6.9586,
"mean_token_accuracy": 0.08432785160839558,
"num_tokens": 1016132.0,
"step": 550
},
{
"entropy": 7.212322998046875,
"epoch": 0.04662885948330183,
"grad_norm": 1.0625,
"learning_rate": 0.000277,
"loss": 6.9099,
"mean_token_accuracy": 0.08570380732417107,
"num_tokens": 1024970.0,
"step": 555
},
{
"entropy": 7.235566568374634,
"epoch": 0.04704893929846671,
"grad_norm": 1.0703125,
"learning_rate": 0.0002795,
"loss": 6.986,
"mean_token_accuracy": 0.08736011460423469,
"num_tokens": 1034335.0,
"step": 560
},
{
"entropy": 7.172399663925171,
"epoch": 0.04746901911363159,
"grad_norm": 1.0859375,
"learning_rate": 0.00028199999999999997,
"loss": 7.0432,
"mean_token_accuracy": 0.09397755041718484,
"num_tokens": 1043954.0,
"step": 565
},
{
"entropy": 7.211180973052978,
"epoch": 0.04788909892879647,
"grad_norm": 1.0625,
"learning_rate": 0.0002845,
"loss": 6.9855,
"mean_token_accuracy": 0.08458798602223397,
"num_tokens": 1053554.0,
"step": 570
},
{
"entropy": 7.182736825942993,
"epoch": 0.04830917874396135,
"grad_norm": 1.1875,
"learning_rate": 0.000287,
"loss": 6.9441,
"mean_token_accuracy": 0.0878808081150055,
"num_tokens": 1062008.0,
"step": 575
},
{
"entropy": 7.098301124572754,
"epoch": 0.048729258559126236,
"grad_norm": 1.25,
"learning_rate": 0.0002895,
"loss": 7.0042,
"mean_token_accuracy": 0.09225907325744628,
"num_tokens": 1070740.0,
"step": 580
},
{
"entropy": 7.182776641845703,
"epoch": 0.04914933837429111,
"grad_norm": 1.328125,
"learning_rate": 0.000292,
"loss": 6.9919,
"mean_token_accuracy": 0.08668612986803055,
"num_tokens": 1079681.0,
"step": 585
},
{
"entropy": 7.184729337692261,
"epoch": 0.049569418189456,
"grad_norm": 1.3046875,
"learning_rate": 0.0002945,
"loss": 6.8713,
"mean_token_accuracy": 0.08976615592837334,
"num_tokens": 1088979.0,
"step": 590
},
{
"entropy": 7.06590256690979,
"epoch": 0.04998949800462088,
"grad_norm": 1.28125,
"learning_rate": 0.000297,
"loss": 6.8795,
"mean_token_accuracy": 0.09069397076964378,
"num_tokens": 1097870.0,
"step": 595
},
{
"entropy": 7.103166151046753,
"epoch": 0.05040957781978576,
"grad_norm": 1.1875,
"learning_rate": 0.0002995,
"loss": 6.9879,
"mean_token_accuracy": 0.08688322901725769,
"num_tokens": 1107948.0,
"step": 600
},
{
"entropy": 7.1195960521698,
"epoch": 0.05082965763495064,
"grad_norm": 1.125,
"learning_rate": 0.000302,
"loss": 6.9085,
"mean_token_accuracy": 0.08996571898460388,
"num_tokens": 1117032.0,
"step": 605
},
{
"entropy": 7.0409345626831055,
"epoch": 0.051249737450115525,
"grad_norm": 1.2578125,
"learning_rate": 0.0003045,
"loss": 6.8922,
"mean_token_accuracy": 0.09091109931468963,
"num_tokens": 1127834.0,
"step": 610
},
{
"entropy": 7.202378034591675,
"epoch": 0.0516698172652804,
"grad_norm": 1.359375,
"learning_rate": 0.000307,
"loss": 6.9702,
"mean_token_accuracy": 0.10021311864256859,
"num_tokens": 1137382.0,
"step": 615
},
{
"entropy": 6.999694728851319,
"epoch": 0.052089897080445285,
"grad_norm": 1.0703125,
"learning_rate": 0.0003095,
"loss": 6.8129,
"mean_token_accuracy": 0.09540090411901474,
"num_tokens": 1146095.0,
"step": 620
},
{
"entropy": 7.04736361503601,
"epoch": 0.05250997689561017,
"grad_norm": 1.0703125,
"learning_rate": 0.000312,
"loss": 6.8486,
"mean_token_accuracy": 0.09502546936273575,
"num_tokens": 1154981.0,
"step": 625
},
{
"entropy": 6.99720253944397,
"epoch": 0.052930056710775046,
"grad_norm": 1.140625,
"learning_rate": 0.0003145,
"loss": 6.8621,
"mean_token_accuracy": 0.09437942430377007,
"num_tokens": 1164939.0,
"step": 630
},
{
"entropy": 7.151091146469116,
"epoch": 0.05335013652593993,
"grad_norm": 1.3125,
"learning_rate": 0.000317,
"loss": 6.9914,
"mean_token_accuracy": 0.0866759791970253,
"num_tokens": 1174991.0,
"step": 635
},
{
"entropy": 7.180017423629761,
"epoch": 0.05377021634110481,
"grad_norm": 1.0234375,
"learning_rate": 0.0003195,
"loss": 7.0331,
"mean_token_accuracy": 0.0841725155711174,
"num_tokens": 1184885.0,
"step": 640
},
{
"entropy": 6.973786115646362,
"epoch": 0.05419029615626969,
"grad_norm": 1.2421875,
"learning_rate": 0.000322,
"loss": 6.9191,
"mean_token_accuracy": 0.08975687026977539,
"num_tokens": 1193637.0,
"step": 645
},
{
"entropy": 6.9996246814727785,
"epoch": 0.054610375971434574,
"grad_norm": 1.25,
"learning_rate": 0.00032450000000000003,
"loss": 6.7105,
"mean_token_accuracy": 0.09813873320817948,
"num_tokens": 1202188.0,
"step": 650
},
{
"entropy": 7.099790334701538,
"epoch": 0.05503045578659945,
"grad_norm": 1.125,
"learning_rate": 0.00032700000000000003,
"loss": 6.8407,
"mean_token_accuracy": 0.08720196485519409,
"num_tokens": 1210768.0,
"step": 655
},
{
"entropy": 7.041568231582642,
"epoch": 0.055450535601764335,
"grad_norm": 1.1875,
"learning_rate": 0.00032950000000000004,
"loss": 6.8421,
"mean_token_accuracy": 0.0917449563741684,
"num_tokens": 1219819.0,
"step": 660
},
{
"entropy": 7.046403980255127,
"epoch": 0.05587061541692922,
"grad_norm": 0.921875,
"learning_rate": 0.00033200000000000005,
"loss": 6.8979,
"mean_token_accuracy": 0.08456902354955673,
"num_tokens": 1229703.0,
"step": 665
},
{
"entropy": 7.123572778701782,
"epoch": 0.056290695232094096,
"grad_norm": 1.234375,
"learning_rate": 0.00033450000000000005,
"loss": 6.9292,
"mean_token_accuracy": 0.08853036314249038,
"num_tokens": 1238942.0,
"step": 670
},
{
"entropy": 7.159795522689819,
"epoch": 0.05671077504725898,
"grad_norm": 1.25,
"learning_rate": 0.000337,
"loss": 6.9738,
"mean_token_accuracy": 0.08909042924642563,
"num_tokens": 1248943.0,
"step": 675
},
{
"entropy": 6.958240079879761,
"epoch": 0.05713085486242386,
"grad_norm": 1.0703125,
"learning_rate": 0.0003395,
"loss": 6.8634,
"mean_token_accuracy": 0.09144520461559295,
"num_tokens": 1257761.0,
"step": 680
},
{
"entropy": 6.8688782215118405,
"epoch": 0.05755093467758874,
"grad_norm": 1.140625,
"learning_rate": 0.000342,
"loss": 6.7949,
"mean_token_accuracy": 0.08892206028103829,
"num_tokens": 1267216.0,
"step": 685
},
{
"entropy": 7.068194055557251,
"epoch": 0.057971014492753624,
"grad_norm": 1.125,
"learning_rate": 0.00034449999999999997,
"loss": 6.8935,
"mean_token_accuracy": 0.0898799903690815,
"num_tokens": 1277210.0,
"step": 690
},
{
"entropy": 7.016180753707886,
"epoch": 0.05839109430791851,
"grad_norm": 1.0546875,
"learning_rate": 0.000347,
"loss": 6.818,
"mean_token_accuracy": 0.08777436465024949,
"num_tokens": 1285310.0,
"step": 695
},
{
"entropy": 6.991688251495361,
"epoch": 0.058811174123083385,
"grad_norm": 1.265625,
"learning_rate": 0.0003495,
"loss": 6.83,
"mean_token_accuracy": 0.09071314185857773,
"num_tokens": 1294421.0,
"step": 700
},
{
"entropy": 6.878597545623779,
"epoch": 0.05923125393824827,
"grad_norm": 1.1953125,
"learning_rate": 0.000352,
"loss": 6.6618,
"mean_token_accuracy": 0.09866252094507218,
"num_tokens": 1303281.0,
"step": 705
},
{
"entropy": 6.936507320404052,
"epoch": 0.059651333753413145,
"grad_norm": 1.1953125,
"learning_rate": 0.0003545,
"loss": 6.824,
"mean_token_accuracy": 0.0997501090168953,
"num_tokens": 1312280.0,
"step": 710
},
{
"entropy": 6.8826006889343265,
"epoch": 0.06007141356857803,
"grad_norm": 0.98828125,
"learning_rate": 0.000357,
"loss": 6.7922,
"mean_token_accuracy": 0.09014676585793495,
"num_tokens": 1321243.0,
"step": 715
},
{
"entropy": 6.928562927246094,
"epoch": 0.06049149338374291,
"grad_norm": 1.0625,
"learning_rate": 0.0003595,
"loss": 6.8825,
"mean_token_accuracy": 0.09469160959124565,
"num_tokens": 1330324.0,
"step": 720
},
{
"entropy": 6.990442323684692,
"epoch": 0.06091157319890779,
"grad_norm": 1.140625,
"learning_rate": 0.000362,
"loss": 6.7224,
"mean_token_accuracy": 0.09678644239902497,
"num_tokens": 1339485.0,
"step": 725
},
{
"entropy": 6.953311347961426,
"epoch": 0.06133165301407267,
"grad_norm": 1.1796875,
"learning_rate": 0.0003645,
"loss": 6.8803,
"mean_token_accuracy": 0.08837029710412025,
"num_tokens": 1348640.0,
"step": 730
},
{
"entropy": 6.882500171661377,
"epoch": 0.06175173282923756,
"grad_norm": 1.1796875,
"learning_rate": 0.000367,
"loss": 6.7691,
"mean_token_accuracy": 0.09767747819423675,
"num_tokens": 1357581.0,
"step": 735
},
{
"entropy": 6.97215313911438,
"epoch": 0.062171812644402434,
"grad_norm": 1.1171875,
"learning_rate": 0.0003695,
"loss": 6.8411,
"mean_token_accuracy": 0.0938787505030632,
"num_tokens": 1367883.0,
"step": 740
},
{
"entropy": 6.919119882583618,
"epoch": 0.06259189245956731,
"grad_norm": 1.0546875,
"learning_rate": 0.000372,
"loss": 6.7914,
"mean_token_accuracy": 0.09219447746872902,
"num_tokens": 1376936.0,
"step": 745
},
{
"entropy": 6.825827884674072,
"epoch": 0.0630119722747322,
"grad_norm": 1.0859375,
"learning_rate": 0.0003745,
"loss": 6.7125,
"mean_token_accuracy": 0.09528392925858498,
"num_tokens": 1386359.0,
"step": 750
},
{
"entropy": 6.892624235153198,
"epoch": 0.06343205208989708,
"grad_norm": 1.0078125,
"learning_rate": 0.000377,
"loss": 6.7627,
"mean_token_accuracy": 0.09940937235951423,
"num_tokens": 1395223.0,
"step": 755
},
{
"entropy": 7.047525787353516,
"epoch": 0.06385213190506196,
"grad_norm": 1.09375,
"learning_rate": 0.0003795,
"loss": 6.9106,
"mean_token_accuracy": 0.09024005718529224,
"num_tokens": 1404917.0,
"step": 760
},
{
"entropy": 6.961672592163086,
"epoch": 0.06427221172022685,
"grad_norm": 1.1328125,
"learning_rate": 0.000382,
"loss": 6.8159,
"mean_token_accuracy": 0.10144984871149063,
"num_tokens": 1413348.0,
"step": 765
},
{
"entropy": 6.793653059005737,
"epoch": 0.06469229153539173,
"grad_norm": 1.1484375,
"learning_rate": 0.0003845,
"loss": 6.7916,
"mean_token_accuracy": 0.09195128381252289,
"num_tokens": 1421726.0,
"step": 770
},
{
"entropy": 6.895196437835693,
"epoch": 0.0651123713505566,
"grad_norm": 1.0546875,
"learning_rate": 0.00038700000000000003,
"loss": 6.7955,
"mean_token_accuracy": 0.09626475274562836,
"num_tokens": 1430686.0,
"step": 775
},
{
"entropy": 6.93384485244751,
"epoch": 0.06553245116572148,
"grad_norm": 1.0703125,
"learning_rate": 0.00038950000000000003,
"loss": 6.7897,
"mean_token_accuracy": 0.09465737789869308,
"num_tokens": 1439499.0,
"step": 780
},
{
"entropy": 6.955707168579101,
"epoch": 0.06595253098088637,
"grad_norm": 1.3203125,
"learning_rate": 0.00039200000000000004,
"loss": 6.7769,
"mean_token_accuracy": 0.09800057783722878,
"num_tokens": 1448220.0,
"step": 785
},
{
"entropy": 6.76906795501709,
"epoch": 0.06637261079605125,
"grad_norm": 1.046875,
"learning_rate": 0.00039450000000000005,
"loss": 6.7919,
"mean_token_accuracy": 0.08977739810943604,
"num_tokens": 1458217.0,
"step": 790
},
{
"entropy": 6.814671993255615,
"epoch": 0.06679269061121614,
"grad_norm": 1.03125,
"learning_rate": 0.00039700000000000005,
"loss": 6.7075,
"mean_token_accuracy": 0.09342081621289253,
"num_tokens": 1467422.0,
"step": 795
},
{
"entropy": 6.887504005432129,
"epoch": 0.06721277042638102,
"grad_norm": 1.125,
"learning_rate": 0.0003995,
"loss": 6.6819,
"mean_token_accuracy": 0.10001382231712341,
"num_tokens": 1476152.0,
"step": 800
},
{
"entropy": 6.807573080062866,
"epoch": 0.06763285024154589,
"grad_norm": 1.1171875,
"learning_rate": 0.000402,
"loss": 6.7751,
"mean_token_accuracy": 0.09214248061180115,
"num_tokens": 1485248.0,
"step": 805
},
{
"entropy": 6.854774427413941,
"epoch": 0.06805293005671077,
"grad_norm": 1.1484375,
"learning_rate": 0.0004045,
"loss": 6.7307,
"mean_token_accuracy": 0.09543775320053101,
"num_tokens": 1494248.0,
"step": 810
},
{
"entropy": 6.848575687408447,
"epoch": 0.06847300987187566,
"grad_norm": 1.234375,
"learning_rate": 0.00040699999999999997,
"loss": 6.8448,
"mean_token_accuracy": 0.0940382607281208,
"num_tokens": 1503565.0,
"step": 815
},
{
"entropy": 6.988439130783081,
"epoch": 0.06889308968704054,
"grad_norm": 1.0546875,
"learning_rate": 0.0004095,
"loss": 6.9384,
"mean_token_accuracy": 0.08889181464910507,
"num_tokens": 1513227.0,
"step": 820
},
{
"entropy": 6.93678297996521,
"epoch": 0.06931316950220542,
"grad_norm": 1.1796875,
"learning_rate": 0.000412,
"loss": 6.7217,
"mean_token_accuracy": 0.10070210471749305,
"num_tokens": 1522312.0,
"step": 825
},
{
"entropy": 6.770338535308838,
"epoch": 0.0697332493173703,
"grad_norm": 1.0390625,
"learning_rate": 0.0004145,
"loss": 6.6784,
"mean_token_accuracy": 0.09791189730167389,
"num_tokens": 1531720.0,
"step": 830
},
{
"entropy": 6.800765943527222,
"epoch": 0.07015332913253518,
"grad_norm": 1.1171875,
"learning_rate": 0.000417,
"loss": 6.7521,
"mean_token_accuracy": 0.09716509580612183,
"num_tokens": 1541238.0,
"step": 835
},
{
"entropy": 6.8829351425170895,
"epoch": 0.07057340894770006,
"grad_norm": 1.1328125,
"learning_rate": 0.0004195,
"loss": 6.8628,
"mean_token_accuracy": 0.09571778625249863,
"num_tokens": 1550875.0,
"step": 840
},
{
"entropy": 6.7474853515625,
"epoch": 0.07099348876286495,
"grad_norm": 1.0390625,
"learning_rate": 0.000422,
"loss": 6.7945,
"mean_token_accuracy": 0.09439405128359794,
"num_tokens": 1560287.0,
"step": 845
},
{
"entropy": 6.8450279712677,
"epoch": 0.07141356857802983,
"grad_norm": 1.0703125,
"learning_rate": 0.0004245,
"loss": 6.6719,
"mean_token_accuracy": 0.10050797313451768,
"num_tokens": 1569043.0,
"step": 850
},
{
"entropy": 6.72012848854065,
"epoch": 0.07183364839319471,
"grad_norm": 1.0703125,
"learning_rate": 0.000427,
"loss": 6.6946,
"mean_token_accuracy": 0.10327838435769081,
"num_tokens": 1578112.0,
"step": 855
},
{
"entropy": 6.666503381729126,
"epoch": 0.07225372820835958,
"grad_norm": 1.1484375,
"learning_rate": 0.0004295,
"loss": 6.6083,
"mean_token_accuracy": 0.10177602767944335,
"num_tokens": 1586587.0,
"step": 860
},
{
"entropy": 6.876049327850342,
"epoch": 0.07267380802352447,
"grad_norm": 1.0390625,
"learning_rate": 0.000432,
"loss": 6.7715,
"mean_token_accuracy": 0.09597784802317619,
"num_tokens": 1595585.0,
"step": 865
},
{
"entropy": 6.793572664260864,
"epoch": 0.07309388783868935,
"grad_norm": 1.078125,
"learning_rate": 0.0004345,
"loss": 6.7402,
"mean_token_accuracy": 0.09475546851754188,
"num_tokens": 1605355.0,
"step": 870
},
{
"entropy": 6.829131984710694,
"epoch": 0.07351396765385423,
"grad_norm": 1.1796875,
"learning_rate": 0.000437,
"loss": 6.7645,
"mean_token_accuracy": 0.09627607688307763,
"num_tokens": 1613637.0,
"step": 875
},
{
"entropy": 6.7632164478302,
"epoch": 0.07393404746901912,
"grad_norm": 1.1171875,
"learning_rate": 0.0004395,
"loss": 6.7187,
"mean_token_accuracy": 0.09899500831961631,
"num_tokens": 1622731.0,
"step": 880
},
{
"entropy": 6.812683629989624,
"epoch": 0.074354127284184,
"grad_norm": 1.0234375,
"learning_rate": 0.000442,
"loss": 6.678,
"mean_token_accuracy": 0.09412262439727784,
"num_tokens": 1632098.0,
"step": 885
},
{
"entropy": 6.743659448623657,
"epoch": 0.07477420709934887,
"grad_norm": 1.0234375,
"learning_rate": 0.0004445,
"loss": 6.6765,
"mean_token_accuracy": 0.09482985511422157,
"num_tokens": 1641259.0,
"step": 890
},
{
"entropy": 6.833035087585449,
"epoch": 0.07519428691451376,
"grad_norm": 1.1796875,
"learning_rate": 0.000447,
"loss": 6.7498,
"mean_token_accuracy": 0.09258906096220017,
"num_tokens": 1651362.0,
"step": 895
},
{
"entropy": 6.710019874572754,
"epoch": 0.07561436672967864,
"grad_norm": 1.109375,
"learning_rate": 0.00044950000000000003,
"loss": 6.6731,
"mean_token_accuracy": 0.09449022710323333,
"num_tokens": 1660190.0,
"step": 900
},
{
"entropy": 6.716372060775757,
"epoch": 0.07603444654484352,
"grad_norm": 1.1796875,
"learning_rate": 0.00045200000000000004,
"loss": 6.6958,
"mean_token_accuracy": 0.09791603237390518,
"num_tokens": 1669020.0,
"step": 905
},
{
"entropy": 6.81228666305542,
"epoch": 0.0764545263600084,
"grad_norm": 1.09375,
"learning_rate": 0.00045450000000000004,
"loss": 6.7321,
"mean_token_accuracy": 0.09860685616731643,
"num_tokens": 1678158.0,
"step": 910
},
{
"entropy": 6.792080020904541,
"epoch": 0.07687460617517328,
"grad_norm": 1.109375,
"learning_rate": 0.00045700000000000005,
"loss": 6.7306,
"mean_token_accuracy": 0.09886500239372253,
"num_tokens": 1687481.0,
"step": 915
},
{
"entropy": 6.71827883720398,
"epoch": 0.07729468599033816,
"grad_norm": 1.0234375,
"learning_rate": 0.00045950000000000006,
"loss": 6.6994,
"mean_token_accuracy": 0.103325155377388,
"num_tokens": 1696782.0,
"step": 920
},
{
"entropy": 6.721747827529907,
"epoch": 0.07771476580550304,
"grad_norm": 1.0546875,
"learning_rate": 0.000462,
"loss": 6.7107,
"mean_token_accuracy": 0.10372448563575745,
"num_tokens": 1706153.0,
"step": 925
},
{
"entropy": 6.703522777557373,
"epoch": 0.07813484562066793,
"grad_norm": 1.1640625,
"learning_rate": 0.0004645,
"loss": 6.7323,
"mean_token_accuracy": 0.10109473243355752,
"num_tokens": 1715585.0,
"step": 930
},
{
"entropy": 6.9429340839385985,
"epoch": 0.07855492543583281,
"grad_norm": 1.4296875,
"learning_rate": 0.000467,
"loss": 6.8552,
"mean_token_accuracy": 0.09585651680827141,
"num_tokens": 1724857.0,
"step": 935
},
{
"entropy": 6.723682641983032,
"epoch": 0.0789750052509977,
"grad_norm": 1.2265625,
"learning_rate": 0.0004695,
"loss": 6.6587,
"mean_token_accuracy": 0.10578344613313675,
"num_tokens": 1733528.0,
"step": 940
},
{
"entropy": 6.796629476547241,
"epoch": 0.07939508506616257,
"grad_norm": 0.9765625,
"learning_rate": 0.000472,
"loss": 6.7839,
"mean_token_accuracy": 0.09946857616305352,
"num_tokens": 1742953.0,
"step": 945
},
{
"entropy": 6.778720664978027,
"epoch": 0.07981516488132745,
"grad_norm": 1.265625,
"learning_rate": 0.0004745,
"loss": 6.7418,
"mean_token_accuracy": 0.10183344334363938,
"num_tokens": 1752155.0,
"step": 950
},
{
"entropy": 6.6747581481933596,
"epoch": 0.08023524469649233,
"grad_norm": 1.1328125,
"learning_rate": 0.000477,
"loss": 6.6189,
"mean_token_accuracy": 0.10308177843689918,
"num_tokens": 1760562.0,
"step": 955
},
{
"entropy": 6.6881184577941895,
"epoch": 0.08065532451165722,
"grad_norm": 1.1875,
"learning_rate": 0.0004795,
"loss": 6.6407,
"mean_token_accuracy": 0.09682166650891304,
"num_tokens": 1769631.0,
"step": 960
},
{
"entropy": 6.686205625534058,
"epoch": 0.0810754043268221,
"grad_norm": 1.1875,
"learning_rate": 0.000482,
"loss": 6.6737,
"mean_token_accuracy": 0.09623132348060608,
"num_tokens": 1779080.0,
"step": 965
},
{
"entropy": 6.71329026222229,
"epoch": 0.08149548414198697,
"grad_norm": 1.3359375,
"learning_rate": 0.0004845,
"loss": 6.6501,
"mean_token_accuracy": 0.09797736331820488,
"num_tokens": 1787830.0,
"step": 970
},
{
"entropy": 6.607724714279175,
"epoch": 0.08191556395715185,
"grad_norm": 1.0390625,
"learning_rate": 0.000487,
"loss": 6.5762,
"mean_token_accuracy": 0.10056376829743385,
"num_tokens": 1796998.0,
"step": 975
},
{
"entropy": 6.796718168258667,
"epoch": 0.08233564377231674,
"grad_norm": 1.1171875,
"learning_rate": 0.0004895,
"loss": 6.6548,
"mean_token_accuracy": 0.10055585950613022,
"num_tokens": 1806194.0,
"step": 980
},
{
"entropy": 6.432325410842895,
"epoch": 0.08275572358748162,
"grad_norm": 1.0,
"learning_rate": 0.000492,
"loss": 6.5356,
"mean_token_accuracy": 0.10625480636954307,
"num_tokens": 1815751.0,
"step": 985
},
{
"entropy": 6.659997034072876,
"epoch": 0.0831758034026465,
"grad_norm": 1.046875,
"learning_rate": 0.0004945,
"loss": 6.6207,
"mean_token_accuracy": 0.10119200572371483,
"num_tokens": 1825379.0,
"step": 990
},
{
"entropy": 6.685537910461425,
"epoch": 0.08359588321781139,
"grad_norm": 1.0859375,
"learning_rate": 0.000497,
"loss": 6.5776,
"mean_token_accuracy": 0.10274154916405678,
"num_tokens": 1834158.0,
"step": 995
},
{
"entropy": 6.586271667480469,
"epoch": 0.08401596303297626,
"grad_norm": 1.1796875,
"learning_rate": 0.0004995,
"loss": 6.5331,
"mean_token_accuracy": 0.1030009813606739,
"num_tokens": 1842724.0,
"step": 1000
},
{
"entropy": 6.606362009048462,
"epoch": 0.08443604284814114,
"grad_norm": 0.98828125,
"learning_rate": 0.000499999998724557,
"loss": 6.5511,
"mean_token_accuracy": 0.10277646854519844,
"num_tokens": 1852485.0,
"step": 1005
},
{
"entropy": 6.6398862361907955,
"epoch": 0.08485612266330603,
"grad_norm": 1.109375,
"learning_rate": 0.0004999999935430703,
"loss": 6.5824,
"mean_token_accuracy": 0.10646345019340515,
"num_tokens": 1861303.0,
"step": 1010
},
{
"entropy": 6.527065420150757,
"epoch": 0.08527620247847091,
"grad_norm": 1.078125,
"learning_rate": 0.0004999999843758243,
"loss": 6.5766,
"mean_token_accuracy": 0.1160741962492466,
"num_tokens": 1870859.0,
"step": 1015
},
{
"entropy": 6.720239210128784,
"epoch": 0.0856962822936358,
"grad_norm": 1.03125,
"learning_rate": 0.0004999999712228196,
"loss": 6.7375,
"mean_token_accuracy": 0.09559379816055298,
"num_tokens": 1880295.0,
"step": 1020
},
{
"entropy": 6.755932474136353,
"epoch": 0.08611636210880068,
"grad_norm": 1.03125,
"learning_rate": 0.0004999999540840562,
"loss": 6.6322,
"mean_token_accuracy": 0.10711020082235337,
"num_tokens": 1889193.0,
"step": 1025
},
{
"entropy": 6.635529565811157,
"epoch": 0.08653644192396555,
"grad_norm": 0.98046875,
"learning_rate": 0.0004999999329595345,
"loss": 6.7369,
"mean_token_accuracy": 0.09584898650646209,
"num_tokens": 1899437.0,
"step": 1030
},
{
"entropy": 6.71239709854126,
"epoch": 0.08695652173913043,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999999078492548,
"loss": 6.6284,
"mean_token_accuracy": 0.10203150510787964,
"num_tokens": 1907882.0,
"step": 1035
},
{
"entropy": 6.588162136077881,
"epoch": 0.08737660155429532,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999998787532176,
"loss": 6.5411,
"mean_token_accuracy": 0.10083841383457184,
"num_tokens": 1916872.0,
"step": 1040
},
{
"entropy": 6.676869010925293,
"epoch": 0.0877966813694602,
"grad_norm": 1.0859375,
"learning_rate": 0.0004999998456714234,
"loss": 6.7265,
"mean_token_accuracy": 0.09887873977422715,
"num_tokens": 1926636.0,
"step": 1045
},
{
"entropy": 6.626446390151978,
"epoch": 0.08821676118462508,
"grad_norm": 1.125,
"learning_rate": 0.0004999998086038729,
"loss": 6.6125,
"mean_token_accuracy": 0.10665144100785255,
"num_tokens": 1935962.0,
"step": 1050
},
{
"entropy": 6.63335337638855,
"epoch": 0.08863684099978995,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999997675505665,
"loss": 6.5714,
"mean_token_accuracy": 0.10421753227710724,
"num_tokens": 1944600.0,
"step": 1055
},
{
"entropy": 6.66912956237793,
"epoch": 0.08905692081495484,
"grad_norm": 1.1328125,
"learning_rate": 0.0004999997225115052,
"loss": 6.7618,
"mean_token_accuracy": 0.10299883931875228,
"num_tokens": 1954234.0,
"step": 1060
},
{
"entropy": 6.80169267654419,
"epoch": 0.08947700063011972,
"grad_norm": 1.109375,
"learning_rate": 0.0004999996734866896,
"loss": 6.7125,
"mean_token_accuracy": 0.10105927959084511,
"num_tokens": 1964499.0,
"step": 1065
},
{
"entropy": 6.4505407333374025,
"epoch": 0.0898970804452846,
"grad_norm": 1.0859375,
"learning_rate": 0.0004999996204761206,
"loss": 6.4236,
"mean_token_accuracy": 0.1128264844417572,
"num_tokens": 1973635.0,
"step": 1070
},
{
"entropy": 6.56596827507019,
"epoch": 0.09031716026044949,
"grad_norm": 0.96484375,
"learning_rate": 0.0004999995634797993,
"loss": 6.5739,
"mean_token_accuracy": 0.10490957424044609,
"num_tokens": 1983509.0,
"step": 1075
},
{
"entropy": 6.62415657043457,
"epoch": 0.09073724007561437,
"grad_norm": 1.09375,
"learning_rate": 0.0004999995024977265,
"loss": 6.5551,
"mean_token_accuracy": 0.11068339720368385,
"num_tokens": 1992336.0,
"step": 1080
},
{
"entropy": 6.581994724273682,
"epoch": 0.09115731989077924,
"grad_norm": 0.98828125,
"learning_rate": 0.0004999994375299034,
"loss": 6.5937,
"mean_token_accuracy": 0.10452346429228783,
"num_tokens": 2001931.0,
"step": 1085
},
{
"entropy": 6.580089092254639,
"epoch": 0.09157739970594413,
"grad_norm": 0.96484375,
"learning_rate": 0.000499999368576331,
"loss": 6.4447,
"mean_token_accuracy": 0.11201497912406921,
"num_tokens": 2010935.0,
"step": 1090
},
{
"entropy": 6.511315250396729,
"epoch": 0.09199747952110901,
"grad_norm": 1.03125,
"learning_rate": 0.0004999992956370109,
"loss": 6.4995,
"mean_token_accuracy": 0.10933665409684182,
"num_tokens": 2020587.0,
"step": 1095
},
{
"entropy": 6.465148067474365,
"epoch": 0.0924175593362739,
"grad_norm": 1.0234375,
"learning_rate": 0.000499999218711944,
"loss": 6.5391,
"mean_token_accuracy": 0.10621756613254547,
"num_tokens": 2029743.0,
"step": 1100
},
{
"entropy": 6.62024712562561,
"epoch": 0.09283763915143878,
"grad_norm": 1.09375,
"learning_rate": 0.0004999991378011317,
"loss": 6.5513,
"mean_token_accuracy": 0.11122238337993622,
"num_tokens": 2038468.0,
"step": 1105
},
{
"entropy": 6.526382637023926,
"epoch": 0.09325771896660366,
"grad_norm": 1.0078125,
"learning_rate": 0.0004999990529045757,
"loss": 6.4773,
"mean_token_accuracy": 0.10901794061064721,
"num_tokens": 2047456.0,
"step": 1110
},
{
"entropy": 6.6758506298065186,
"epoch": 0.09367779878176853,
"grad_norm": 1.0078125,
"learning_rate": 0.0004999989640222771,
"loss": 6.7907,
"mean_token_accuracy": 0.09581167250871658,
"num_tokens": 2056691.0,
"step": 1115
},
{
"entropy": 6.724249839782715,
"epoch": 0.09409787859693342,
"grad_norm": 1.0,
"learning_rate": 0.000499998871154238,
"loss": 6.5726,
"mean_token_accuracy": 0.1056052066385746,
"num_tokens": 2066068.0,
"step": 1120
},
{
"entropy": 6.619223117828369,
"epoch": 0.0945179584120983,
"grad_norm": 0.98828125,
"learning_rate": 0.0004999987743004597,
"loss": 6.5105,
"mean_token_accuracy": 0.10904356241226196,
"num_tokens": 2075113.0,
"step": 1125
},
{
"entropy": 6.579265213012695,
"epoch": 0.09493803822726318,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999986734609438,
"loss": 6.648,
"mean_token_accuracy": 0.10513966977596283,
"num_tokens": 2084557.0,
"step": 1130
},
{
"entropy": 6.630017042160034,
"epoch": 0.09535811804242807,
"grad_norm": 1.09375,
"learning_rate": 0.0004999985686356923,
"loss": 6.5466,
"mean_token_accuracy": 0.10622756630182266,
"num_tokens": 2093424.0,
"step": 1135
},
{
"entropy": 6.602306842803955,
"epoch": 0.09577819785759294,
"grad_norm": 1.0234375,
"learning_rate": 0.000499998459824707,
"loss": 6.6635,
"mean_token_accuracy": 0.10230447798967361,
"num_tokens": 2103066.0,
"step": 1140
},
{
"entropy": 6.612010765075683,
"epoch": 0.09619827767275782,
"grad_norm": 1.078125,
"learning_rate": 0.00049999834702799,
"loss": 6.5355,
"mean_token_accuracy": 0.1076541669666767,
"num_tokens": 2112447.0,
"step": 1145
},
{
"entropy": 6.522880172729492,
"epoch": 0.0966183574879227,
"grad_norm": 1.0234375,
"learning_rate": 0.0004999982302455431,
"loss": 6.5497,
"mean_token_accuracy": 0.10876928493380547,
"num_tokens": 2121949.0,
"step": 1150
},
{
"entropy": 6.574669218063354,
"epoch": 0.09703843730308759,
"grad_norm": 1.078125,
"learning_rate": 0.0004999981094773683,
"loss": 6.4538,
"mean_token_accuracy": 0.10955686494708061,
"num_tokens": 2130464.0,
"step": 1155
},
{
"entropy": 6.626054668426514,
"epoch": 0.09745851711825247,
"grad_norm": 1.109375,
"learning_rate": 0.000499997984723468,
"loss": 6.6151,
"mean_token_accuracy": 0.10182780474424362,
"num_tokens": 2139577.0,
"step": 1160
},
{
"entropy": 6.2696503639221195,
"epoch": 0.09787859693341736,
"grad_norm": 0.9375,
"learning_rate": 0.0004999978559838441,
"loss": 6.3583,
"mean_token_accuracy": 0.10666822865605355,
"num_tokens": 2147919.0,
"step": 1165
},
{
"entropy": 6.4677763938903805,
"epoch": 0.09829867674858223,
"grad_norm": 1.0078125,
"learning_rate": 0.0004999977232584991,
"loss": 6.5126,
"mean_token_accuracy": 0.10790005698800087,
"num_tokens": 2156936.0,
"step": 1170
},
{
"entropy": 6.619596195220947,
"epoch": 0.09871875656374711,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999975865474354,
"loss": 6.565,
"mean_token_accuracy": 0.1067568302154541,
"num_tokens": 2165362.0,
"step": 1175
},
{
"entropy": 6.488823509216308,
"epoch": 0.099138836378912,
"grad_norm": 1.1328125,
"learning_rate": 0.0004999974458506551,
"loss": 6.4994,
"mean_token_accuracy": 0.105003522336483,
"num_tokens": 2173665.0,
"step": 1180
},
{
"entropy": 6.633204603195191,
"epoch": 0.09955891619407688,
"grad_norm": 1.140625,
"learning_rate": 0.000499997301168161,
"loss": 6.4861,
"mean_token_accuracy": 0.10850023925304413,
"num_tokens": 2182222.0,
"step": 1185
},
{
"entropy": 6.550716543197632,
"epoch": 0.09997899600924176,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999971524999556,
"loss": 6.5724,
"mean_token_accuracy": 0.11284129321575165,
"num_tokens": 2192358.0,
"step": 1190
},
{
"entropy": 6.5804180145263675,
"epoch": 0.10039907582440663,
"grad_norm": 1.0234375,
"learning_rate": 0.0004999969998460414,
"loss": 6.5471,
"mean_token_accuracy": 0.10514037609100342,
"num_tokens": 2201889.0,
"step": 1195
},
{
"entropy": 6.548522186279297,
"epoch": 0.10081915563957151,
"grad_norm": 1.25,
"learning_rate": 0.0004999968432064213,
"loss": 6.5442,
"mean_token_accuracy": 0.11475524455308914,
"num_tokens": 2211810.0,
"step": 1200
},
{
"entropy": 6.474522876739502,
"epoch": 0.1012392354547364,
"grad_norm": 0.94140625,
"learning_rate": 0.0004999966825810979,
"loss": 6.483,
"mean_token_accuracy": 0.10969577804207802,
"num_tokens": 2221123.0,
"step": 1205
},
{
"entropy": 6.482648086547852,
"epoch": 0.10165931526990128,
"grad_norm": 1.0625,
"learning_rate": 0.0004999965179700742,
"loss": 6.4233,
"mean_token_accuracy": 0.11192466542124749,
"num_tokens": 2230129.0,
"step": 1210
},
{
"entropy": 6.427746820449829,
"epoch": 0.10207939508506617,
"grad_norm": 0.98046875,
"learning_rate": 0.000499996349373353,
"loss": 6.4731,
"mean_token_accuracy": 0.11126175448298455,
"num_tokens": 2239929.0,
"step": 1215
},
{
"entropy": 6.540882635116577,
"epoch": 0.10249947490023105,
"grad_norm": 1.0625,
"learning_rate": 0.0004999961767909374,
"loss": 6.4503,
"mean_token_accuracy": 0.11326849237084388,
"num_tokens": 2248078.0,
"step": 1220
},
{
"entropy": 6.496834564208984,
"epoch": 0.10291955471539592,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999960002228303,
"loss": 6.5467,
"mean_token_accuracy": 0.11118405237793923,
"num_tokens": 2256975.0,
"step": 1225
},
{
"entropy": 6.528544092178345,
"epoch": 0.1033396345305608,
"grad_norm": 1.109375,
"learning_rate": 0.0004999958196690349,
"loss": 6.4084,
"mean_token_accuracy": 0.11252974197268487,
"num_tokens": 2265797.0,
"step": 1230
},
{
"entropy": 6.500071668624878,
"epoch": 0.10375971434572569,
"grad_norm": 1.0078125,
"learning_rate": 0.0004999956351295545,
"loss": 6.4961,
"mean_token_accuracy": 0.11529959514737129,
"num_tokens": 2274099.0,
"step": 1235
},
{
"entropy": 6.427072525024414,
"epoch": 0.10417979416089057,
"grad_norm": 1.03125,
"learning_rate": 0.0004999954466043922,
"loss": 6.4331,
"mean_token_accuracy": 0.11734503880143166,
"num_tokens": 2282360.0,
"step": 1240
},
{
"entropy": 6.490129566192627,
"epoch": 0.10459987397605545,
"grad_norm": 0.9375,
"learning_rate": 0.0004999952540935514,
"loss": 6.5292,
"mean_token_accuracy": 0.10359383374452591,
"num_tokens": 2292714.0,
"step": 1245
},
{
"entropy": 6.519558954238891,
"epoch": 0.10501995379122034,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999950575970356,
"loss": 6.4583,
"mean_token_accuracy": 0.11484305784106255,
"num_tokens": 2301633.0,
"step": 1250
},
{
"entropy": 6.521380376815796,
"epoch": 0.10544003360638521,
"grad_norm": 1.03125,
"learning_rate": 0.0004999948571148482,
"loss": 6.4373,
"mean_token_accuracy": 0.1137208767235279,
"num_tokens": 2310067.0,
"step": 1255
},
{
"entropy": 6.447480583190918,
"epoch": 0.10586011342155009,
"grad_norm": 1.046875,
"learning_rate": 0.0004999946526469927,
"loss": 6.5213,
"mean_token_accuracy": 0.11123185902833939,
"num_tokens": 2320090.0,
"step": 1260
},
{
"entropy": 6.488873481750488,
"epoch": 0.10628019323671498,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999944441934728,
"loss": 6.474,
"mean_token_accuracy": 0.11672058925032616,
"num_tokens": 2329255.0,
"step": 1265
},
{
"entropy": 6.575503969192505,
"epoch": 0.10670027305187986,
"grad_norm": 1.109375,
"learning_rate": 0.0004999942317542922,
"loss": 6.5597,
"mean_token_accuracy": 0.11168134436011315,
"num_tokens": 2339535.0,
"step": 1270
},
{
"entropy": 6.4351553440094,
"epoch": 0.10712035286704474,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999940153294546,
"loss": 6.4631,
"mean_token_accuracy": 0.11417224109172822,
"num_tokens": 2348948.0,
"step": 1275
},
{
"entropy": 6.516087198257447,
"epoch": 0.10754043268220961,
"grad_norm": 1.0078125,
"learning_rate": 0.000499993794918964,
"loss": 6.4852,
"mean_token_accuracy": 0.10800226852297783,
"num_tokens": 2359141.0,
"step": 1280
},
{
"entropy": 6.426724910736084,
"epoch": 0.1079605124973745,
"grad_norm": 1.15625,
"learning_rate": 0.0004999935705228241,
"loss": 6.5269,
"mean_token_accuracy": 0.10617210119962692,
"num_tokens": 2368906.0,
"step": 1285
},
{
"entropy": 6.60525369644165,
"epoch": 0.10838059231253938,
"grad_norm": 1.125,
"learning_rate": 0.0004999933421410389,
"loss": 6.5118,
"mean_token_accuracy": 0.11492194160819054,
"num_tokens": 2377029.0,
"step": 1290
},
{
"entropy": 6.507741403579712,
"epoch": 0.10880067212770426,
"grad_norm": 0.91015625,
"learning_rate": 0.0004999931097736125,
"loss": 6.5731,
"mean_token_accuracy": 0.10368336364626884,
"num_tokens": 2387088.0,
"step": 1295
},
{
"entropy": 6.560735607147217,
"epoch": 0.10922075194286915,
"grad_norm": 1.09375,
"learning_rate": 0.0004999928734205492,
"loss": 6.4678,
"mean_token_accuracy": 0.11101854220032692,
"num_tokens": 2395596.0,
"step": 1300
},
{
"entropy": 6.4469006061553955,
"epoch": 0.10964083175803403,
"grad_norm": 1.0625,
"learning_rate": 0.0004999926330818528,
"loss": 6.4508,
"mean_token_accuracy": 0.11560385897755623,
"num_tokens": 2404506.0,
"step": 1305
},
{
"entropy": 6.50532808303833,
"epoch": 0.1100609115731989,
"grad_norm": 1.125,
"learning_rate": 0.0004999923887575278,
"loss": 6.4871,
"mean_token_accuracy": 0.1127387061715126,
"num_tokens": 2414342.0,
"step": 1310
},
{
"entropy": 6.511183404922486,
"epoch": 0.11048099138836379,
"grad_norm": 1.0625,
"learning_rate": 0.0004999921404475785,
"loss": 6.464,
"mean_token_accuracy": 0.11368927583098412,
"num_tokens": 2423076.0,
"step": 1315
},
{
"entropy": 6.4253387451171875,
"epoch": 0.11090107120352867,
"grad_norm": 0.8984375,
"learning_rate": 0.0004999918881520093,
"loss": 6.415,
"mean_token_accuracy": 0.11362927556037902,
"num_tokens": 2432492.0,
"step": 1320
},
{
"entropy": 6.421670770645141,
"epoch": 0.11132115101869355,
"grad_norm": 1.015625,
"learning_rate": 0.0004999916318708246,
"loss": 6.3657,
"mean_token_accuracy": 0.11932958588004113,
"num_tokens": 2441916.0,
"step": 1325
},
{
"entropy": 6.4051666259765625,
"epoch": 0.11174123083385844,
"grad_norm": 1.1328125,
"learning_rate": 0.0004999913716040291,
"loss": 6.4305,
"mean_token_accuracy": 0.11500561088323594,
"num_tokens": 2450932.0,
"step": 1330
},
{
"entropy": 6.410413789749145,
"epoch": 0.11216131064902331,
"grad_norm": 1.1015625,
"learning_rate": 0.0004999911073516272,
"loss": 6.4353,
"mean_token_accuracy": 0.11533465534448624,
"num_tokens": 2460058.0,
"step": 1335
},
{
"entropy": 6.411908531188965,
"epoch": 0.11258139046418819,
"grad_norm": 1.015625,
"learning_rate": 0.0004999908391136237,
"loss": 6.3795,
"mean_token_accuracy": 0.11413594856858253,
"num_tokens": 2469607.0,
"step": 1340
},
{
"entropy": 6.480467748641968,
"epoch": 0.11300147027935308,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999905668900234,
"loss": 6.4206,
"mean_token_accuracy": 0.10967940241098403,
"num_tokens": 2478345.0,
"step": 1345
},
{
"entropy": 6.437506008148193,
"epoch": 0.11342155009451796,
"grad_norm": 1.1484375,
"learning_rate": 0.000499990290680831,
"loss": 6.3551,
"mean_token_accuracy": 0.11509306952357293,
"num_tokens": 2486662.0,
"step": 1350
},
{
"entropy": 6.440225267410279,
"epoch": 0.11384162990968284,
"grad_norm": 1.0625,
"learning_rate": 0.0004999900104860516,
"loss": 6.493,
"mean_token_accuracy": 0.10815305337309837,
"num_tokens": 2495392.0,
"step": 1355
},
{
"entropy": 6.476166391372681,
"epoch": 0.11426170972484773,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999897263056898,
"loss": 6.5134,
"mean_token_accuracy": 0.1084674522280693,
"num_tokens": 2505254.0,
"step": 1360
},
{
"entropy": 6.572962856292724,
"epoch": 0.1146817895400126,
"grad_norm": 1.0390625,
"learning_rate": 0.000499989438139751,
"loss": 6.3391,
"mean_token_accuracy": 0.11722229272127152,
"num_tokens": 2514096.0,
"step": 1365
},
{
"entropy": 6.320563554763794,
"epoch": 0.11510186935517748,
"grad_norm": 0.90625,
"learning_rate": 0.0004999891459882401,
"loss": 6.3415,
"mean_token_accuracy": 0.11798084080219269,
"num_tokens": 2523635.0,
"step": 1370
},
{
"entropy": 6.370605707168579,
"epoch": 0.11552194917034236,
"grad_norm": 1.0234375,
"learning_rate": 0.0004999888498511624,
"loss": 6.4231,
"mean_token_accuracy": 0.11351286545395851,
"num_tokens": 2532528.0,
"step": 1375
},
{
"entropy": 6.42153754234314,
"epoch": 0.11594202898550725,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999885497285229,
"loss": 6.3223,
"mean_token_accuracy": 0.11377703249454499,
"num_tokens": 2541893.0,
"step": 1380
},
{
"entropy": 6.374641036987304,
"epoch": 0.11636210880067213,
"grad_norm": 1.0,
"learning_rate": 0.0004999882456203273,
"loss": 6.3828,
"mean_token_accuracy": 0.11570416316390038,
"num_tokens": 2551551.0,
"step": 1385
},
{
"entropy": 6.409754896163941,
"epoch": 0.11678218861583702,
"grad_norm": 1.0859375,
"learning_rate": 0.0004999879375265806,
"loss": 6.3397,
"mean_token_accuracy": 0.11345641836524009,
"num_tokens": 2560183.0,
"step": 1390
},
{
"entropy": 6.346025371551514,
"epoch": 0.11720226843100189,
"grad_norm": 1.1171875,
"learning_rate": 0.0004999876254472886,
"loss": 6.244,
"mean_token_accuracy": 0.1259176701307297,
"num_tokens": 2568697.0,
"step": 1395
},
{
"entropy": 6.418486166000366,
"epoch": 0.11762234824616677,
"grad_norm": 0.91796875,
"learning_rate": 0.0004999873093824565,
"loss": 6.4413,
"mean_token_accuracy": 0.11301257386803627,
"num_tokens": 2578151.0,
"step": 1400
},
{
"entropy": 6.546730661392212,
"epoch": 0.11804242806133165,
"grad_norm": 1.0859375,
"learning_rate": 0.0004999869893320902,
"loss": 6.5711,
"mean_token_accuracy": 0.11398048102855682,
"num_tokens": 2585901.0,
"step": 1405
},
{
"entropy": 6.384084796905517,
"epoch": 0.11846250787649654,
"grad_norm": 1.046875,
"learning_rate": 0.0004999866652961952,
"loss": 6.3911,
"mean_token_accuracy": 0.1123290129005909,
"num_tokens": 2595655.0,
"step": 1410
},
{
"entropy": 6.452324104309082,
"epoch": 0.11888258769166142,
"grad_norm": 0.9453125,
"learning_rate": 0.0004999863372747773,
"loss": 6.3493,
"mean_token_accuracy": 0.10948696061968803,
"num_tokens": 2604949.0,
"step": 1415
},
{
"entropy": 6.454392957687378,
"epoch": 0.11930266750682629,
"grad_norm": 1.1796875,
"learning_rate": 0.0004999860052678423,
"loss": 6.4265,
"mean_token_accuracy": 0.11580813452601432,
"num_tokens": 2614260.0,
"step": 1420
},
{
"entropy": 6.358513355255127,
"epoch": 0.11972274732199117,
"grad_norm": 1.1796875,
"learning_rate": 0.0004999856692753959,
"loss": 6.4088,
"mean_token_accuracy": 0.11651854142546654,
"num_tokens": 2623740.0,
"step": 1425
},
{
"entropy": 6.431614780426026,
"epoch": 0.12014282713715606,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999853292974444,
"loss": 6.3193,
"mean_token_accuracy": 0.119064449518919,
"num_tokens": 2631998.0,
"step": 1430
},
{
"entropy": 6.387428283691406,
"epoch": 0.12056290695232094,
"grad_norm": 0.9375,
"learning_rate": 0.0004999849853339936,
"loss": 6.4515,
"mean_token_accuracy": 0.11693638861179352,
"num_tokens": 2641169.0,
"step": 1435
},
{
"entropy": 6.461726379394531,
"epoch": 0.12098298676748583,
"grad_norm": 0.94140625,
"learning_rate": 0.0004999846373850497,
"loss": 6.3006,
"mean_token_accuracy": 0.11889183148741722,
"num_tokens": 2650576.0,
"step": 1440
},
{
"entropy": 6.285704851150513,
"epoch": 0.12140306658265071,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999842854506186,
"loss": 6.3909,
"mean_token_accuracy": 0.11337714865803719,
"num_tokens": 2660817.0,
"step": 1445
},
{
"entropy": 6.454374599456787,
"epoch": 0.12182314639781558,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999839295307069,
"loss": 6.3413,
"mean_token_accuracy": 0.11637749969959259,
"num_tokens": 2669338.0,
"step": 1450
},
{
"entropy": 6.4314216613769535,
"epoch": 0.12224322621298046,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999835696253206,
"loss": 6.3931,
"mean_token_accuracy": 0.1177656464278698,
"num_tokens": 2679108.0,
"step": 1455
},
{
"entropy": 6.414116144180298,
"epoch": 0.12266330602814535,
"grad_norm": 0.95703125,
"learning_rate": 0.0004999832057344664,
"loss": 6.3513,
"mean_token_accuracy": 0.11523670107126235,
"num_tokens": 2688126.0,
"step": 1460
},
{
"entropy": 6.248635339736938,
"epoch": 0.12308338584331023,
"grad_norm": 1.09375,
"learning_rate": 0.0004999828378581504,
"loss": 6.3207,
"mean_token_accuracy": 0.1255062073469162,
"num_tokens": 2697245.0,
"step": 1465
},
{
"entropy": 6.469296169281006,
"epoch": 0.12350346565847511,
"grad_norm": 1.0078125,
"learning_rate": 0.0004999824659963793,
"loss": 6.3851,
"mean_token_accuracy": 0.12115937024354935,
"num_tokens": 2705934.0,
"step": 1470
},
{
"entropy": 6.348638343811035,
"epoch": 0.12392354547364,
"grad_norm": 1.1171875,
"learning_rate": 0.0004999820901491598,
"loss": 6.3102,
"mean_token_accuracy": 0.12247596234083176,
"num_tokens": 2714367.0,
"step": 1475
},
{
"entropy": 6.288274192810059,
"epoch": 0.12434362528880487,
"grad_norm": 1.046875,
"learning_rate": 0.0004999817103164983,
"loss": 6.347,
"mean_token_accuracy": 0.120758505910635,
"num_tokens": 2724366.0,
"step": 1480
},
{
"entropy": 6.409024095535278,
"epoch": 0.12476370510396975,
"grad_norm": 0.99609375,
"learning_rate": 0.0004999813264984017,
"loss": 6.3559,
"mean_token_accuracy": 0.11786462664604187,
"num_tokens": 2733980.0,
"step": 1485
},
{
"entropy": 6.405835437774658,
"epoch": 0.12518378491913462,
"grad_norm": 1.0,
"learning_rate": 0.0004999809386948767,
"loss": 6.3447,
"mean_token_accuracy": 0.12095973119139672,
"num_tokens": 2744013.0,
"step": 1490
},
{
"entropy": 6.306218957901001,
"epoch": 0.12560386473429952,
"grad_norm": 1.078125,
"learning_rate": 0.0004999805469059302,
"loss": 6.409,
"mean_token_accuracy": 0.11885412856936454,
"num_tokens": 2753385.0,
"step": 1495
},
{
"entropy": 6.402831554412842,
"epoch": 0.1260239445494644,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999801511315693,
"loss": 6.2797,
"mean_token_accuracy": 0.11564093008637429,
"num_tokens": 2762875.0,
"step": 1500
},
{
"entropy": 6.413339233398437,
"epoch": 0.1264440243646293,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999797513718007,
"loss": 6.3316,
"mean_token_accuracy": 0.12512060776352882,
"num_tokens": 2772182.0,
"step": 1505
},
{
"entropy": 6.232090759277344,
"epoch": 0.12686410417979416,
"grad_norm": 1.015625,
"learning_rate": 0.0004999793476266317,
"loss": 6.2777,
"mean_token_accuracy": 0.1216270886361599,
"num_tokens": 2780814.0,
"step": 1510
},
{
"entropy": 6.62731146812439,
"epoch": 0.12728418399495905,
"grad_norm": 1.0234375,
"learning_rate": 0.0004999789398960695,
"loss": 6.5737,
"mean_token_accuracy": 0.11791431680321693,
"num_tokens": 2791104.0,
"step": 1515
},
{
"entropy": 6.245316696166992,
"epoch": 0.12770426381012392,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999785281801212,
"loss": 6.2623,
"mean_token_accuracy": 0.12040979042649269,
"num_tokens": 2800081.0,
"step": 1520
},
{
"entropy": 6.359339189529419,
"epoch": 0.1281243436252888,
"grad_norm": 1.09375,
"learning_rate": 0.000499978112478794,
"loss": 6.3902,
"mean_token_accuracy": 0.11980480477213859,
"num_tokens": 2809096.0,
"step": 1525
},
{
"entropy": 6.453446865081787,
"epoch": 0.1285444234404537,
"grad_norm": 1.046875,
"learning_rate": 0.0004999776927920955,
"loss": 6.3617,
"mean_token_accuracy": 0.12184310704469681,
"num_tokens": 2818857.0,
"step": 1530
},
{
"entropy": 6.291106510162353,
"epoch": 0.12896450325561856,
"grad_norm": 1.109375,
"learning_rate": 0.000499977269120033,
"loss": 6.444,
"mean_token_accuracy": 0.11637230366468429,
"num_tokens": 2829332.0,
"step": 1535
},
{
"entropy": 6.427845621109009,
"epoch": 0.12938458307078346,
"grad_norm": 0.97265625,
"learning_rate": 0.000499976841462614,
"loss": 6.3583,
"mean_token_accuracy": 0.11303109228610993,
"num_tokens": 2839193.0,
"step": 1540
},
{
"entropy": 6.396592044830323,
"epoch": 0.12980466288594833,
"grad_norm": 0.92578125,
"learning_rate": 0.000499976409819846,
"loss": 6.34,
"mean_token_accuracy": 0.11379488185048103,
"num_tokens": 2848535.0,
"step": 1545
},
{
"entropy": 6.191351747512817,
"epoch": 0.1302247427011132,
"grad_norm": 0.98828125,
"learning_rate": 0.0004999759741917369,
"loss": 6.2442,
"mean_token_accuracy": 0.1232595019042492,
"num_tokens": 2858090.0,
"step": 1550
},
{
"entropy": 6.410407018661499,
"epoch": 0.1306448225162781,
"grad_norm": 1.1171875,
"learning_rate": 0.0004999755345782941,
"loss": 6.3942,
"mean_token_accuracy": 0.11326258555054665,
"num_tokens": 2866984.0,
"step": 1555
},
{
"entropy": 6.209921407699585,
"epoch": 0.13106490233144297,
"grad_norm": 0.90625,
"learning_rate": 0.0004999750909795256,
"loss": 6.202,
"mean_token_accuracy": 0.12322057262063027,
"num_tokens": 2876550.0,
"step": 1560
},
{
"entropy": 6.351957511901856,
"epoch": 0.13148498214660786,
"grad_norm": 0.9765625,
"learning_rate": 0.0004999746433954394,
"loss": 6.3062,
"mean_token_accuracy": 0.11787799671292305,
"num_tokens": 2885782.0,
"step": 1565
},
{
"entropy": 6.3278861999511715,
"epoch": 0.13190506196177273,
"grad_norm": 1.03125,
"learning_rate": 0.000499974191826043,
"loss": 6.2833,
"mean_token_accuracy": 0.13189474642276763,
"num_tokens": 2894807.0,
"step": 1570
},
{
"entropy": 6.376989316940308,
"epoch": 0.1323251417769376,
"grad_norm": 1.15625,
"learning_rate": 0.0004999737362713448,
"loss": 6.3235,
"mean_token_accuracy": 0.12015982195734978,
"num_tokens": 2904076.0,
"step": 1575
},
{
"entropy": 6.2569879531860355,
"epoch": 0.1327452215921025,
"grad_norm": 1.046875,
"learning_rate": 0.0004999732767313527,
"loss": 6.2239,
"mean_token_accuracy": 0.12142896950244904,
"num_tokens": 2913761.0,
"step": 1580
},
{
"entropy": 6.479147958755493,
"epoch": 0.13316530140726737,
"grad_norm": 1.0859375,
"learning_rate": 0.0004999728132060746,
"loss": 6.4597,
"mean_token_accuracy": 0.1231887847185135,
"num_tokens": 2922848.0,
"step": 1585
},
{
"entropy": 6.33915228843689,
"epoch": 0.13358538122243227,
"grad_norm": 0.91796875,
"learning_rate": 0.0004999723456955192,
"loss": 6.3453,
"mean_token_accuracy": 0.12047107368707657,
"num_tokens": 2932718.0,
"step": 1590
},
{
"entropy": 6.313900423049927,
"epoch": 0.13400546103759714,
"grad_norm": 0.96484375,
"learning_rate": 0.0004999718741996945,
"loss": 6.2846,
"mean_token_accuracy": 0.12055236473679543,
"num_tokens": 2942686.0,
"step": 1595
},
{
"entropy": 6.250068187713623,
"epoch": 0.13442554085276204,
"grad_norm": 1.0234375,
"learning_rate": 0.000499971398718609,
"loss": 6.2541,
"mean_token_accuracy": 0.12233499884605407,
"num_tokens": 2952096.0,
"step": 1600
},
{
"entropy": 6.400825262069702,
"epoch": 0.1348456206679269,
"grad_norm": 0.984375,
"learning_rate": 0.0004999709192522708,
"loss": 6.343,
"mean_token_accuracy": 0.12068295776844025,
"num_tokens": 2960660.0,
"step": 1605
},
{
"entropy": 6.373941278457641,
"epoch": 0.13526570048309178,
"grad_norm": 0.9453125,
"learning_rate": 0.0004999704358006887,
"loss": 6.338,
"mean_token_accuracy": 0.11847807541489601,
"num_tokens": 2969834.0,
"step": 1610
},
{
"entropy": 6.3392332077026365,
"epoch": 0.13568578029825668,
"grad_norm": 1.09375,
"learning_rate": 0.0004999699483638712,
"loss": 6.3224,
"mean_token_accuracy": 0.11975563690066338,
"num_tokens": 2979023.0,
"step": 1615
},
{
"entropy": 6.312718057632447,
"epoch": 0.13610586011342155,
"grad_norm": 1.0234375,
"learning_rate": 0.0004999694569418269,
"loss": 6.3217,
"mean_token_accuracy": 0.12105349376797676,
"num_tokens": 2988083.0,
"step": 1620
},
{
"entropy": 6.298077011108399,
"epoch": 0.13652593992858644,
"grad_norm": 1.015625,
"learning_rate": 0.0004999689615345645,
"loss": 6.2407,
"mean_token_accuracy": 0.12377956956624984,
"num_tokens": 2997240.0,
"step": 1625
},
{
"entropy": 6.40955810546875,
"epoch": 0.1369460197437513,
"grad_norm": 1.046875,
"learning_rate": 0.0004999684621420928,
"loss": 6.325,
"mean_token_accuracy": 0.11809631884098053,
"num_tokens": 3007077.0,
"step": 1630
},
{
"entropy": 6.277155590057373,
"epoch": 0.13736609955891618,
"grad_norm": 0.99609375,
"learning_rate": 0.0004999679587644205,
"loss": 6.3514,
"mean_token_accuracy": 0.117049939930439,
"num_tokens": 3015821.0,
"step": 1635
},
{
"entropy": 6.334466028213501,
"epoch": 0.13778617937408108,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999674514015568,
"loss": 6.27,
"mean_token_accuracy": 0.12460469976067542,
"num_tokens": 3025858.0,
"step": 1640
},
{
"entropy": 6.350087356567383,
"epoch": 0.13820625918924595,
"grad_norm": 0.9921875,
"learning_rate": 0.0004999669400535105,
"loss": 6.246,
"mean_token_accuracy": 0.11693726480007172,
"num_tokens": 3035537.0,
"step": 1645
},
{
"entropy": 6.1635466575622555,
"epoch": 0.13862633900441085,
"grad_norm": 1.140625,
"learning_rate": 0.0004999664247202907,
"loss": 6.1621,
"mean_token_accuracy": 0.12513800859451293,
"num_tokens": 3044204.0,
"step": 1650
},
{
"entropy": 6.405851364135742,
"epoch": 0.13904641881957572,
"grad_norm": 1.0859375,
"learning_rate": 0.0004999659054019066,
"loss": 6.3371,
"mean_token_accuracy": 0.12036163732409477,
"num_tokens": 3053111.0,
"step": 1655
},
{
"entropy": 6.243852519989014,
"epoch": 0.1394664986347406,
"grad_norm": 1.078125,
"learning_rate": 0.0004999653820983673,
"loss": 6.2536,
"mean_token_accuracy": 0.12018982619047165,
"num_tokens": 3062456.0,
"step": 1660
},
{
"entropy": 6.309187984466552,
"epoch": 0.13988657844990549,
"grad_norm": 1.03125,
"learning_rate": 0.000499964854809682,
"loss": 6.2742,
"mean_token_accuracy": 0.12464778944849968,
"num_tokens": 3071132.0,
"step": 1665
},
{
"entropy": 6.253125143051148,
"epoch": 0.14030665826507036,
"grad_norm": 1.0,
"learning_rate": 0.0004999643235358602,
"loss": 6.2296,
"mean_token_accuracy": 0.12636966705322267,
"num_tokens": 3080892.0,
"step": 1670
},
{
"entropy": 6.241448926925659,
"epoch": 0.14072673808023525,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999637882769112,
"loss": 6.166,
"mean_token_accuracy": 0.129159078001976,
"num_tokens": 3089874.0,
"step": 1675
},
{
"entropy": 6.321546411514282,
"epoch": 0.14114681789540012,
"grad_norm": 0.9375,
"learning_rate": 0.0004999632490328447,
"loss": 6.3087,
"mean_token_accuracy": 0.12358235269784927,
"num_tokens": 3099535.0,
"step": 1680
},
{
"entropy": 6.297367906570434,
"epoch": 0.14156689771056502,
"grad_norm": 0.9609375,
"learning_rate": 0.0004999627058036699,
"loss": 6.2544,
"mean_token_accuracy": 0.11949014514684678,
"num_tokens": 3108772.0,
"step": 1685
},
{
"entropy": 6.318113327026367,
"epoch": 0.1419869775257299,
"grad_norm": 1.03125,
"learning_rate": 0.0004999621585893966,
"loss": 6.2875,
"mean_token_accuracy": 0.11683647856116294,
"num_tokens": 3118333.0,
"step": 1690
},
{
"entropy": 6.319453144073487,
"epoch": 0.14240705734089476,
"grad_norm": 1.0625,
"learning_rate": 0.0004999616073900346,
"loss": 6.3252,
"mean_token_accuracy": 0.12005885392427444,
"num_tokens": 3127356.0,
"step": 1695
},
{
"entropy": 6.334293079376221,
"epoch": 0.14282713715605966,
"grad_norm": 1.09375,
"learning_rate": 0.0004999610522055935,
"loss": 6.2875,
"mean_token_accuracy": 0.11621066182851791,
"num_tokens": 3136859.0,
"step": 1700
},
{
"entropy": 6.296079492568969,
"epoch": 0.14324721697122453,
"grad_norm": 1.0234375,
"learning_rate": 0.0004999604930360832,
"loss": 6.3178,
"mean_token_accuracy": 0.11642780154943466,
"num_tokens": 3146607.0,
"step": 1705
},
{
"entropy": 6.24936580657959,
"epoch": 0.14366729678638943,
"grad_norm": 0.9453125,
"learning_rate": 0.0004999599298815136,
"loss": 6.2668,
"mean_token_accuracy": 0.12453223988413811,
"num_tokens": 3156327.0,
"step": 1710
},
{
"entropy": 6.249935483932495,
"epoch": 0.1440873766015543,
"grad_norm": 1.625,
"learning_rate": 0.0004999593627418947,
"loss": 6.203,
"mean_token_accuracy": 0.12521106824278833,
"num_tokens": 3165559.0,
"step": 1715
},
{
"entropy": 6.336143445968628,
"epoch": 0.14450745641671917,
"grad_norm": 1.0625,
"learning_rate": 0.0004999587916172365,
"loss": 6.3011,
"mean_token_accuracy": 0.1152021661400795,
"num_tokens": 3173850.0,
"step": 1720
},
{
"entropy": 6.285958003997803,
"epoch": 0.14492753623188406,
"grad_norm": 1.015625,
"learning_rate": 0.0004999582165075492,
"loss": 6.2465,
"mean_token_accuracy": 0.115229881554842,
"num_tokens": 3182838.0,
"step": 1725
},
{
"entropy": 6.184572982788086,
"epoch": 0.14534761604704893,
"grad_norm": 1.046875,
"learning_rate": 0.0004999576374128429,
"loss": 6.237,
"mean_token_accuracy": 0.12389757782220841,
"num_tokens": 3191692.0,
"step": 1730
},
{
"entropy": 6.397026300430298,
"epoch": 0.14576769586221383,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999570543331279,
"loss": 6.2806,
"mean_token_accuracy": 0.12041986957192422,
"num_tokens": 3200069.0,
"step": 1735
},
{
"entropy": 6.260718488693238,
"epoch": 0.1461877756773787,
"grad_norm": 1.140625,
"learning_rate": 0.0004999564672684145,
"loss": 6.3438,
"mean_token_accuracy": 0.1193387195467949,
"num_tokens": 3209653.0,
"step": 1740
},
{
"entropy": 6.393806028366089,
"epoch": 0.14660785549254357,
"grad_norm": 1.03125,
"learning_rate": 0.0004999558762187131,
"loss": 6.217,
"mean_token_accuracy": 0.12818640992045402,
"num_tokens": 3218313.0,
"step": 1745
},
{
"entropy": 6.188047790527344,
"epoch": 0.14702793530770847,
"grad_norm": 1.03125,
"learning_rate": 0.0004999552811840342,
"loss": 6.1623,
"mean_token_accuracy": 0.12572802156209945,
"num_tokens": 3227525.0,
"step": 1750
},
{
"entropy": 6.254945421218872,
"epoch": 0.14744801512287334,
"grad_norm": 0.9609375,
"learning_rate": 0.0004999546821643884,
"loss": 6.275,
"mean_token_accuracy": 0.1252661019563675,
"num_tokens": 3237022.0,
"step": 1755
},
{
"entropy": 6.239528560638428,
"epoch": 0.14786809493803824,
"grad_norm": 1.0,
"learning_rate": 0.0004999540791597861,
"loss": 6.1635,
"mean_token_accuracy": 0.1251967169344425,
"num_tokens": 3246605.0,
"step": 1760
},
{
"entropy": 6.140348815917969,
"epoch": 0.1482881747532031,
"grad_norm": 1.046875,
"learning_rate": 0.0004999534721702383,
"loss": 6.1328,
"mean_token_accuracy": 0.12913108766078948,
"num_tokens": 3255587.0,
"step": 1765
},
{
"entropy": 6.292690563201904,
"epoch": 0.148708254568368,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999528611957553,
"loss": 6.2117,
"mean_token_accuracy": 0.1271899625658989,
"num_tokens": 3265669.0,
"step": 1770
},
{
"entropy": 6.244245195388794,
"epoch": 0.14912833438353287,
"grad_norm": 1.09375,
"learning_rate": 0.0004999522462363485,
"loss": 6.2107,
"mean_token_accuracy": 0.12735746875405313,
"num_tokens": 3275013.0,
"step": 1775
},
{
"entropy": 6.274697399139404,
"epoch": 0.14954841419869774,
"grad_norm": 0.95703125,
"learning_rate": 0.0004999516272920283,
"loss": 6.3057,
"mean_token_accuracy": 0.12256114035844803,
"num_tokens": 3284723.0,
"step": 1780
},
{
"entropy": 6.149048852920532,
"epoch": 0.14996849401386264,
"grad_norm": 0.98828125,
"learning_rate": 0.000499951004362806,
"loss": 6.148,
"mean_token_accuracy": 0.12680203318595887,
"num_tokens": 3293860.0,
"step": 1785
},
{
"entropy": 6.166975450515747,
"epoch": 0.1503885738290275,
"grad_norm": 1.0078125,
"learning_rate": 0.0004999503774486924,
"loss": 6.1912,
"mean_token_accuracy": 0.12579060941934586,
"num_tokens": 3303158.0,
"step": 1790
},
{
"entropy": 6.141963243484497,
"epoch": 0.1508086536441924,
"grad_norm": 0.99609375,
"learning_rate": 0.0004999497465496987,
"loss": 6.127,
"mean_token_accuracy": 0.11945450827479362,
"num_tokens": 3313068.0,
"step": 1795
},
{
"entropy": 6.272381019592285,
"epoch": 0.15122873345935728,
"grad_norm": 1.0546875,
"learning_rate": 0.000499949111665836,
"loss": 6.2113,
"mean_token_accuracy": 0.12545057907700538,
"num_tokens": 3321885.0,
"step": 1800
},
{
"entropy": 6.260741281509399,
"epoch": 0.15164881327452215,
"grad_norm": 1.0078125,
"learning_rate": 0.0004999484727971158,
"loss": 6.2004,
"mean_token_accuracy": 0.1255272276699543,
"num_tokens": 3330924.0,
"step": 1805
},
{
"entropy": 6.217929172515869,
"epoch": 0.15206889308968705,
"grad_norm": 0.96875,
"learning_rate": 0.000499947829943549,
"loss": 6.2325,
"mean_token_accuracy": 0.12151647359132767,
"num_tokens": 3340070.0,
"step": 1810
},
{
"entropy": 6.26392617225647,
"epoch": 0.15248897290485192,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999471831051474,
"loss": 6.2282,
"mean_token_accuracy": 0.1356467515230179,
"num_tokens": 3349870.0,
"step": 1815
},
{
"entropy": 6.306232357025147,
"epoch": 0.1529090527200168,
"grad_norm": 0.99609375,
"learning_rate": 0.0004999465322819222,
"loss": 6.2642,
"mean_token_accuracy": 0.12093094363808632,
"num_tokens": 3359573.0,
"step": 1820
},
{
"entropy": 6.253879976272583,
"epoch": 0.15332913253518168,
"grad_norm": 1.03125,
"learning_rate": 0.0004999458774738851,
"loss": 6.2008,
"mean_token_accuracy": 0.1337040476500988,
"num_tokens": 3368577.0,
"step": 1825
},
{
"entropy": 6.189093828201294,
"epoch": 0.15374921235034655,
"grad_norm": 1.046875,
"learning_rate": 0.0004999452186810476,
"loss": 6.1859,
"mean_token_accuracy": 0.12909814566373826,
"num_tokens": 3377801.0,
"step": 1830
},
{
"entropy": 6.290127277374268,
"epoch": 0.15416929216551145,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999445559034214,
"loss": 6.2251,
"mean_token_accuracy": 0.13048515170812608,
"num_tokens": 3386666.0,
"step": 1835
},
{
"entropy": 6.365573501586914,
"epoch": 0.15458937198067632,
"grad_norm": 1.015625,
"learning_rate": 0.0004999438891410181,
"loss": 6.364,
"mean_token_accuracy": 0.11682043001055717,
"num_tokens": 3396086.0,
"step": 1840
},
{
"entropy": 6.219829654693603,
"epoch": 0.15500945179584122,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999432183938496,
"loss": 6.2847,
"mean_token_accuracy": 0.12174131944775582,
"num_tokens": 3404894.0,
"step": 1845
},
{
"entropy": 6.1805830001831055,
"epoch": 0.1554295316110061,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999425436619279,
"loss": 6.2621,
"mean_token_accuracy": 0.119705418497324,
"num_tokens": 3414172.0,
"step": 1850
},
{
"entropy": 6.348053646087647,
"epoch": 0.15584961142617096,
"grad_norm": 0.9453125,
"learning_rate": 0.000499941864945265,
"loss": 6.2477,
"mean_token_accuracy": 0.11934950053691865,
"num_tokens": 3423409.0,
"step": 1855
},
{
"entropy": 6.174108409881592,
"epoch": 0.15626969124133586,
"grad_norm": 0.98828125,
"learning_rate": 0.0004999411822438726,
"loss": 6.1935,
"mean_token_accuracy": 0.12505294382572174,
"num_tokens": 3433047.0,
"step": 1860
},
{
"entropy": 6.257612848281861,
"epoch": 0.15668977105650073,
"grad_norm": 1.109375,
"learning_rate": 0.000499940495557763,
"loss": 6.178,
"mean_token_accuracy": 0.12779488489031793,
"num_tokens": 3442490.0,
"step": 1865
},
{
"entropy": 6.248907375335693,
"epoch": 0.15710985087166562,
"grad_norm": 1.0234375,
"learning_rate": 0.0004999398048869485,
"loss": 6.2472,
"mean_token_accuracy": 0.12368729263544083,
"num_tokens": 3451804.0,
"step": 1870
},
{
"entropy": 6.3145753860473635,
"epoch": 0.1575299306868305,
"grad_norm": 1.015625,
"learning_rate": 0.000499939110231441,
"loss": 6.2386,
"mean_token_accuracy": 0.1283128082752228,
"num_tokens": 3461481.0,
"step": 1875
},
{
"entropy": 6.25744366645813,
"epoch": 0.1579500105019954,
"grad_norm": 1.1328125,
"learning_rate": 0.0004999384115912531,
"loss": 6.2722,
"mean_token_accuracy": 0.12746385112404823,
"num_tokens": 3471798.0,
"step": 1880
},
{
"entropy": 6.1273274421691895,
"epoch": 0.15837009031716026,
"grad_norm": 0.97265625,
"learning_rate": 0.000499937708966397,
"loss": 6.1695,
"mean_token_accuracy": 0.1235118955373764,
"num_tokens": 3481386.0,
"step": 1885
},
{
"entropy": 6.278209686279297,
"epoch": 0.15879017013232513,
"grad_norm": 1.0,
"learning_rate": 0.0004999370023568853,
"loss": 6.1682,
"mean_token_accuracy": 0.1265706330537796,
"num_tokens": 3489981.0,
"step": 1890
},
{
"entropy": 6.165022706985473,
"epoch": 0.15921024994749003,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999362917627304,
"loss": 6.1462,
"mean_token_accuracy": 0.12947175428271293,
"num_tokens": 3498551.0,
"step": 1895
},
{
"entropy": 6.203568363189698,
"epoch": 0.1596303297626549,
"grad_norm": 1.0625,
"learning_rate": 0.0004999355771839448,
"loss": 6.1261,
"mean_token_accuracy": 0.12978117987513543,
"num_tokens": 3507921.0,
"step": 1900
},
{
"entropy": 6.325990772247314,
"epoch": 0.1600504095778198,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999348586205414,
"loss": 6.3032,
"mean_token_accuracy": 0.1252683699131012,
"num_tokens": 3517570.0,
"step": 1905
},
{
"entropy": 6.316230726242066,
"epoch": 0.16047048939298467,
"grad_norm": 1.1015625,
"learning_rate": 0.0004999341360725327,
"loss": 6.2873,
"mean_token_accuracy": 0.1217451848089695,
"num_tokens": 3526774.0,
"step": 1910
},
{
"entropy": 6.25418004989624,
"epoch": 0.16089056920814954,
"grad_norm": 1.0859375,
"learning_rate": 0.0004999334095399317,
"loss": 6.2253,
"mean_token_accuracy": 0.13255979344248772,
"num_tokens": 3535319.0,
"step": 1915
},
{
"entropy": 6.0919859409332275,
"epoch": 0.16131064902331443,
"grad_norm": 1.0703125,
"learning_rate": 0.0004999326790227512,
"loss": 6.192,
"mean_token_accuracy": 0.12934273406863211,
"num_tokens": 3544468.0,
"step": 1920
},
{
"entropy": 6.13924469947815,
"epoch": 0.1617307288384793,
"grad_norm": 0.98828125,
"learning_rate": 0.0004999319445210041,
"loss": 6.0772,
"mean_token_accuracy": 0.1331401713192463,
"num_tokens": 3553529.0,
"step": 1925
},
{
"entropy": 6.140831708908081,
"epoch": 0.1621508086536442,
"grad_norm": 0.984375,
"learning_rate": 0.0004999312060347034,
"loss": 6.1169,
"mean_token_accuracy": 0.128947152197361,
"num_tokens": 3563053.0,
"step": 1930
},
{
"entropy": 6.201217079162598,
"epoch": 0.16257088846880907,
"grad_norm": 0.97265625,
"learning_rate": 0.0004999304635638621,
"loss": 6.0724,
"mean_token_accuracy": 0.1313454084098339,
"num_tokens": 3571877.0,
"step": 1935
},
{
"entropy": 6.137188291549682,
"epoch": 0.16299096828397394,
"grad_norm": 0.9140625,
"learning_rate": 0.0004999297171084935,
"loss": 6.1144,
"mean_token_accuracy": 0.13178201764822006,
"num_tokens": 3581496.0,
"step": 1940
},
{
"entropy": 6.274984455108642,
"epoch": 0.16341104809913884,
"grad_norm": 0.984375,
"learning_rate": 0.0004999289666686109,
"loss": 6.1397,
"mean_token_accuracy": 0.12548287436366082,
"num_tokens": 3590752.0,
"step": 1945
},
{
"entropy": 6.027514791488647,
"epoch": 0.1638311279143037,
"grad_norm": 0.9765625,
"learning_rate": 0.0004999282122442274,
"loss": 6.1413,
"mean_token_accuracy": 0.12893687859177588,
"num_tokens": 3599885.0,
"step": 1950
},
{
"entropy": 6.314913415908814,
"epoch": 0.1642512077294686,
"grad_norm": 0.953125,
"learning_rate": 0.0004999274538353564,
"loss": 6.225,
"mean_token_accuracy": 0.12287019938230515,
"num_tokens": 3610039.0,
"step": 1955
},
{
"entropy": 6.137080287933349,
"epoch": 0.16467128754463348,
"grad_norm": 1.046875,
"learning_rate": 0.0004999266914420114,
"loss": 6.1398,
"mean_token_accuracy": 0.12600617855787277,
"num_tokens": 3619954.0,
"step": 1960
},
{
"entropy": 6.19411187171936,
"epoch": 0.16509136735979837,
"grad_norm": 1.0625,
"learning_rate": 0.000499925925064206,
"loss": 6.1087,
"mean_token_accuracy": 0.13167392686009408,
"num_tokens": 3628164.0,
"step": 1965
},
{
"entropy": 6.276717853546143,
"epoch": 0.16551144717496324,
"grad_norm": 1.0,
"learning_rate": 0.0004999251547019535,
"loss": 6.2662,
"mean_token_accuracy": 0.12937605381011963,
"num_tokens": 3636778.0,
"step": 1970
},
{
"entropy": 6.321251440048218,
"epoch": 0.16593152699012811,
"grad_norm": 1.0,
"learning_rate": 0.0004999243803552678,
"loss": 6.2031,
"mean_token_accuracy": 0.12865082323551177,
"num_tokens": 3647046.0,
"step": 1975
},
{
"entropy": 6.092304801940918,
"epoch": 0.166351606805293,
"grad_norm": 1.09375,
"learning_rate": 0.0004999236020241625,
"loss": 6.1208,
"mean_token_accuracy": 0.12650337740778922,
"num_tokens": 3656130.0,
"step": 1980
},
{
"entropy": 6.216774225234985,
"epoch": 0.16677168662045788,
"grad_norm": 1.046875,
"learning_rate": 0.0004999228197086514,
"loss": 6.1975,
"mean_token_accuracy": 0.11985883414745331,
"num_tokens": 3666145.0,
"step": 1985
},
{
"entropy": 6.222474908828735,
"epoch": 0.16719176643562278,
"grad_norm": 0.8984375,
"learning_rate": 0.0004999220334087484,
"loss": 6.2471,
"mean_token_accuracy": 0.1268869273364544,
"num_tokens": 3676722.0,
"step": 1990
},
{
"entropy": 6.2897861957550045,
"epoch": 0.16761184625078765,
"grad_norm": 1.0,
"learning_rate": 0.0004999212431244673,
"loss": 6.2493,
"mean_token_accuracy": 0.1202566534280777,
"num_tokens": 3685880.0,
"step": 1995
},
{
"entropy": 6.107799291610718,
"epoch": 0.16803192606595252,
"grad_norm": 0.9921875,
"learning_rate": 0.0004999204488558222,
"loss": 6.0634,
"mean_token_accuracy": 0.13050426542758942,
"num_tokens": 3695167.0,
"step": 2000
},
{
"entropy": 6.197722768783569,
"epoch": 0.16845200588111742,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999196506028273,
"loss": 6.183,
"mean_token_accuracy": 0.1260111801326275,
"num_tokens": 3703700.0,
"step": 2005
},
{
"entropy": 6.235522031784058,
"epoch": 0.1688720856962823,
"grad_norm": 1.1015625,
"learning_rate": 0.0004999188483654965,
"loss": 6.1217,
"mean_token_accuracy": 0.1260783888399601,
"num_tokens": 3712825.0,
"step": 2010
},
{
"entropy": 6.099165439605713,
"epoch": 0.16929216551144718,
"grad_norm": 0.953125,
"learning_rate": 0.0004999180421438442,
"loss": 6.089,
"mean_token_accuracy": 0.12902323082089423,
"num_tokens": 3721807.0,
"step": 2015
},
{
"entropy": 6.248017930984497,
"epoch": 0.16971224532661205,
"grad_norm": 1.09375,
"learning_rate": 0.0004999172319378846,
"loss": 6.2757,
"mean_token_accuracy": 0.12072234824299813,
"num_tokens": 3730502.0,
"step": 2020
},
{
"entropy": 6.263659858703614,
"epoch": 0.17013232514177692,
"grad_norm": 1.0078125,
"learning_rate": 0.0004999164177476319,
"loss": 6.149,
"mean_token_accuracy": 0.12994196712970735,
"num_tokens": 3739696.0,
"step": 2025
},
{
"entropy": 6.063019037246704,
"epoch": 0.17055240495694182,
"grad_norm": 1.0859375,
"learning_rate": 0.0004999155995731009,
"loss": 6.1537,
"mean_token_accuracy": 0.13063850849866868,
"num_tokens": 3748675.0,
"step": 2030
},
{
"entropy": 6.347209215164185,
"epoch": 0.1709724847721067,
"grad_norm": 1.046875,
"learning_rate": 0.0004999147774143057,
"loss": 6.233,
"mean_token_accuracy": 0.12277846410870552,
"num_tokens": 3757714.0,
"step": 2035
},
{
"entropy": 6.09723424911499,
"epoch": 0.1713925645872716,
"grad_norm": 0.984375,
"learning_rate": 0.000499913951271261,
"loss": 6.0474,
"mean_token_accuracy": 0.13144370764493943,
"num_tokens": 3767589.0,
"step": 2040
},
{
"entropy": 6.220744895935058,
"epoch": 0.17181264440243646,
"grad_norm": 1.1640625,
"learning_rate": 0.0004999131211439816,
"loss": 6.1603,
"mean_token_accuracy": 0.12925415337085724,
"num_tokens": 3777261.0,
"step": 2045
},
{
"entropy": 6.157608842849731,
"epoch": 0.17223272421760136,
"grad_norm": 1.0390625,
"learning_rate": 0.000499912287032482,
"loss": 6.1136,
"mean_token_accuracy": 0.13876328021287918,
"num_tokens": 3786658.0,
"step": 2050
},
{
"entropy": 6.097555351257324,
"epoch": 0.17265280403276623,
"grad_norm": 1.0703125,
"learning_rate": 0.000499911448936777,
"loss": 6.1039,
"mean_token_accuracy": 0.13591505512595176,
"num_tokens": 3794977.0,
"step": 2055
},
{
"entropy": 6.0962035179138185,
"epoch": 0.1730728838479311,
"grad_norm": 0.953125,
"learning_rate": 0.0004999106068568816,
"loss": 6.1794,
"mean_token_accuracy": 0.12809087112545967,
"num_tokens": 3805138.0,
"step": 2060
},
{
"entropy": 6.231352376937866,
"epoch": 0.173492963663096,
"grad_norm": 1.015625,
"learning_rate": 0.0004999097607928106,
"loss": 6.1206,
"mean_token_accuracy": 0.1363896384835243,
"num_tokens": 3814444.0,
"step": 2065
},
{
"entropy": 6.190911483764649,
"epoch": 0.17391304347826086,
"grad_norm": 1.0234375,
"learning_rate": 0.0004999089107445788,
"loss": 6.0912,
"mean_token_accuracy": 0.12825695872306825,
"num_tokens": 3822859.0,
"step": 2070
},
{
"entropy": 6.055686283111572,
"epoch": 0.17433312329342576,
"grad_norm": 0.97265625,
"learning_rate": 0.0004999080567122016,
"loss": 6.1054,
"mean_token_accuracy": 0.12722248658537866,
"num_tokens": 3833159.0,
"step": 2075
},
{
"entropy": 6.126674318313599,
"epoch": 0.17475320310859063,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999071986956941,
"loss": 6.1145,
"mean_token_accuracy": 0.13205313310027122,
"num_tokens": 3842136.0,
"step": 2080
},
{
"entropy": 6.132652282714844,
"epoch": 0.1751732829237555,
"grad_norm": 1.046875,
"learning_rate": 0.0004999063366950713,
"loss": 6.1975,
"mean_token_accuracy": 0.12595101371407508,
"num_tokens": 3851406.0,
"step": 2085
},
{
"entropy": 6.16230001449585,
"epoch": 0.1755933627389204,
"grad_norm": 1.015625,
"learning_rate": 0.0004999054707103486,
"loss": 6.1031,
"mean_token_accuracy": 0.1279636099934578,
"num_tokens": 3861061.0,
"step": 2090
},
{
"entropy": 6.178242540359497,
"epoch": 0.17601344255408527,
"grad_norm": 1.0546875,
"learning_rate": 0.0004999046007415412,
"loss": 6.0821,
"mean_token_accuracy": 0.12796319127082825,
"num_tokens": 3870357.0,
"step": 2095
},
{
"entropy": 6.1904627799987795,
"epoch": 0.17643352236925017,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999037267886646,
"loss": 6.1006,
"mean_token_accuracy": 0.13064605742692947,
"num_tokens": 3879393.0,
"step": 2100
},
{
"entropy": 6.076858758926392,
"epoch": 0.17685360218441504,
"grad_norm": 1.078125,
"learning_rate": 0.0004999028488517343,
"loss": 6.1097,
"mean_token_accuracy": 0.1291967511177063,
"num_tokens": 3888030.0,
"step": 2105
},
{
"entropy": 6.22084493637085,
"epoch": 0.1772736819995799,
"grad_norm": 1.0390625,
"learning_rate": 0.0004999019669307659,
"loss": 6.1189,
"mean_token_accuracy": 0.1333627261221409,
"num_tokens": 3897430.0,
"step": 2110
},
{
"entropy": 6.15262508392334,
"epoch": 0.1776937618147448,
"grad_norm": 0.96484375,
"learning_rate": 0.0004999010810257749,
"loss": 6.1461,
"mean_token_accuracy": 0.1225900873541832,
"num_tokens": 3907711.0,
"step": 2115
},
{
"entropy": 6.100615692138672,
"epoch": 0.17811384162990967,
"grad_norm": 1.03125,
"learning_rate": 0.0004999001911367771,
"loss": 6.0784,
"mean_token_accuracy": 0.13617549166083337,
"num_tokens": 3915816.0,
"step": 2120
},
{
"entropy": 6.130948638916015,
"epoch": 0.17853392144507457,
"grad_norm": 1.0078125,
"learning_rate": 0.0004998992972637883,
"loss": 6.2002,
"mean_token_accuracy": 0.12154756337404252,
"num_tokens": 3925162.0,
"step": 2125
},
{
"entropy": 6.210935020446778,
"epoch": 0.17895400126023944,
"grad_norm": 1.0078125,
"learning_rate": 0.0004998983994068242,
"loss": 6.0874,
"mean_token_accuracy": 0.1311741665005684,
"num_tokens": 3934476.0,
"step": 2130
},
{
"entropy": 6.098375844955444,
"epoch": 0.17937408107540434,
"grad_norm": 0.94921875,
"learning_rate": 0.0004998974975659006,
"loss": 6.1351,
"mean_token_accuracy": 0.12713489457964897,
"num_tokens": 3943501.0,
"step": 2135
},
{
"entropy": 6.198156356811523,
"epoch": 0.1797941608905692,
"grad_norm": 0.984375,
"learning_rate": 0.0004998965917410338,
"loss": 6.1279,
"mean_token_accuracy": 0.12831434607505798,
"num_tokens": 3953663.0,
"step": 2140
},
{
"entropy": 6.133723402023316,
"epoch": 0.18021424070573408,
"grad_norm": 1.0390625,
"learning_rate": 0.0004998956819322397,
"loss": 6.088,
"mean_token_accuracy": 0.12946128249168395,
"num_tokens": 3962634.0,
"step": 2145
},
{
"entropy": 6.1390259742736815,
"epoch": 0.18063432052089898,
"grad_norm": 1.0,
"learning_rate": 0.0004998947681395343,
"loss": 6.0855,
"mean_token_accuracy": 0.13397737592458725,
"num_tokens": 3972496.0,
"step": 2150
},
{
"entropy": 6.254479598999024,
"epoch": 0.18105440033606385,
"grad_norm": 1.0546875,
"learning_rate": 0.000499893850362934,
"loss": 6.3337,
"mean_token_accuracy": 0.12332669869065285,
"num_tokens": 3980724.0,
"step": 2155
},
{
"entropy": 6.2081263065338135,
"epoch": 0.18147448015122875,
"grad_norm": 1.0234375,
"learning_rate": 0.0004998929286024548,
"loss": 6.1709,
"mean_token_accuracy": 0.12541203945875168,
"num_tokens": 3989842.0,
"step": 2160
},
{
"entropy": 6.17150354385376,
"epoch": 0.18189455996639362,
"grad_norm": 1.109375,
"learning_rate": 0.0004998920028581133,
"loss": 6.0848,
"mean_token_accuracy": 0.1349337212741375,
"num_tokens": 3998534.0,
"step": 2165
},
{
"entropy": 6.159293746948242,
"epoch": 0.18231463978155849,
"grad_norm": 1.0,
"learning_rate": 0.0004998910731299258,
"loss": 6.0963,
"mean_token_accuracy": 0.12547213733196258,
"num_tokens": 4007677.0,
"step": 2170
},
{
"entropy": 6.151889276504517,
"epoch": 0.18273471959672338,
"grad_norm": 1.03125,
"learning_rate": 0.0004998901394179085,
"loss": 6.1632,
"mean_token_accuracy": 0.12913861274719238,
"num_tokens": 4016347.0,
"step": 2175
},
{
"entropy": 6.125647306442261,
"epoch": 0.18315479941188825,
"grad_norm": 1.109375,
"learning_rate": 0.0004998892017220784,
"loss": 6.0392,
"mean_token_accuracy": 0.13342646807432174,
"num_tokens": 4025199.0,
"step": 2180
},
{
"entropy": 6.153134059906006,
"epoch": 0.18357487922705315,
"grad_norm": 1.125,
"learning_rate": 0.0004998882600424519,
"loss": 6.0961,
"mean_token_accuracy": 0.12564898803830146,
"num_tokens": 4033933.0,
"step": 2185
},
{
"entropy": 6.154629516601562,
"epoch": 0.18399495904221802,
"grad_norm": 1.09375,
"learning_rate": 0.0004998873143790455,
"loss": 6.0291,
"mean_token_accuracy": 0.13878689035773278,
"num_tokens": 4042891.0,
"step": 2190
},
{
"entropy": 6.129179048538208,
"epoch": 0.1844150388573829,
"grad_norm": 1.03125,
"learning_rate": 0.0004998863647318763,
"loss": 6.1413,
"mean_token_accuracy": 0.1272033281624317,
"num_tokens": 4051123.0,
"step": 2195
},
{
"entropy": 6.1010294437408445,
"epoch": 0.1848351186725478,
"grad_norm": 1.1328125,
"learning_rate": 0.0004998854111009608,
"loss": 6.1152,
"mean_token_accuracy": 0.12936600148677826,
"num_tokens": 4060025.0,
"step": 2200
},
{
"entropy": 6.11760630607605,
"epoch": 0.18525519848771266,
"grad_norm": 0.94921875,
"learning_rate": 0.0004998844534863161,
"loss": 6.0205,
"mean_token_accuracy": 0.12755625769495965,
"num_tokens": 4069363.0,
"step": 2205
},
{
"entropy": 6.150998878479004,
"epoch": 0.18567527830287756,
"grad_norm": 0.99609375,
"learning_rate": 0.0004998834918879592,
"loss": 6.1697,
"mean_token_accuracy": 0.1331343524158001,
"num_tokens": 4078855.0,
"step": 2210
},
{
"entropy": 6.200693273544312,
"epoch": 0.18609535811804243,
"grad_norm": 0.94921875,
"learning_rate": 0.000499882526305907,
"loss": 6.1425,
"mean_token_accuracy": 0.12896015048027037,
"num_tokens": 4087801.0,
"step": 2215
},
{
"entropy": 6.137786483764648,
"epoch": 0.18651543793320732,
"grad_norm": 0.99609375,
"learning_rate": 0.0004998815567401765,
"loss": 6.1525,
"mean_token_accuracy": 0.12895300164818763,
"num_tokens": 4096949.0,
"step": 2220
},
{
"entropy": 6.203073024749756,
"epoch": 0.1869355177483722,
"grad_norm": 1.0546875,
"learning_rate": 0.0004998805831907851,
"loss": 6.1034,
"mean_token_accuracy": 0.1270811975002289,
"num_tokens": 4105399.0,
"step": 2225
},
{
"entropy": 6.1230597496032715,
"epoch": 0.18735559756353706,
"grad_norm": 1.0390625,
"learning_rate": 0.0004998796056577501,
"loss": 6.0488,
"mean_token_accuracy": 0.12729625552892684,
"num_tokens": 4113873.0,
"step": 2230
},
{
"entropy": 6.073714399337769,
"epoch": 0.18777567737870196,
"grad_norm": 0.9765625,
"learning_rate": 0.0004998786241410886,
"loss": 6.1026,
"mean_token_accuracy": 0.13166192471981047,
"num_tokens": 4123528.0,
"step": 2235
},
{
"entropy": 6.234827375411987,
"epoch": 0.18819575719386683,
"grad_norm": 0.96484375,
"learning_rate": 0.000499877638640818,
"loss": 6.1114,
"mean_token_accuracy": 0.12597778365015982,
"num_tokens": 4133370.0,
"step": 2240
},
{
"entropy": 6.071894741058349,
"epoch": 0.18861583700903173,
"grad_norm": 0.98828125,
"learning_rate": 0.000499876649156956,
"loss": 6.028,
"mean_token_accuracy": 0.1308871813118458,
"num_tokens": 4142370.0,
"step": 2245
},
{
"entropy": 6.07040696144104,
"epoch": 0.1890359168241966,
"grad_norm": 1.0234375,
"learning_rate": 0.0004998756556895196,
"loss": 6.1178,
"mean_token_accuracy": 0.13069864958524705,
"num_tokens": 4152367.0,
"step": 2250
},
{
"entropy": 6.167872524261474,
"epoch": 0.18945599663936147,
"grad_norm": 1.03125,
"learning_rate": 0.000499874658238527,
"loss": 6.1024,
"mean_token_accuracy": 0.13190473541617392,
"num_tokens": 4161126.0,
"step": 2255
},
{
"entropy": 6.139232063293457,
"epoch": 0.18987607645452637,
"grad_norm": 1.0546875,
"learning_rate": 0.0004998736568039957,
"loss": 5.9976,
"mean_token_accuracy": 0.13119693994522094,
"num_tokens": 4169910.0,
"step": 2260
},
{
"entropy": 6.1185729026794435,
"epoch": 0.19029615626969124,
"grad_norm": 1.0234375,
"learning_rate": 0.0004998726513859432,
"loss": 6.1461,
"mean_token_accuracy": 0.12717667669057847,
"num_tokens": 4179893.0,
"step": 2265
},
{
"entropy": 6.202062654495239,
"epoch": 0.19071623608485613,
"grad_norm": 0.94921875,
"learning_rate": 0.0004998716419843875,
"loss": 6.1633,
"mean_token_accuracy": 0.13450514376163483,
"num_tokens": 4190065.0,
"step": 2270
},
{
"entropy": 6.053268957138061,
"epoch": 0.191136315900021,
"grad_norm": 1.0859375,
"learning_rate": 0.0004998706285993465,
"loss": 6.0722,
"mean_token_accuracy": 0.12834293991327286,
"num_tokens": 4198395.0,
"step": 2275
},
{
"entropy": 6.130187177658081,
"epoch": 0.19155639571518587,
"grad_norm": 0.99609375,
"learning_rate": 0.0004998696112308381,
"loss": 6.0944,
"mean_token_accuracy": 0.12858275175094605,
"num_tokens": 4207555.0,
"step": 2280
},
{
"entropy": 6.074777889251709,
"epoch": 0.19197647553035077,
"grad_norm": 0.984375,
"learning_rate": 0.0004998685898788803,
"loss": 6.0485,
"mean_token_accuracy": 0.13106716349720954,
"num_tokens": 4216533.0,
"step": 2285
},
{
"entropy": 6.191452217102051,
"epoch": 0.19239655534551564,
"grad_norm": 1.1328125,
"learning_rate": 0.0004998675645434914,
"loss": 6.1523,
"mean_token_accuracy": 0.13225150257349014,
"num_tokens": 4225575.0,
"step": 2290
},
{
"entropy": 6.0228188037872314,
"epoch": 0.19281663516068054,
"grad_norm": 1.0546875,
"learning_rate": 0.0004998665352246891,
"loss": 5.9361,
"mean_token_accuracy": 0.13841283321380615,
"num_tokens": 4234306.0,
"step": 2295
},
{
"entropy": 6.043151473999023,
"epoch": 0.1932367149758454,
"grad_norm": 1.015625,
"learning_rate": 0.0004998655019224921,
"loss": 6.1283,
"mean_token_accuracy": 0.13190191760659217,
"num_tokens": 4243998.0,
"step": 2300
},
{
"entropy": 6.166877937316895,
"epoch": 0.19365679479101028,
"grad_norm": 1.0234375,
"learning_rate": 0.0004998644646369185,
"loss": 6.0139,
"mean_token_accuracy": 0.12847840487957002,
"num_tokens": 4253653.0,
"step": 2305
},
{
"entropy": 6.034109115600586,
"epoch": 0.19407687460617518,
"grad_norm": 1.046875,
"learning_rate": 0.0004998634233679865,
"loss": 6.0949,
"mean_token_accuracy": 0.12612878382205964,
"num_tokens": 4263305.0,
"step": 2310
},
{
"entropy": 6.1194260597229,
"epoch": 0.19449695442134005,
"grad_norm": 1.046875,
"learning_rate": 0.000499862378115715,
"loss": 5.9818,
"mean_token_accuracy": 0.13570686057209969,
"num_tokens": 4272212.0,
"step": 2315
},
{
"entropy": 6.182863759994507,
"epoch": 0.19491703423650494,
"grad_norm": 1.0859375,
"learning_rate": 0.0004998613288801221,
"loss": 6.1959,
"mean_token_accuracy": 0.1276652343571186,
"num_tokens": 4281445.0,
"step": 2320
},
{
"entropy": 6.213861799240112,
"epoch": 0.1953371140516698,
"grad_norm": 0.9609375,
"learning_rate": 0.0004998602756612267,
"loss": 6.1058,
"mean_token_accuracy": 0.12670243680477142,
"num_tokens": 4290938.0,
"step": 2325
},
{
"entropy": 6.066101360321045,
"epoch": 0.1957571938668347,
"grad_norm": 1.015625,
"learning_rate": 0.0004998592184590471,
"loss": 6.1379,
"mean_token_accuracy": 0.12725966945290565,
"num_tokens": 4300022.0,
"step": 2330
},
{
"entropy": 6.080681276321411,
"epoch": 0.19617727368199958,
"grad_norm": 1.0859375,
"learning_rate": 0.0004998581572736024,
"loss": 6.019,
"mean_token_accuracy": 0.1344592235982418,
"num_tokens": 4308910.0,
"step": 2335
},
{
"entropy": 6.050167417526245,
"epoch": 0.19659735349716445,
"grad_norm": 0.9765625,
"learning_rate": 0.0004998570921049112,
"loss": 5.9814,
"mean_token_accuracy": 0.13220275193452835,
"num_tokens": 4317136.0,
"step": 2340
},
{
"entropy": 6.09632978439331,
"epoch": 0.19701743331232935,
"grad_norm": 1.078125,
"learning_rate": 0.0004998560229529924,
"loss": 6.0501,
"mean_token_accuracy": 0.1387757182121277,
"num_tokens": 4326163.0,
"step": 2345
},
{
"entropy": 6.229255342483521,
"epoch": 0.19743751312749422,
"grad_norm": 1.0234375,
"learning_rate": 0.0004998549498178649,
"loss": 6.1921,
"mean_token_accuracy": 0.12868764251470566,
"num_tokens": 4335837.0,
"step": 2350
},
{
"entropy": 6.126859140396118,
"epoch": 0.19785759294265912,
"grad_norm": 1.1171875,
"learning_rate": 0.0004998538726995477,
"loss": 6.1084,
"mean_token_accuracy": 0.13344382494688034,
"num_tokens": 4345108.0,
"step": 2355
},
{
"entropy": 6.168588161468506,
"epoch": 0.198277672757824,
"grad_norm": 0.9765625,
"learning_rate": 0.00049985279159806,
"loss": 6.119,
"mean_token_accuracy": 0.12684730514883996,
"num_tokens": 4353761.0,
"step": 2360
},
{
"entropy": 6.090028953552246,
"epoch": 0.19869775257298886,
"grad_norm": 1.0234375,
"learning_rate": 0.0004998517065134208,
"loss": 6.0824,
"mean_token_accuracy": 0.13213628232479097,
"num_tokens": 4363244.0,
"step": 2365
},
{
"entropy": 6.138245010375977,
"epoch": 0.19911783238815375,
"grad_norm": 0.9921875,
"learning_rate": 0.0004998506174456494,
"loss": 6.0839,
"mean_token_accuracy": 0.12802947238087653,
"num_tokens": 4373034.0,
"step": 2370
},
{
"entropy": 6.12951922416687,
"epoch": 0.19953791220331862,
"grad_norm": 0.9453125,
"learning_rate": 0.0004998495243947653,
"loss": 6.0216,
"mean_token_accuracy": 0.1251508317887783,
"num_tokens": 4382554.0,
"step": 2375
},
{
"entropy": 6.1475914478302,
"epoch": 0.19995799201848352,
"grad_norm": 1.140625,
"learning_rate": 0.0004998484273607875,
"loss": 6.0463,
"mean_token_accuracy": 0.136245708912611,
"num_tokens": 4391001.0,
"step": 2380
},
{
"entropy": 5.926258325576782,
"epoch": 0.2003780718336484,
"grad_norm": 0.9765625,
"learning_rate": 0.0004998473263437356,
"loss": 5.9565,
"mean_token_accuracy": 0.13519108295440674,
"num_tokens": 4400632.0,
"step": 2385
},
{
"entropy": 6.048220825195313,
"epoch": 0.20079815164881326,
"grad_norm": 1.0234375,
"learning_rate": 0.000499846221343629,
"loss": 6.051,
"mean_token_accuracy": 0.13025175258517266,
"num_tokens": 4409565.0,
"step": 2390
},
{
"entropy": 6.0700782299041744,
"epoch": 0.20121823146397816,
"grad_norm": 1.0390625,
"learning_rate": 0.0004998451123604875,
"loss": 5.9988,
"mean_token_accuracy": 0.14039506316184996,
"num_tokens": 4418384.0,
"step": 2395
},
{
"entropy": 6.146504878997803,
"epoch": 0.20163831127914303,
"grad_norm": 1.0625,
"learning_rate": 0.0004998439993943306,
"loss": 6.1232,
"mean_token_accuracy": 0.13494747802615165,
"num_tokens": 4427581.0,
"step": 2400
},
{
"entropy": 6.175554275512695,
"epoch": 0.20205839109430793,
"grad_norm": 1.0546875,
"learning_rate": 0.0004998428824451779,
"loss": 6.1094,
"mean_token_accuracy": 0.1269066423177719,
"num_tokens": 4436572.0,
"step": 2405
},
{
"entropy": 6.086094999313355,
"epoch": 0.2024784709094728,
"grad_norm": 1.0546875,
"learning_rate": 0.0004998417615130495,
"loss": 6.1156,
"mean_token_accuracy": 0.12977832332253456,
"num_tokens": 4445230.0,
"step": 2410
},
{
"entropy": 6.189484167098999,
"epoch": 0.2028985507246377,
"grad_norm": 1.0390625,
"learning_rate": 0.0004998406365979649,
"loss": 6.1725,
"mean_token_accuracy": 0.13100939616560936,
"num_tokens": 4454251.0,
"step": 2415
},
{
"entropy": 6.080749225616455,
"epoch": 0.20331863053980256,
"grad_norm": 0.953125,
"learning_rate": 0.0004998395076999443,
"loss": 6.0178,
"mean_token_accuracy": 0.13957264572381972,
"num_tokens": 4463949.0,
"step": 2420
},
{
"entropy": 6.179844999313355,
"epoch": 0.20373871035496743,
"grad_norm": 1.03125,
"learning_rate": 0.0004998383748190076,
"loss": 6.2136,
"mean_token_accuracy": 0.1258860044181347,
"num_tokens": 4473373.0,
"step": 2425
},
{
"entropy": 6.209265089035034,
"epoch": 0.20415879017013233,
"grad_norm": 1.1328125,
"learning_rate": 0.0004998372379551748,
"loss": 6.0447,
"mean_token_accuracy": 0.13152522593736649,
"num_tokens": 4482303.0,
"step": 2430
},
{
"entropy": 6.047933959960938,
"epoch": 0.2045788699852972,
"grad_norm": 1.03125,
"learning_rate": 0.0004998360971084663,
"loss": 6.0094,
"mean_token_accuracy": 0.1300078108906746,
"num_tokens": 4491214.0,
"step": 2435
},
{
"entropy": 5.923014402389526,
"epoch": 0.2049989498004621,
"grad_norm": 1.0390625,
"learning_rate": 0.0004998349522789019,
"loss": 5.9367,
"mean_token_accuracy": 0.13840087428689002,
"num_tokens": 4500099.0,
"step": 2440
},
{
"entropy": 6.065491151809693,
"epoch": 0.20541902961562697,
"grad_norm": 1.0,
"learning_rate": 0.0004998338034665021,
"loss": 6.0088,
"mean_token_accuracy": 0.14065721929073333,
"num_tokens": 4509893.0,
"step": 2445
},
{
"entropy": 6.0568382263183596,
"epoch": 0.20583910943079184,
"grad_norm": 1.0390625,
"learning_rate": 0.0004998326506712872,
"loss": 5.9933,
"mean_token_accuracy": 0.13122306242585183,
"num_tokens": 4518606.0,
"step": 2450
},
{
"entropy": 6.102985429763794,
"epoch": 0.20625918924595674,
"grad_norm": 1.0625,
"learning_rate": 0.0004998314938932778,
"loss": 6.0731,
"mean_token_accuracy": 0.13163421601057052,
"num_tokens": 4528392.0,
"step": 2455
},
{
"entropy": 6.143069076538086,
"epoch": 0.2066792690611216,
"grad_norm": 1.03125,
"learning_rate": 0.0004998303331324943,
"loss": 6.0446,
"mean_token_accuracy": 0.13527958691120148,
"num_tokens": 4536983.0,
"step": 2460
},
{
"entropy": 5.983616924285888,
"epoch": 0.2070993488762865,
"grad_norm": 1.0234375,
"learning_rate": 0.0004998291683889571,
"loss": 5.9545,
"mean_token_accuracy": 0.13531955927610398,
"num_tokens": 4544967.0,
"step": 2465
},
{
"entropy": 6.107338380813599,
"epoch": 0.20751942869145137,
"grad_norm": 1.0859375,
"learning_rate": 0.000499827999662687,
"loss": 6.0262,
"mean_token_accuracy": 0.12663825750350952,
"num_tokens": 4554646.0,
"step": 2470
},
{
"entropy": 6.1525249004364015,
"epoch": 0.20793950850661624,
"grad_norm": 0.984375,
"learning_rate": 0.0004998268269537046,
"loss": 6.0498,
"mean_token_accuracy": 0.13295727223157883,
"num_tokens": 4564040.0,
"step": 2475
},
{
"entropy": 5.977402019500732,
"epoch": 0.20835958832178114,
"grad_norm": 1.015625,
"learning_rate": 0.0004998256502620308,
"loss": 6.0631,
"mean_token_accuracy": 0.13584776520729064,
"num_tokens": 4573758.0,
"step": 2480
},
{
"entropy": 6.187655830383301,
"epoch": 0.208779668136946,
"grad_norm": 0.9765625,
"learning_rate": 0.0004998244695876864,
"loss": 6.0894,
"mean_token_accuracy": 0.12783714309334754,
"num_tokens": 4582097.0,
"step": 2485
},
{
"entropy": 6.000187587738037,
"epoch": 0.2091997479521109,
"grad_norm": 1.0859375,
"learning_rate": 0.0004998232849306921,
"loss": 6.0587,
"mean_token_accuracy": 0.1367252618074417,
"num_tokens": 4590687.0,
"step": 2490
},
{
"entropy": 6.167983675003052,
"epoch": 0.20961982776727578,
"grad_norm": 1.0625,
"learning_rate": 0.0004998220962910693,
"loss": 6.0418,
"mean_token_accuracy": 0.1291399121284485,
"num_tokens": 4599497.0,
"step": 2495
},
{
"entropy": 6.0570958137512205,
"epoch": 0.21003990758244068,
"grad_norm": 1.109375,
"learning_rate": 0.0004998209036688386,
"loss": 6.0052,
"mean_token_accuracy": 0.134087672829628,
"num_tokens": 4607958.0,
"step": 2500
},
{
"entropy": 6.154553699493408,
"epoch": 0.21045998739760555,
"grad_norm": 1.0390625,
"learning_rate": 0.0004998197070640216,
"loss": 6.1436,
"mean_token_accuracy": 0.1265629693865776,
"num_tokens": 4617515.0,
"step": 2505
},
{
"entropy": 6.159878873825074,
"epoch": 0.21088006721277042,
"grad_norm": 0.99609375,
"learning_rate": 0.0004998185064766391,
"loss": 6.029,
"mean_token_accuracy": 0.13321105763316154,
"num_tokens": 4627037.0,
"step": 2510
},
{
"entropy": 5.991772747039795,
"epoch": 0.21130014702793531,
"grad_norm": 0.96875,
"learning_rate": 0.0004998173019067127,
"loss": 6.0263,
"mean_token_accuracy": 0.13551492914557456,
"num_tokens": 4637393.0,
"step": 2515
},
{
"entropy": 6.076245498657227,
"epoch": 0.21172022684310018,
"grad_norm": 1.0078125,
"learning_rate": 0.0004998160933542633,
"loss": 6.0656,
"mean_token_accuracy": 0.1218369334936142,
"num_tokens": 4646832.0,
"step": 2520
},
{
"entropy": 6.151754760742188,
"epoch": 0.21214030665826508,
"grad_norm": 1.109375,
"learning_rate": 0.0004998148808193128,
"loss": 6.0983,
"mean_token_accuracy": 0.13457245901226997,
"num_tokens": 4655719.0,
"step": 2525
},
{
"entropy": 6.129997682571411,
"epoch": 0.21256038647342995,
"grad_norm": 1.03125,
"learning_rate": 0.0004998136643018823,
"loss": 6.0362,
"mean_token_accuracy": 0.13282962441444396,
"num_tokens": 4665364.0,
"step": 2530
},
{
"entropy": 6.060281705856323,
"epoch": 0.21298046628859482,
"grad_norm": 1.0390625,
"learning_rate": 0.0004998124438019935,
"loss": 6.016,
"mean_token_accuracy": 0.1327340230345726,
"num_tokens": 4674760.0,
"step": 2535
},
{
"entropy": 5.969087028503418,
"epoch": 0.21340054610375972,
"grad_norm": 0.99609375,
"learning_rate": 0.0004998112193196681,
"loss": 5.9355,
"mean_token_accuracy": 0.1348019614815712,
"num_tokens": 4683900.0,
"step": 2540
},
{
"entropy": 5.984380483627319,
"epoch": 0.2138206259189246,
"grad_norm": 1.0234375,
"learning_rate": 0.0004998099908549277,
"loss": 5.9913,
"mean_token_accuracy": 0.13222553506493567,
"num_tokens": 4693915.0,
"step": 2545
},
{
"entropy": 5.969556903839111,
"epoch": 0.2142407057340895,
"grad_norm": 1.0234375,
"learning_rate": 0.000499808758407794,
"loss": 5.8426,
"mean_token_accuracy": 0.14078716412186623,
"num_tokens": 4703102.0,
"step": 2550
},
{
"entropy": 6.0391675472259525,
"epoch": 0.21466078554925436,
"grad_norm": 1.015625,
"learning_rate": 0.0004998075219782889,
"loss": 6.076,
"mean_token_accuracy": 0.13493016585707665,
"num_tokens": 4712925.0,
"step": 2555
},
{
"entropy": 6.044742774963379,
"epoch": 0.21508086536441923,
"grad_norm": 1.09375,
"learning_rate": 0.0004998062815664344,
"loss": 6.0143,
"mean_token_accuracy": 0.1313328728079796,
"num_tokens": 4722641.0,
"step": 2560
},
{
"entropy": 6.006574583053589,
"epoch": 0.21550094517958412,
"grad_norm": 1.0390625,
"learning_rate": 0.0004998050371722524,
"loss": 6.0658,
"mean_token_accuracy": 0.13319918289780616,
"num_tokens": 4732603.0,
"step": 2565
},
{
"entropy": 5.998908233642578,
"epoch": 0.215921024994749,
"grad_norm": 0.97265625,
"learning_rate": 0.0004998037887957649,
"loss": 5.91,
"mean_token_accuracy": 0.13604277446866037,
"num_tokens": 4742644.0,
"step": 2570
},
{
"entropy": 6.173721647262573,
"epoch": 0.2163411048099139,
"grad_norm": 1.046875,
"learning_rate": 0.0004998025364369939,
"loss": 6.2348,
"mean_token_accuracy": 0.1272044688463211,
"num_tokens": 4751482.0,
"step": 2575
},
{
"entropy": 6.235174703598022,
"epoch": 0.21676118462507876,
"grad_norm": 1.09375,
"learning_rate": 0.0004998012800959619,
"loss": 6.0898,
"mean_token_accuracy": 0.13052162379026414,
"num_tokens": 4760593.0,
"step": 2580
},
{
"entropy": 6.111858797073364,
"epoch": 0.21718126444024366,
"grad_norm": 1.109375,
"learning_rate": 0.0004998000197726909,
"loss": 6.0859,
"mean_token_accuracy": 0.13651981949806213,
"num_tokens": 4769294.0,
"step": 2585
},
{
"entropy": 6.069575262069702,
"epoch": 0.21760134425540853,
"grad_norm": 0.921875,
"learning_rate": 0.0004997987554672033,
"loss": 5.9867,
"mean_token_accuracy": 0.13439425751566886,
"num_tokens": 4779239.0,
"step": 2590
},
{
"entropy": 6.047330856323242,
"epoch": 0.2180214240705734,
"grad_norm": 0.98828125,
"learning_rate": 0.0004997974871795215,
"loss": 6.0684,
"mean_token_accuracy": 0.1306284710764885,
"num_tokens": 4788211.0,
"step": 2595
},
{
"entropy": 6.074938344955444,
"epoch": 0.2184415038857383,
"grad_norm": 0.9375,
"learning_rate": 0.000499796214909668,
"loss": 6.0291,
"mean_token_accuracy": 0.13745831623673438,
"num_tokens": 4797921.0,
"step": 2600
},
{
"entropy": 6.087161684036255,
"epoch": 0.21886158370090317,
"grad_norm": 1.0234375,
"learning_rate": 0.0004997949386576653,
"loss": 6.0259,
"mean_token_accuracy": 0.13380660563707353,
"num_tokens": 4807772.0,
"step": 2605
},
{
"entropy": 6.045609188079834,
"epoch": 0.21928166351606806,
"grad_norm": 0.96484375,
"learning_rate": 0.000499793658423536,
"loss": 6.053,
"mean_token_accuracy": 0.12916495203971862,
"num_tokens": 4817999.0,
"step": 2610
},
{
"entropy": 6.102342319488526,
"epoch": 0.21970174333123293,
"grad_norm": 1.1171875,
"learning_rate": 0.0004997923742073028,
"loss": 6.0064,
"mean_token_accuracy": 0.14091803804039954,
"num_tokens": 4826679.0,
"step": 2615
},
{
"entropy": 5.991182327270508,
"epoch": 0.2201218231463978,
"grad_norm": 1.0703125,
"learning_rate": 0.0004997910860089884,
"loss": 6.0052,
"mean_token_accuracy": 0.13694314211606978,
"num_tokens": 4834998.0,
"step": 2620
},
{
"entropy": 6.100556564331055,
"epoch": 0.2205419029615627,
"grad_norm": 1.0390625,
"learning_rate": 0.0004997897938286156,
"loss": 5.9615,
"mean_token_accuracy": 0.13899449855089188,
"num_tokens": 4843635.0,
"step": 2625
},
{
"entropy": 6.097274303436279,
"epoch": 0.22096198277672757,
"grad_norm": 1.140625,
"learning_rate": 0.0004997884976662075,
"loss": 6.0782,
"mean_token_accuracy": 0.13321034833788872,
"num_tokens": 4852027.0,
"step": 2630
},
{
"entropy": 6.147892570495605,
"epoch": 0.22138206259189247,
"grad_norm": 1.046875,
"learning_rate": 0.0004997871975217868,
"loss": 5.997,
"mean_token_accuracy": 0.14202353730797768,
"num_tokens": 4861244.0,
"step": 2635
},
{
"entropy": 5.8932945251464846,
"epoch": 0.22180214240705734,
"grad_norm": 0.99609375,
"learning_rate": 0.0004997858933953768,
"loss": 5.9307,
"mean_token_accuracy": 0.13638841062784196,
"num_tokens": 4869902.0,
"step": 2640
},
{
"entropy": 5.95978422164917,
"epoch": 0.2222222222222222,
"grad_norm": 1.0078125,
"learning_rate": 0.0004997845852870004,
"loss": 5.8916,
"mean_token_accuracy": 0.1398716911673546,
"num_tokens": 4878502.0,
"step": 2645
},
{
"entropy": 6.00921368598938,
"epoch": 0.2226423020373871,
"grad_norm": 1.046875,
"learning_rate": 0.0004997832731966806,
"loss": 5.9483,
"mean_token_accuracy": 0.14096327498555183,
"num_tokens": 4888348.0,
"step": 2650
},
{
"entropy": 5.991452217102051,
"epoch": 0.22306238185255198,
"grad_norm": 1.0703125,
"learning_rate": 0.0004997819571244411,
"loss": 6.0189,
"mean_token_accuracy": 0.1372461050748825,
"num_tokens": 4897302.0,
"step": 2655
},
{
"entropy": 6.012991952896118,
"epoch": 0.22348246166771688,
"grad_norm": 1.03125,
"learning_rate": 0.0004997806370703049,
"loss": 6.0444,
"mean_token_accuracy": 0.13569730073213576,
"num_tokens": 4907078.0,
"step": 2660
},
{
"entropy": 5.988911724090576,
"epoch": 0.22390254148288175,
"grad_norm": 0.921875,
"learning_rate": 0.0004997793130342954,
"loss": 5.874,
"mean_token_accuracy": 0.13960873782634736,
"num_tokens": 4917489.0,
"step": 2665
},
{
"entropy": 5.930651092529297,
"epoch": 0.22432262129804661,
"grad_norm": 1.046875,
"learning_rate": 0.0004997779850164363,
"loss": 5.9779,
"mean_token_accuracy": 0.13561228066682815,
"num_tokens": 4927073.0,
"step": 2670
},
{
"entropy": 6.158057308197021,
"epoch": 0.2247427011132115,
"grad_norm": 1.0546875,
"learning_rate": 0.0004997766530167508,
"loss": 6.0815,
"mean_token_accuracy": 0.13055123686790465,
"num_tokens": 4935464.0,
"step": 2675
},
{
"entropy": 6.137771940231323,
"epoch": 0.22516278092837638,
"grad_norm": 1.1015625,
"learning_rate": 0.0004997753170352627,
"loss": 6.1621,
"mean_token_accuracy": 0.12912468686699868,
"num_tokens": 4944718.0,
"step": 2680
},
{
"entropy": 6.082327508926392,
"epoch": 0.22558286074354128,
"grad_norm": 1.125,
"learning_rate": 0.0004997739770719955,
"loss": 6.0351,
"mean_token_accuracy": 0.13314241990447045,
"num_tokens": 4954223.0,
"step": 2685
},
{
"entropy": 6.058599901199341,
"epoch": 0.22600294055870615,
"grad_norm": 0.96875,
"learning_rate": 0.000499772633126973,
"loss": 6.0734,
"mean_token_accuracy": 0.132587993144989,
"num_tokens": 4963371.0,
"step": 2690
},
{
"entropy": 6.008123540878296,
"epoch": 0.22642302037387105,
"grad_norm": 1.0859375,
"learning_rate": 0.0004997712852002192,
"loss": 5.9177,
"mean_token_accuracy": 0.14214024171233178,
"num_tokens": 4972973.0,
"step": 2695
},
{
"entropy": 6.0829901695251465,
"epoch": 0.22684310018903592,
"grad_norm": 1.09375,
"learning_rate": 0.0004997699332917578,
"loss": 6.1705,
"mean_token_accuracy": 0.12515681982040405,
"num_tokens": 4982808.0,
"step": 2700
},
{
"entropy": 6.190164613723755,
"epoch": 0.2272631800042008,
"grad_norm": 0.984375,
"learning_rate": 0.0004997685774016127,
"loss": 6.0492,
"mean_token_accuracy": 0.13310236856341362,
"num_tokens": 4992427.0,
"step": 2705
},
{
"entropy": 6.118629407882691,
"epoch": 0.22768325981936569,
"grad_norm": 0.9296875,
"learning_rate": 0.000499767217529808,
"loss": 6.2194,
"mean_token_accuracy": 0.1233941525220871,
"num_tokens": 5003562.0,
"step": 2710
},
{
"entropy": 6.01776008605957,
"epoch": 0.22810333963453056,
"grad_norm": 0.91015625,
"learning_rate": 0.0004997658536763678,
"loss": 5.9127,
"mean_token_accuracy": 0.13683665543794632,
"num_tokens": 5013429.0,
"step": 2715
},
{
"entropy": 6.101313972473145,
"epoch": 0.22852341944969545,
"grad_norm": 0.99609375,
"learning_rate": 0.0004997644858413163,
"loss": 6.0303,
"mean_token_accuracy": 0.13920028880238533,
"num_tokens": 5022045.0,
"step": 2720
},
{
"entropy": 5.957802677154541,
"epoch": 0.22894349926486032,
"grad_norm": 0.9609375,
"learning_rate": 0.0004997631140246775,
"loss": 5.8742,
"mean_token_accuracy": 0.1393354929983616,
"num_tokens": 5032260.0,
"step": 2725
},
{
"entropy": 6.008182525634766,
"epoch": 0.2293635790800252,
"grad_norm": 1.015625,
"learning_rate": 0.000499761738226476,
"loss": 5.9304,
"mean_token_accuracy": 0.13518433645367622,
"num_tokens": 5041688.0,
"step": 2730
},
{
"entropy": 6.007689189910889,
"epoch": 0.2297836588951901,
"grad_norm": 1.0703125,
"learning_rate": 0.000499760358446736,
"loss": 6.0285,
"mean_token_accuracy": 0.1302636370062828,
"num_tokens": 5051005.0,
"step": 2735
},
{
"entropy": 6.086931037902832,
"epoch": 0.23020373871035496,
"grad_norm": 1.015625,
"learning_rate": 0.000499758974685482,
"loss": 5.9683,
"mean_token_accuracy": 0.13642121478915215,
"num_tokens": 5060084.0,
"step": 2740
},
{
"entropy": 6.020289707183838,
"epoch": 0.23062381852551986,
"grad_norm": 1.0859375,
"learning_rate": 0.0004997575869427385,
"loss": 5.9633,
"mean_token_accuracy": 0.14196947664022447,
"num_tokens": 5069081.0,
"step": 2745
},
{
"entropy": 6.004650115966797,
"epoch": 0.23104389834068473,
"grad_norm": 1.015625,
"learning_rate": 0.00049975619521853,
"loss": 5.9567,
"mean_token_accuracy": 0.12970962673425673,
"num_tokens": 5078597.0,
"step": 2750
},
{
"entropy": 5.982837867736817,
"epoch": 0.2314639781558496,
"grad_norm": 1.015625,
"learning_rate": 0.0004997547995128814,
"loss": 6.0116,
"mean_token_accuracy": 0.13743849247694015,
"num_tokens": 5087607.0,
"step": 2755
},
{
"entropy": 6.052627325057983,
"epoch": 0.2318840579710145,
"grad_norm": 1.1015625,
"learning_rate": 0.0004997533998258171,
"loss": 6.0129,
"mean_token_accuracy": 0.13540438339114189,
"num_tokens": 5097412.0,
"step": 2760
},
{
"entropy": 6.117385768890381,
"epoch": 0.23230413778617937,
"grad_norm": 1.078125,
"learning_rate": 0.0004997519961573622,
"loss": 6.0833,
"mean_token_accuracy": 0.12936894744634628,
"num_tokens": 5105817.0,
"step": 2765
},
{
"entropy": 6.181453561782837,
"epoch": 0.23272421760134426,
"grad_norm": 1.1328125,
"learning_rate": 0.0004997505885075414,
"loss": 6.1236,
"mean_token_accuracy": 0.131087327003479,
"num_tokens": 5114958.0,
"step": 2770
},
{
"entropy": 6.078393983840942,
"epoch": 0.23314429741650913,
"grad_norm": 1.03125,
"learning_rate": 0.0004997491768763795,
"loss": 6.0399,
"mean_token_accuracy": 0.13298976346850394,
"num_tokens": 5123728.0,
"step": 2775
},
{
"entropy": 6.067098140716553,
"epoch": 0.23356437723167403,
"grad_norm": 1.0859375,
"learning_rate": 0.0004997477612639018,
"loss": 6.0705,
"mean_token_accuracy": 0.13075175881385803,
"num_tokens": 5134099.0,
"step": 2780
},
{
"entropy": 6.135926675796509,
"epoch": 0.2339844570468389,
"grad_norm": 1.078125,
"learning_rate": 0.0004997463416701332,
"loss": 6.0951,
"mean_token_accuracy": 0.12707924395799636,
"num_tokens": 5142934.0,
"step": 2785
},
{
"entropy": 6.010523986816406,
"epoch": 0.23440453686200377,
"grad_norm": 1.0703125,
"learning_rate": 0.0004997449180950989,
"loss": 5.9263,
"mean_token_accuracy": 0.15269666612148286,
"num_tokens": 5151835.0,
"step": 2790
},
{
"entropy": 5.981678676605225,
"epoch": 0.23482461667716867,
"grad_norm": 0.95703125,
"learning_rate": 0.0004997434905388241,
"loss": 5.9711,
"mean_token_accuracy": 0.1440896660089493,
"num_tokens": 5161136.0,
"step": 2795
},
{
"entropy": 6.001285219192505,
"epoch": 0.23524469649233354,
"grad_norm": 0.96875,
"learning_rate": 0.000499742059001334,
"loss": 5.9034,
"mean_token_accuracy": 0.13934119418263435,
"num_tokens": 5170741.0,
"step": 2800
},
{
"entropy": 6.018690299987793,
"epoch": 0.23566477630749844,
"grad_norm": 1.046875,
"learning_rate": 0.0004997406234826541,
"loss": 5.9552,
"mean_token_accuracy": 0.14054046496748923,
"num_tokens": 5180549.0,
"step": 2805
},
{
"entropy": 5.938224172592163,
"epoch": 0.2360848561226633,
"grad_norm": 0.97265625,
"learning_rate": 0.0004997391839828098,
"loss": 5.9206,
"mean_token_accuracy": 0.14346935153007506,
"num_tokens": 5189486.0,
"step": 2810
},
{
"entropy": 5.992739534378051,
"epoch": 0.23650493593782818,
"grad_norm": 1.046875,
"learning_rate": 0.0004997377405018266,
"loss": 5.989,
"mean_token_accuracy": 0.13284722566604615,
"num_tokens": 5198525.0,
"step": 2815
},
{
"entropy": 6.092206192016602,
"epoch": 0.23692501575299307,
"grad_norm": 1.0234375,
"learning_rate": 0.00049973629303973,
"loss": 6.0314,
"mean_token_accuracy": 0.13478757068514824,
"num_tokens": 5207124.0,
"step": 2820
},
{
"entropy": 5.945079660415649,
"epoch": 0.23734509556815794,
"grad_norm": 0.96875,
"learning_rate": 0.0004997348415965457,
"loss": 5.8623,
"mean_token_accuracy": 0.13759657815098764,
"num_tokens": 5216529.0,
"step": 2825
},
{
"entropy": 6.023407697677612,
"epoch": 0.23776517538332284,
"grad_norm": 1.09375,
"learning_rate": 0.0004997333861722995,
"loss": 6.0141,
"mean_token_accuracy": 0.13514548242092134,
"num_tokens": 5225796.0,
"step": 2830
},
{
"entropy": 6.10746054649353,
"epoch": 0.2381852551984877,
"grad_norm": 1.1015625,
"learning_rate": 0.000499731926767017,
"loss": 6.038,
"mean_token_accuracy": 0.13310380131006241,
"num_tokens": 5233876.0,
"step": 2835
},
{
"entropy": 5.989476442337036,
"epoch": 0.23860533501365258,
"grad_norm": 0.984375,
"learning_rate": 0.0004997304633807242,
"loss": 6.0249,
"mean_token_accuracy": 0.13009608685970306,
"num_tokens": 5244782.0,
"step": 2840
},
{
"entropy": 6.030221557617187,
"epoch": 0.23902541482881748,
"grad_norm": 1.078125,
"learning_rate": 0.0004997289960134468,
"loss": 5.9703,
"mean_token_accuracy": 0.1368887707591057,
"num_tokens": 5253453.0,
"step": 2845
},
{
"entropy": 5.974435091018677,
"epoch": 0.23944549464398235,
"grad_norm": 1.0859375,
"learning_rate": 0.0004997275246652111,
"loss": 5.9925,
"mean_token_accuracy": 0.13992664366960525,
"num_tokens": 5262355.0,
"step": 2850
},
{
"entropy": 5.960743093490601,
"epoch": 0.23986557445914725,
"grad_norm": 1.09375,
"learning_rate": 0.000499726049336043,
"loss": 5.9169,
"mean_token_accuracy": 0.13876279294490815,
"num_tokens": 5271959.0,
"step": 2855
},
{
"entropy": 6.026579475402832,
"epoch": 0.24028565427431212,
"grad_norm": 1.078125,
"learning_rate": 0.0004997245700259686,
"loss": 5.9378,
"mean_token_accuracy": 0.14501210153102875,
"num_tokens": 5281393.0,
"step": 2860
},
{
"entropy": 6.063703155517578,
"epoch": 0.240705734089477,
"grad_norm": 0.96484375,
"learning_rate": 0.0004997230867350141,
"loss": 6.0777,
"mean_token_accuracy": 0.13284969255328177,
"num_tokens": 5290979.0,
"step": 2865
},
{
"entropy": 6.101211261749268,
"epoch": 0.24112581390464188,
"grad_norm": 1.0078125,
"learning_rate": 0.0004997215994632059,
"loss": 6.0262,
"mean_token_accuracy": 0.13758746907114983,
"num_tokens": 5300263.0,
"step": 2870
},
{
"entropy": 6.094251537322998,
"epoch": 0.24154589371980675,
"grad_norm": 1.03125,
"learning_rate": 0.0004997201082105704,
"loss": 6.0534,
"mean_token_accuracy": 0.13286127597093583,
"num_tokens": 5309522.0,
"step": 2875
},
{
"entropy": 6.022702550888061,
"epoch": 0.24196597353497165,
"grad_norm": 0.9921875,
"learning_rate": 0.0004997186129771338,
"loss": 6.0403,
"mean_token_accuracy": 0.13587599471211434,
"num_tokens": 5319770.0,
"step": 2880
},
{
"entropy": 6.158471012115479,
"epoch": 0.24238605335013652,
"grad_norm": 1.109375,
"learning_rate": 0.0004997171137629226,
"loss": 6.0553,
"mean_token_accuracy": 0.13789832219481468,
"num_tokens": 5328400.0,
"step": 2885
},
{
"entropy": 5.906137275695801,
"epoch": 0.24280613316530142,
"grad_norm": 1.1328125,
"learning_rate": 0.0004997156105679636,
"loss": 5.8523,
"mean_token_accuracy": 0.1505324125289917,
"num_tokens": 5336338.0,
"step": 2890
},
{
"entropy": 5.937476301193238,
"epoch": 0.2432262129804663,
"grad_norm": 1.0703125,
"learning_rate": 0.0004997141033922832,
"loss": 5.9567,
"mean_token_accuracy": 0.13674451038241386,
"num_tokens": 5345391.0,
"step": 2895
},
{
"entropy": 6.064019298553466,
"epoch": 0.24364629279563116,
"grad_norm": 1.0859375,
"learning_rate": 0.0004997125922359081,
"loss": 5.9792,
"mean_token_accuracy": 0.12863812744617462,
"num_tokens": 5354709.0,
"step": 2900
},
{
"entropy": 6.019229459762573,
"epoch": 0.24406637261079606,
"grad_norm": 1.046875,
"learning_rate": 0.0004997110770988652,
"loss": 5.8953,
"mean_token_accuracy": 0.1387791097164154,
"num_tokens": 5363738.0,
"step": 2905
},
{
"entropy": 6.001317119598388,
"epoch": 0.24448645242596093,
"grad_norm": 1.1796875,
"learning_rate": 0.0004997095579811813,
"loss": 6.0193,
"mean_token_accuracy": 0.13423686176538469,
"num_tokens": 5373583.0,
"step": 2910
},
{
"entropy": 6.057038736343384,
"epoch": 0.24490653224112582,
"grad_norm": 1.0,
"learning_rate": 0.0004997080348828833,
"loss": 6.0731,
"mean_token_accuracy": 0.1342361442744732,
"num_tokens": 5383486.0,
"step": 2915
},
{
"entropy": 6.072895145416259,
"epoch": 0.2453266120562907,
"grad_norm": 1.1171875,
"learning_rate": 0.0004997065078039981,
"loss": 5.9871,
"mean_token_accuracy": 0.13158463463187217,
"num_tokens": 5391974.0,
"step": 2920
},
{
"entropy": 6.060671615600586,
"epoch": 0.24574669187145556,
"grad_norm": 1.078125,
"learning_rate": 0.0004997049767445529,
"loss": 6.0206,
"mean_token_accuracy": 0.13064797297120095,
"num_tokens": 5400882.0,
"step": 2925
},
{
"entropy": 6.072234678268432,
"epoch": 0.24616677168662046,
"grad_norm": 1.0390625,
"learning_rate": 0.0004997034417045746,
"loss": 5.9808,
"mean_token_accuracy": 0.13372990265488624,
"num_tokens": 5410538.0,
"step": 2930
},
{
"entropy": 5.924702501296997,
"epoch": 0.24658685150178533,
"grad_norm": 1.078125,
"learning_rate": 0.0004997019026840907,
"loss": 5.8715,
"mean_token_accuracy": 0.13977383449673653,
"num_tokens": 5419406.0,
"step": 2935
},
{
"entropy": 5.90452995300293,
"epoch": 0.24700693131695023,
"grad_norm": 1.0703125,
"learning_rate": 0.0004997003596831282,
"loss": 5.9778,
"mean_token_accuracy": 0.1360274873673916,
"num_tokens": 5428817.0,
"step": 2940
},
{
"entropy": 6.072986078262329,
"epoch": 0.2474270111321151,
"grad_norm": 1.046875,
"learning_rate": 0.0004996988127017145,
"loss": 6.0065,
"mean_token_accuracy": 0.135275649279356,
"num_tokens": 5438277.0,
"step": 2945
},
{
"entropy": 6.042351722717285,
"epoch": 0.24784709094728,
"grad_norm": 1.0859375,
"learning_rate": 0.0004996972617398772,
"loss": 6.0257,
"mean_token_accuracy": 0.1333914116024971,
"num_tokens": 5447440.0,
"step": 2950
},
{
"entropy": 6.086295413970947,
"epoch": 0.24826717076244487,
"grad_norm": 1.046875,
"learning_rate": 0.0004996957067976435,
"loss": 5.9563,
"mean_token_accuracy": 0.1371552027761936,
"num_tokens": 5455988.0,
"step": 2955
},
{
"entropy": 5.992445421218872,
"epoch": 0.24868725057760974,
"grad_norm": 1.0859375,
"learning_rate": 0.0004996941478750411,
"loss": 5.9653,
"mean_token_accuracy": 0.13751397803425788,
"num_tokens": 5464996.0,
"step": 2960
},
{
"entropy": 6.070774841308594,
"epoch": 0.24910733039277463,
"grad_norm": 0.953125,
"learning_rate": 0.0004996925849720975,
"loss": 6.0955,
"mean_token_accuracy": 0.1302956260740757,
"num_tokens": 5474174.0,
"step": 2965
},
{
"entropy": 6.157161235809326,
"epoch": 0.2495274102079395,
"grad_norm": 1.109375,
"learning_rate": 0.0004996910180888405,
"loss": 5.9967,
"mean_token_accuracy": 0.13579627349972725,
"num_tokens": 5482838.0,
"step": 2970
},
{
"entropy": 5.956932163238525,
"epoch": 0.2499474900231044,
"grad_norm": 1.03125,
"learning_rate": 0.0004996894472252977,
"loss": 6.0037,
"mean_token_accuracy": 0.13603818491101266,
"num_tokens": 5491616.0,
"step": 2975
},
{
"entropy": 6.029225301742554,
"epoch": 0.25036756983826924,
"grad_norm": 1.015625,
"learning_rate": 0.0004996878723814973,
"loss": 5.9869,
"mean_token_accuracy": 0.13199443742632866,
"num_tokens": 5500942.0,
"step": 2980
},
{
"entropy": 6.036972188949585,
"epoch": 0.25078764965343414,
"grad_norm": 1.0234375,
"learning_rate": 0.0004996862935574667,
"loss": 5.9337,
"mean_token_accuracy": 0.12902862578630447,
"num_tokens": 5510078.0,
"step": 2985
},
{
"entropy": 5.945563459396363,
"epoch": 0.25120772946859904,
"grad_norm": 0.99609375,
"learning_rate": 0.0004996847107532342,
"loss": 5.967,
"mean_token_accuracy": 0.13469186648726464,
"num_tokens": 5518924.0,
"step": 2990
},
{
"entropy": 6.076007747650147,
"epoch": 0.25162780928376394,
"grad_norm": 0.9765625,
"learning_rate": 0.0004996831239688277,
"loss": 5.9812,
"mean_token_accuracy": 0.13338077962398528,
"num_tokens": 5527385.0,
"step": 2995
},
{
"entropy": 5.917894792556763,
"epoch": 0.2520478890989288,
"grad_norm": 1.046875,
"learning_rate": 0.0004996815332042754,
"loss": 5.8328,
"mean_token_accuracy": 0.1404413752257824,
"num_tokens": 5536781.0,
"step": 3000
},
{
"epoch": 0.2520478890989288,
"eval_entropy": 5.782812329743307,
"eval_loss": 6.003105163574219,
"eval_mean_token_accuracy": 0.1416803571949664,
"eval_num_tokens": 5536781.0,
"eval_runtime": 27.3987,
"eval_samples_per_second": 1363.789,
"eval_steps_per_second": 170.483,
"step": 3000
}
],
"logging_steps": 5,
"max_steps": 119020,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 3000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8117673873408000.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}