Files
swe-latn-10mb-ppt-Dp-10mb_s…/checkpoint-4000/trainer_state.json
ModelHub XC fcfb32cd61 初始化项目,由ModelHub XC社区提供模型
Model: fpadovani/swe-latn-10mb-ppt-Dp-10mb_seed3407
Source: Original Platform
2026-06-30 19:20:25 +08:00

8035 lines
219 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.23244326931458292,
"eval_steps": 500,
"global_step": 4000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 10.742584228515625,
"epoch": 0.0002905540866432286,
"grad_norm": 4.90625,
"learning_rate": 2e-06,
"loss": 10.7837,
"mean_token_accuracy": 0.0,
"num_tokens": 10156.0,
"step": 5
},
{
"entropy": 10.742587471008301,
"epoch": 0.0005811081732864572,
"grad_norm": 4.8125,
"learning_rate": 4.5e-06,
"loss": 10.7753,
"mean_token_accuracy": 9.267840650863945e-05,
"num_tokens": 20933.0,
"step": 10
},
{
"entropy": 10.74257869720459,
"epoch": 0.0008716622599296859,
"grad_norm": 4.375,
"learning_rate": 7e-06,
"loss": 10.7508,
"mean_token_accuracy": 0.0,
"num_tokens": 31298.0,
"step": 15
},
{
"entropy": 10.742635726928711,
"epoch": 0.0011622163465729145,
"grad_norm": 4.875,
"learning_rate": 9.5e-06,
"loss": 10.697,
"mean_token_accuracy": 0.0,
"num_tokens": 40913.0,
"step": 20
},
{
"entropy": 10.742652702331544,
"epoch": 0.0014527704332161432,
"grad_norm": 4.28125,
"learning_rate": 1.2e-05,
"loss": 10.5798,
"mean_token_accuracy": 0.0007269373920280487,
"num_tokens": 49901.0,
"step": 25
},
{
"entropy": 10.742454719543456,
"epoch": 0.0017433245198593718,
"grad_norm": 4.0625,
"learning_rate": 1.4500000000000002e-05,
"loss": 10.4688,
"mean_token_accuracy": 0.01560134175233543,
"num_tokens": 59328.0,
"step": 30
},
{
"entropy": 10.741775226593017,
"epoch": 0.0020338786065026006,
"grad_norm": 3.25,
"learning_rate": 1.7000000000000003e-05,
"loss": 10.3287,
"mean_token_accuracy": 0.037073963694274424,
"num_tokens": 68405.0,
"step": 35
},
{
"entropy": 10.740037631988525,
"epoch": 0.002324432693145829,
"grad_norm": 2.578125,
"learning_rate": 1.95e-05,
"loss": 10.2203,
"mean_token_accuracy": 0.037133642472326756,
"num_tokens": 77591.0,
"step": 40
},
{
"entropy": 10.73731575012207,
"epoch": 0.0026149867797890577,
"grad_norm": 2.359375,
"learning_rate": 2.2e-05,
"loss": 10.1202,
"mean_token_accuracy": 0.03901108838617802,
"num_tokens": 88186.0,
"step": 45
},
{
"entropy": 10.734606838226318,
"epoch": 0.0029055408664322865,
"grad_norm": 2.09375,
"learning_rate": 2.4500000000000003e-05,
"loss": 10.0211,
"mean_token_accuracy": 0.04241710864007473,
"num_tokens": 97594.0,
"step": 50
},
{
"entropy": 10.73211612701416,
"epoch": 0.003196094953075515,
"grad_norm": 1.9921875,
"learning_rate": 2.7e-05,
"loss": 9.9871,
"mean_token_accuracy": 0.03826836366206408,
"num_tokens": 107386.0,
"step": 55
},
{
"entropy": 10.73102445602417,
"epoch": 0.0034866490397187436,
"grad_norm": 1.9140625,
"learning_rate": 2.95e-05,
"loss": 9.9132,
"mean_token_accuracy": 0.03943221494555473,
"num_tokens": 116742.0,
"step": 60
},
{
"entropy": 10.729645252227783,
"epoch": 0.0037772031263619723,
"grad_norm": 1.859375,
"learning_rate": 3.2e-05,
"loss": 9.8519,
"mean_token_accuracy": 0.03962419871240854,
"num_tokens": 126520.0,
"step": 65
},
{
"entropy": 10.727834033966065,
"epoch": 0.004067757213005201,
"grad_norm": 1.7734375,
"learning_rate": 3.4500000000000005e-05,
"loss": 9.7907,
"mean_token_accuracy": 0.03989919070154428,
"num_tokens": 136382.0,
"step": 70
},
{
"entropy": 10.724947452545166,
"epoch": 0.0043583112996484295,
"grad_norm": 1.7421875,
"learning_rate": 3.7e-05,
"loss": 9.7212,
"mean_token_accuracy": 0.03671109899878502,
"num_tokens": 146435.0,
"step": 75
},
{
"entropy": 10.72182493209839,
"epoch": 0.004648865386291658,
"grad_norm": 1.8359375,
"learning_rate": 3.95e-05,
"loss": 9.6591,
"mean_token_accuracy": 0.037667426839470865,
"num_tokens": 156174.0,
"step": 80
},
{
"entropy": 10.71723222732544,
"epoch": 0.004939419472934887,
"grad_norm": 1.765625,
"learning_rate": 4.2000000000000004e-05,
"loss": 9.5783,
"mean_token_accuracy": 0.04142397493124008,
"num_tokens": 165118.0,
"step": 85
},
{
"entropy": 10.708585739135742,
"epoch": 0.005229973559578115,
"grad_norm": 1.875,
"learning_rate": 4.45e-05,
"loss": 9.5252,
"mean_token_accuracy": 0.04036426953971386,
"num_tokens": 174401.0,
"step": 90
},
{
"entropy": 10.697160243988037,
"epoch": 0.005520527646221344,
"grad_norm": 1.765625,
"learning_rate": 4.7000000000000004e-05,
"loss": 9.443,
"mean_token_accuracy": 0.04118307866156101,
"num_tokens": 183533.0,
"step": 95
},
{
"entropy": 10.683875274658202,
"epoch": 0.005811081732864573,
"grad_norm": 1.7734375,
"learning_rate": 4.9500000000000004e-05,
"loss": 9.3596,
"mean_token_accuracy": 0.045602331310510634,
"num_tokens": 193296.0,
"step": 100
},
{
"entropy": 10.665638542175293,
"epoch": 0.006101635819507801,
"grad_norm": 1.75,
"learning_rate": 5.2e-05,
"loss": 9.2242,
"mean_token_accuracy": 0.055989645794034,
"num_tokens": 202741.0,
"step": 105
},
{
"entropy": 10.637047958374023,
"epoch": 0.00639218990615103,
"grad_norm": 1.703125,
"learning_rate": 5.45e-05,
"loss": 9.1359,
"mean_token_accuracy": 0.05134495124220848,
"num_tokens": 212441.0,
"step": 110
},
{
"entropy": 10.61000461578369,
"epoch": 0.006682743992794259,
"grad_norm": 1.6875,
"learning_rate": 5.7e-05,
"loss": 8.9868,
"mean_token_accuracy": 0.04918566383421421,
"num_tokens": 220671.0,
"step": 115
},
{
"entropy": 10.56931962966919,
"epoch": 0.006973298079437487,
"grad_norm": 1.7578125,
"learning_rate": 5.9499999999999996e-05,
"loss": 8.9878,
"mean_token_accuracy": 0.04560479037463665,
"num_tokens": 231390.0,
"step": 120
},
{
"entropy": 10.515452098846435,
"epoch": 0.007263852166080716,
"grad_norm": 1.5859375,
"learning_rate": 6.2e-05,
"loss": 8.8241,
"mean_token_accuracy": 0.05023673102259636,
"num_tokens": 241137.0,
"step": 125
},
{
"entropy": 10.430156230926514,
"epoch": 0.007554406252723945,
"grad_norm": 1.578125,
"learning_rate": 6.450000000000001e-05,
"loss": 8.6778,
"mean_token_accuracy": 0.05138532817363739,
"num_tokens": 250627.0,
"step": 130
},
{
"entropy": 10.353140926361084,
"epoch": 0.007844960339367173,
"grad_norm": 1.5859375,
"learning_rate": 6.7e-05,
"loss": 8.5255,
"mean_token_accuracy": 0.05529710613191128,
"num_tokens": 259564.0,
"step": 135
},
{
"entropy": 10.262280082702636,
"epoch": 0.008135514426010402,
"grad_norm": 1.453125,
"learning_rate": 6.950000000000001e-05,
"loss": 8.4168,
"mean_token_accuracy": 0.05102897398173809,
"num_tokens": 268997.0,
"step": 140
},
{
"entropy": 10.17268762588501,
"epoch": 0.00842606851265363,
"grad_norm": 1.40625,
"learning_rate": 7.2e-05,
"loss": 8.402,
"mean_token_accuracy": 0.04707291163504124,
"num_tokens": 278989.0,
"step": 145
},
{
"entropy": 10.068906784057617,
"epoch": 0.008716622599296859,
"grad_norm": 1.3828125,
"learning_rate": 7.45e-05,
"loss": 8.2195,
"mean_token_accuracy": 0.04823922924697399,
"num_tokens": 288770.0,
"step": 150
},
{
"entropy": 9.884156227111816,
"epoch": 0.009007176685940088,
"grad_norm": 1.234375,
"learning_rate": 7.7e-05,
"loss": 8.1604,
"mean_token_accuracy": 0.05296766012907028,
"num_tokens": 298368.0,
"step": 155
},
{
"entropy": 9.749515438079834,
"epoch": 0.009297730772583316,
"grad_norm": 1.125,
"learning_rate": 7.950000000000001e-05,
"loss": 7.9887,
"mean_token_accuracy": 0.054083061218261716,
"num_tokens": 307437.0,
"step": 160
},
{
"entropy": 9.539670753479005,
"epoch": 0.009588284859226545,
"grad_norm": 1.3828125,
"learning_rate": 8.2e-05,
"loss": 7.931,
"mean_token_accuracy": 0.05368399284780025,
"num_tokens": 317842.0,
"step": 165
},
{
"entropy": 9.367785167694091,
"epoch": 0.009878838945869774,
"grad_norm": 0.984375,
"learning_rate": 8.450000000000001e-05,
"loss": 7.7746,
"mean_token_accuracy": 0.056211471930146216,
"num_tokens": 327455.0,
"step": 170
},
{
"entropy": 9.106531143188477,
"epoch": 0.010169393032513002,
"grad_norm": 0.96484375,
"learning_rate": 8.7e-05,
"loss": 7.7023,
"mean_token_accuracy": 0.059121083468198776,
"num_tokens": 338593.0,
"step": 175
},
{
"entropy": 8.891216564178468,
"epoch": 0.01045994711915623,
"grad_norm": 1.0,
"learning_rate": 8.95e-05,
"loss": 7.6717,
"mean_token_accuracy": 0.060001150518655774,
"num_tokens": 348278.0,
"step": 180
},
{
"entropy": 8.690237522125244,
"epoch": 0.01075050120579946,
"grad_norm": 0.91796875,
"learning_rate": 9.2e-05,
"loss": 7.5848,
"mean_token_accuracy": 0.060652027279138564,
"num_tokens": 358293.0,
"step": 185
},
{
"entropy": 8.500970458984375,
"epoch": 0.011041055292442687,
"grad_norm": 0.70703125,
"learning_rate": 9.45e-05,
"loss": 7.6462,
"mean_token_accuracy": 0.06345079019665718,
"num_tokens": 368177.0,
"step": 190
},
{
"entropy": 8.432841682434082,
"epoch": 0.011331609379085917,
"grad_norm": 0.87890625,
"learning_rate": 9.7e-05,
"loss": 7.5041,
"mean_token_accuracy": 0.06438801400363445,
"num_tokens": 377258.0,
"step": 195
},
{
"entropy": 8.328762531280518,
"epoch": 0.011622163465729146,
"grad_norm": 0.79296875,
"learning_rate": 9.95e-05,
"loss": 7.515,
"mean_token_accuracy": 0.06462946832180023,
"num_tokens": 385931.0,
"step": 200
},
{
"entropy": 8.228355598449706,
"epoch": 0.011912717552372373,
"grad_norm": 0.984375,
"learning_rate": 0.000102,
"loss": 7.4262,
"mean_token_accuracy": 0.06731356121599674,
"num_tokens": 394370.0,
"step": 205
},
{
"entropy": 8.163572025299072,
"epoch": 0.012203271639015602,
"grad_norm": 0.765625,
"learning_rate": 0.00010449999999999999,
"loss": 7.5127,
"mean_token_accuracy": 0.06187250129878521,
"num_tokens": 405167.0,
"step": 210
},
{
"entropy": 8.144425964355468,
"epoch": 0.012493825725658832,
"grad_norm": 0.90234375,
"learning_rate": 0.000107,
"loss": 7.4823,
"mean_token_accuracy": 0.06424942426383495,
"num_tokens": 414954.0,
"step": 215
},
{
"entropy": 8.074434852600097,
"epoch": 0.01278437981230206,
"grad_norm": 1.0078125,
"learning_rate": 0.0001095,
"loss": 7.4379,
"mean_token_accuracy": 0.07021872885525227,
"num_tokens": 423806.0,
"step": 220
},
{
"entropy": 8.100719451904297,
"epoch": 0.013074933898945288,
"grad_norm": 1.1015625,
"learning_rate": 0.000112,
"loss": 7.4049,
"mean_token_accuracy": 0.07006631046533585,
"num_tokens": 433416.0,
"step": 225
},
{
"entropy": 8.068440341949463,
"epoch": 0.013365487985588518,
"grad_norm": 1.015625,
"learning_rate": 0.0001145,
"loss": 7.4086,
"mean_token_accuracy": 0.0656484205275774,
"num_tokens": 443237.0,
"step": 230
},
{
"entropy": 8.008077144622803,
"epoch": 0.013656042072231747,
"grad_norm": 1.0078125,
"learning_rate": 0.00011700000000000001,
"loss": 7.3811,
"mean_token_accuracy": 0.07138268202543259,
"num_tokens": 452334.0,
"step": 235
},
{
"entropy": 7.95733003616333,
"epoch": 0.013946596158874974,
"grad_norm": 1.0390625,
"learning_rate": 0.00011949999999999999,
"loss": 7.451,
"mean_token_accuracy": 0.07069577798247337,
"num_tokens": 462604.0,
"step": 240
},
{
"entropy": 7.985943031311035,
"epoch": 0.014237150245518203,
"grad_norm": 1.109375,
"learning_rate": 0.000122,
"loss": 7.3342,
"mean_token_accuracy": 0.07418472990393639,
"num_tokens": 472105.0,
"step": 245
},
{
"entropy": 7.985353708267212,
"epoch": 0.014527704332161433,
"grad_norm": 0.89453125,
"learning_rate": 0.0001245,
"loss": 7.3573,
"mean_token_accuracy": 0.07192124761641025,
"num_tokens": 481873.0,
"step": 250
},
{
"entropy": 7.852858924865723,
"epoch": 0.01481825841880466,
"grad_norm": 0.9140625,
"learning_rate": 0.000127,
"loss": 7.3134,
"mean_token_accuracy": 0.07094009146094322,
"num_tokens": 490776.0,
"step": 255
},
{
"entropy": 7.97090711593628,
"epoch": 0.01510881250544789,
"grad_norm": 0.97265625,
"learning_rate": 0.0001295,
"loss": 7.3459,
"mean_token_accuracy": 0.06945950090885163,
"num_tokens": 500237.0,
"step": 260
},
{
"entropy": 7.988322401046753,
"epoch": 0.015399366592091119,
"grad_norm": 1.0078125,
"learning_rate": 0.000132,
"loss": 7.3569,
"mean_token_accuracy": 0.0719369538128376,
"num_tokens": 509449.0,
"step": 265
},
{
"entropy": 7.863973140716553,
"epoch": 0.015689920678734346,
"grad_norm": 0.9296875,
"learning_rate": 0.00013450000000000002,
"loss": 7.3463,
"mean_token_accuracy": 0.07629362866282463,
"num_tokens": 519335.0,
"step": 270
},
{
"entropy": 7.850080347061157,
"epoch": 0.015980474765377575,
"grad_norm": 0.828125,
"learning_rate": 0.00013700000000000002,
"loss": 7.269,
"mean_token_accuracy": 0.07348301075398922,
"num_tokens": 529108.0,
"step": 275
},
{
"entropy": 7.803001642227173,
"epoch": 0.016271028852020804,
"grad_norm": 0.9609375,
"learning_rate": 0.0001395,
"loss": 7.3421,
"mean_token_accuracy": 0.07442944496870041,
"num_tokens": 539409.0,
"step": 280
},
{
"entropy": 7.8401947021484375,
"epoch": 0.016561582938664034,
"grad_norm": 0.98828125,
"learning_rate": 0.00014199999999999998,
"loss": 7.269,
"mean_token_accuracy": 0.07476447969675064,
"num_tokens": 549790.0,
"step": 285
},
{
"entropy": 7.773062610626221,
"epoch": 0.01685213702530726,
"grad_norm": 0.90234375,
"learning_rate": 0.0001445,
"loss": 7.2597,
"mean_token_accuracy": 0.07743276208639145,
"num_tokens": 559343.0,
"step": 290
},
{
"entropy": 7.833370351791382,
"epoch": 0.01714269111195049,
"grad_norm": 1.1171875,
"learning_rate": 0.000147,
"loss": 7.306,
"mean_token_accuracy": 0.07507650516927242,
"num_tokens": 568806.0,
"step": 295
},
{
"entropy": 7.692620134353637,
"epoch": 0.017433245198593718,
"grad_norm": 1.0703125,
"learning_rate": 0.0001495,
"loss": 7.1532,
"mean_token_accuracy": 0.07671754881739616,
"num_tokens": 578988.0,
"step": 300
},
{
"entropy": 7.840510559082031,
"epoch": 0.017723799285236947,
"grad_norm": 1.015625,
"learning_rate": 0.000152,
"loss": 7.258,
"mean_token_accuracy": 0.0767325557768345,
"num_tokens": 588588.0,
"step": 305
},
{
"entropy": 7.740892934799194,
"epoch": 0.018014353371880176,
"grad_norm": 0.9453125,
"learning_rate": 0.00015450000000000001,
"loss": 7.2385,
"mean_token_accuracy": 0.07767370343208313,
"num_tokens": 597957.0,
"step": 310
},
{
"entropy": 7.761815309524536,
"epoch": 0.018304907458523405,
"grad_norm": 0.8671875,
"learning_rate": 0.000157,
"loss": 7.2168,
"mean_token_accuracy": 0.07732245922088624,
"num_tokens": 607446.0,
"step": 315
},
{
"entropy": 7.723113679885865,
"epoch": 0.01859546154516663,
"grad_norm": 0.8359375,
"learning_rate": 0.0001595,
"loss": 7.1559,
"mean_token_accuracy": 0.07753840312361718,
"num_tokens": 617064.0,
"step": 320
},
{
"entropy": 7.695508337020874,
"epoch": 0.01888601563180986,
"grad_norm": 1.03125,
"learning_rate": 0.000162,
"loss": 7.2008,
"mean_token_accuracy": 0.08057244047522545,
"num_tokens": 625927.0,
"step": 325
},
{
"entropy": 7.717827177047729,
"epoch": 0.01917656971845309,
"grad_norm": 0.97265625,
"learning_rate": 0.00016450000000000001,
"loss": 7.1152,
"mean_token_accuracy": 0.07994545996189117,
"num_tokens": 635341.0,
"step": 330
},
{
"entropy": 7.675025224685669,
"epoch": 0.01946712380509632,
"grad_norm": 2.171875,
"learning_rate": 0.00016700000000000002,
"loss": 7.1106,
"mean_token_accuracy": 0.08988085016608238,
"num_tokens": 645095.0,
"step": 335
},
{
"entropy": 7.714554166793823,
"epoch": 0.019757677891739548,
"grad_norm": 1.0234375,
"learning_rate": 0.00016950000000000003,
"loss": 7.1558,
"mean_token_accuracy": 0.0730321068316698,
"num_tokens": 654754.0,
"step": 340
},
{
"entropy": 7.60111026763916,
"epoch": 0.020048231978382777,
"grad_norm": 0.8671875,
"learning_rate": 0.00017199999999999998,
"loss": 7.1266,
"mean_token_accuracy": 0.07690966166555882,
"num_tokens": 664589.0,
"step": 345
},
{
"entropy": 7.6628223896026615,
"epoch": 0.020338786065026003,
"grad_norm": 1.0703125,
"learning_rate": 0.00017449999999999999,
"loss": 7.1425,
"mean_token_accuracy": 0.07918459475040436,
"num_tokens": 673870.0,
"step": 350
},
{
"entropy": 7.577814197540283,
"epoch": 0.020629340151669232,
"grad_norm": 1.03125,
"learning_rate": 0.000177,
"loss": 7.1137,
"mean_token_accuracy": 0.07997918874025345,
"num_tokens": 684309.0,
"step": 355
},
{
"entropy": 7.6769345760345455,
"epoch": 0.02091989423831246,
"grad_norm": 1.359375,
"learning_rate": 0.0001795,
"loss": 7.1629,
"mean_token_accuracy": 0.07469077445566655,
"num_tokens": 693702.0,
"step": 360
},
{
"entropy": 7.534895896911621,
"epoch": 0.02121044832495569,
"grad_norm": 0.94140625,
"learning_rate": 0.000182,
"loss": 7.0599,
"mean_token_accuracy": 0.07970957532525062,
"num_tokens": 702951.0,
"step": 365
},
{
"entropy": 7.588031339645386,
"epoch": 0.02150100241159892,
"grad_norm": 1.25,
"learning_rate": 0.0001845,
"loss": 7.0677,
"mean_token_accuracy": 0.08218754455447197,
"num_tokens": 712481.0,
"step": 370
},
{
"entropy": 7.600922870635986,
"epoch": 0.02179155649824215,
"grad_norm": 1.046875,
"learning_rate": 0.000187,
"loss": 7.0683,
"mean_token_accuracy": 0.08380770459771156,
"num_tokens": 721579.0,
"step": 375
},
{
"entropy": 7.572713327407837,
"epoch": 0.022082110584885375,
"grad_norm": 1.03125,
"learning_rate": 0.0001895,
"loss": 7.0774,
"mean_token_accuracy": 0.07982454895973205,
"num_tokens": 731404.0,
"step": 380
},
{
"entropy": 7.548839807510376,
"epoch": 0.022372664671528604,
"grad_norm": 0.93359375,
"learning_rate": 0.000192,
"loss": 7.0556,
"mean_token_accuracy": 0.07496214136481286,
"num_tokens": 740751.0,
"step": 385
},
{
"entropy": 7.523876476287842,
"epoch": 0.022663218758171833,
"grad_norm": 0.9765625,
"learning_rate": 0.0001945,
"loss": 7.0247,
"mean_token_accuracy": 0.08082472011446953,
"num_tokens": 751171.0,
"step": 390
},
{
"entropy": 7.552808237075806,
"epoch": 0.022953772844815062,
"grad_norm": 1.078125,
"learning_rate": 0.00019700000000000002,
"loss": 7.0823,
"mean_token_accuracy": 0.07615064568817616,
"num_tokens": 760874.0,
"step": 395
},
{
"entropy": 7.583486127853393,
"epoch": 0.02324432693145829,
"grad_norm": 1.2734375,
"learning_rate": 0.00019950000000000002,
"loss": 7.0585,
"mean_token_accuracy": 0.08326990716159344,
"num_tokens": 769652.0,
"step": 400
},
{
"entropy": 7.488273334503174,
"epoch": 0.02353488101810152,
"grad_norm": 0.98828125,
"learning_rate": 0.000202,
"loss": 7.0421,
"mean_token_accuracy": 0.07620194889605045,
"num_tokens": 779591.0,
"step": 405
},
{
"entropy": 7.564187002182007,
"epoch": 0.023825435104744747,
"grad_norm": 0.94921875,
"learning_rate": 0.00020449999999999998,
"loss": 7.156,
"mean_token_accuracy": 0.08098742663860321,
"num_tokens": 789582.0,
"step": 410
},
{
"entropy": 7.506245565414429,
"epoch": 0.024115989191387976,
"grad_norm": 1.1015625,
"learning_rate": 0.000207,
"loss": 7.0546,
"mean_token_accuracy": 0.07765479311347008,
"num_tokens": 799146.0,
"step": 415
},
{
"entropy": 7.4926127910614015,
"epoch": 0.024406543278031205,
"grad_norm": 1.0390625,
"learning_rate": 0.0002095,
"loss": 7.0246,
"mean_token_accuracy": 0.0782523088157177,
"num_tokens": 808934.0,
"step": 420
},
{
"entropy": 7.532363748550415,
"epoch": 0.024697097364674434,
"grad_norm": 0.87109375,
"learning_rate": 0.000212,
"loss": 7.0821,
"mean_token_accuracy": 0.07597277015447616,
"num_tokens": 819280.0,
"step": 425
},
{
"entropy": 7.457432746887207,
"epoch": 0.024987651451317663,
"grad_norm": 0.97265625,
"learning_rate": 0.0002145,
"loss": 6.9892,
"mean_token_accuracy": 0.0840725652873516,
"num_tokens": 828818.0,
"step": 430
},
{
"entropy": 7.463752698898316,
"epoch": 0.025278205537960893,
"grad_norm": 1.0703125,
"learning_rate": 0.00021700000000000002,
"loss": 6.9816,
"mean_token_accuracy": 0.08661384396255016,
"num_tokens": 839175.0,
"step": 435
},
{
"entropy": 7.5449175357818605,
"epoch": 0.02556875962460412,
"grad_norm": 1.125,
"learning_rate": 0.0002195,
"loss": 7.0777,
"mean_token_accuracy": 0.07947314418852329,
"num_tokens": 849965.0,
"step": 440
},
{
"entropy": 7.392349624633789,
"epoch": 0.025859313711247348,
"grad_norm": 1.0859375,
"learning_rate": 0.000222,
"loss": 6.9968,
"mean_token_accuracy": 0.08229465186595916,
"num_tokens": 859229.0,
"step": 445
},
{
"entropy": 7.4397971630096436,
"epoch": 0.026149867797890577,
"grad_norm": 1.28125,
"learning_rate": 0.0002245,
"loss": 6.9708,
"mean_token_accuracy": 0.0816520519554615,
"num_tokens": 869199.0,
"step": 450
},
{
"entropy": 7.399962043762207,
"epoch": 0.026440421884533806,
"grad_norm": 0.93359375,
"learning_rate": 0.00022700000000000002,
"loss": 6.9666,
"mean_token_accuracy": 0.09285714998841285,
"num_tokens": 879470.0,
"step": 455
},
{
"entropy": 7.4366514682769775,
"epoch": 0.026730975971177035,
"grad_norm": 0.921875,
"learning_rate": 0.00022950000000000002,
"loss": 6.9274,
"mean_token_accuracy": 0.0792453158646822,
"num_tokens": 888397.0,
"step": 460
},
{
"entropy": 7.370485734939575,
"epoch": 0.027021530057820264,
"grad_norm": 0.953125,
"learning_rate": 0.00023200000000000003,
"loss": 6.8202,
"mean_token_accuracy": 0.0855403620749712,
"num_tokens": 898321.0,
"step": 465
},
{
"entropy": 7.4845947265625,
"epoch": 0.027312084144463494,
"grad_norm": 1.0546875,
"learning_rate": 0.00023449999999999998,
"loss": 7.0856,
"mean_token_accuracy": 0.0808610200881958,
"num_tokens": 907947.0,
"step": 470
},
{
"entropy": 7.327203702926636,
"epoch": 0.02760263823110672,
"grad_norm": 1.1015625,
"learning_rate": 0.000237,
"loss": 6.9103,
"mean_token_accuracy": 0.0954340323805809,
"num_tokens": 916842.0,
"step": 475
},
{
"entropy": 7.380954456329346,
"epoch": 0.02789319231774995,
"grad_norm": 1.046875,
"learning_rate": 0.0002395,
"loss": 7.0098,
"mean_token_accuracy": 0.08165798112750053,
"num_tokens": 926431.0,
"step": 480
},
{
"entropy": 7.412681722640992,
"epoch": 0.028183746404393178,
"grad_norm": 0.98828125,
"learning_rate": 0.000242,
"loss": 6.9162,
"mean_token_accuracy": 0.08133741281926632,
"num_tokens": 935819.0,
"step": 485
},
{
"entropy": 7.44426212310791,
"epoch": 0.028474300491036407,
"grad_norm": 1.15625,
"learning_rate": 0.0002445,
"loss": 6.9418,
"mean_token_accuracy": 0.08402741849422454,
"num_tokens": 944198.0,
"step": 490
},
{
"entropy": 7.264917373657227,
"epoch": 0.028764854577679636,
"grad_norm": 0.88671875,
"learning_rate": 0.000247,
"loss": 6.9628,
"mean_token_accuracy": 0.08352083042263984,
"num_tokens": 954972.0,
"step": 495
},
{
"entropy": 7.385922384262085,
"epoch": 0.029055408664322865,
"grad_norm": 1.0703125,
"learning_rate": 0.0002495,
"loss": 6.9018,
"mean_token_accuracy": 0.08520250022411346,
"num_tokens": 964532.0,
"step": 500
},
{
"entropy": 7.475071048736572,
"epoch": 0.02934596275096609,
"grad_norm": 1.1171875,
"learning_rate": 0.000252,
"loss": 6.9955,
"mean_token_accuracy": 0.07958225682377815,
"num_tokens": 974547.0,
"step": 505
},
{
"entropy": 7.299204540252686,
"epoch": 0.02963651683760932,
"grad_norm": 0.98046875,
"learning_rate": 0.0002545,
"loss": 6.935,
"mean_token_accuracy": 0.08022963926196099,
"num_tokens": 984245.0,
"step": 510
},
{
"entropy": 7.318370199203491,
"epoch": 0.02992707092425255,
"grad_norm": 0.94140625,
"learning_rate": 0.000257,
"loss": 6.7766,
"mean_token_accuracy": 0.08500204458832741,
"num_tokens": 994400.0,
"step": 515
},
{
"entropy": 7.352757215499878,
"epoch": 0.03021762501089578,
"grad_norm": 1.2109375,
"learning_rate": 0.0002595,
"loss": 7.0024,
"mean_token_accuracy": 0.07765024341642857,
"num_tokens": 1005775.0,
"step": 520
},
{
"entropy": 7.312537145614624,
"epoch": 0.030508179097539008,
"grad_norm": 1.0859375,
"learning_rate": 0.000262,
"loss": 6.9055,
"mean_token_accuracy": 0.08693855553865433,
"num_tokens": 1015386.0,
"step": 525
},
{
"entropy": 7.383286190032959,
"epoch": 0.030798733184182237,
"grad_norm": 1.0859375,
"learning_rate": 0.00026450000000000003,
"loss": 6.8994,
"mean_token_accuracy": 0.09188547134399414,
"num_tokens": 1024963.0,
"step": 530
},
{
"entropy": 7.249363946914673,
"epoch": 0.031089287270825463,
"grad_norm": 0.9921875,
"learning_rate": 0.00026700000000000004,
"loss": 6.8996,
"mean_token_accuracy": 0.08531768508255481,
"num_tokens": 1034667.0,
"step": 535
},
{
"entropy": 7.265355777740479,
"epoch": 0.03137984135746869,
"grad_norm": 0.9921875,
"learning_rate": 0.00026950000000000005,
"loss": 6.8796,
"mean_token_accuracy": 0.08795020580291749,
"num_tokens": 1044171.0,
"step": 540
},
{
"entropy": 7.295146417617798,
"epoch": 0.031670395444111925,
"grad_norm": 1.1171875,
"learning_rate": 0.00027200000000000005,
"loss": 6.8538,
"mean_token_accuracy": 0.08691519349813462,
"num_tokens": 1053585.0,
"step": 545
},
{
"entropy": 7.237406063079834,
"epoch": 0.03196094953075515,
"grad_norm": 1.15625,
"learning_rate": 0.0002745,
"loss": 6.7515,
"mean_token_accuracy": 0.09050033241510391,
"num_tokens": 1063310.0,
"step": 550
},
{
"entropy": 7.263738679885864,
"epoch": 0.032251503617398376,
"grad_norm": 0.953125,
"learning_rate": 0.000277,
"loss": 6.8651,
"mean_token_accuracy": 0.08824861124157905,
"num_tokens": 1073529.0,
"step": 555
},
{
"entropy": 7.175330972671508,
"epoch": 0.03254205770404161,
"grad_norm": 1.109375,
"learning_rate": 0.0002795,
"loss": 6.8319,
"mean_token_accuracy": 0.08951647505164147,
"num_tokens": 1083432.0,
"step": 560
},
{
"entropy": 7.184946346282959,
"epoch": 0.032832611790684835,
"grad_norm": 0.953125,
"learning_rate": 0.00028199999999999997,
"loss": 6.8004,
"mean_token_accuracy": 0.09656240493059158,
"num_tokens": 1092453.0,
"step": 565
},
{
"entropy": 7.274725437164307,
"epoch": 0.03312316587732807,
"grad_norm": 1.03125,
"learning_rate": 0.0002845,
"loss": 6.8865,
"mean_token_accuracy": 0.08661114051938057,
"num_tokens": 1102402.0,
"step": 570
},
{
"entropy": 7.303795433044433,
"epoch": 0.03341371996397129,
"grad_norm": 1.1171875,
"learning_rate": 0.000287,
"loss": 6.8928,
"mean_token_accuracy": 0.09610759019851685,
"num_tokens": 1111907.0,
"step": 575
},
{
"entropy": 7.228280067443848,
"epoch": 0.03370427405061452,
"grad_norm": 1.125,
"learning_rate": 0.0002895,
"loss": 6.7846,
"mean_token_accuracy": 0.09133462607860565,
"num_tokens": 1120712.0,
"step": 580
},
{
"entropy": 7.0720751762390135,
"epoch": 0.03399482813725775,
"grad_norm": 1.0703125,
"learning_rate": 0.000292,
"loss": 6.6691,
"mean_token_accuracy": 0.0894063800573349,
"num_tokens": 1131165.0,
"step": 585
},
{
"entropy": 7.229758644104004,
"epoch": 0.03428538222390098,
"grad_norm": 1.0625,
"learning_rate": 0.0002945,
"loss": 6.8337,
"mean_token_accuracy": 0.08700250834226608,
"num_tokens": 1140527.0,
"step": 590
},
{
"entropy": 7.137591791152954,
"epoch": 0.03457593631054421,
"grad_norm": 1.140625,
"learning_rate": 0.000297,
"loss": 6.792,
"mean_token_accuracy": 0.08842456936836243,
"num_tokens": 1149977.0,
"step": 595
},
{
"entropy": 7.240325021743774,
"epoch": 0.034866490397187436,
"grad_norm": 1.1328125,
"learning_rate": 0.0002995,
"loss": 6.8153,
"mean_token_accuracy": 0.08972005397081376,
"num_tokens": 1159918.0,
"step": 600
},
{
"entropy": 7.116828918457031,
"epoch": 0.03515704448383067,
"grad_norm": 0.96484375,
"learning_rate": 0.000302,
"loss": 6.7965,
"mean_token_accuracy": 0.08587550893425941,
"num_tokens": 1169218.0,
"step": 605
},
{
"entropy": 7.1641600131988525,
"epoch": 0.035447598570473894,
"grad_norm": 1.1953125,
"learning_rate": 0.0003045,
"loss": 6.8058,
"mean_token_accuracy": 0.09056585654616356,
"num_tokens": 1179429.0,
"step": 610
},
{
"entropy": 7.0538177490234375,
"epoch": 0.03573815265711712,
"grad_norm": 0.953125,
"learning_rate": 0.000307,
"loss": 6.7051,
"mean_token_accuracy": 0.0951805867254734,
"num_tokens": 1189379.0,
"step": 615
},
{
"entropy": 7.165834856033325,
"epoch": 0.03602870674376035,
"grad_norm": 1.1328125,
"learning_rate": 0.0003095,
"loss": 6.6834,
"mean_token_accuracy": 0.09452618882060052,
"num_tokens": 1198643.0,
"step": 620
},
{
"entropy": 7.1435986995697025,
"epoch": 0.03631926083040358,
"grad_norm": 1.203125,
"learning_rate": 0.000312,
"loss": 6.8985,
"mean_token_accuracy": 0.08901753202080727,
"num_tokens": 1207933.0,
"step": 625
},
{
"entropy": 7.125590705871582,
"epoch": 0.03660981491704681,
"grad_norm": 1.1640625,
"learning_rate": 0.0003145,
"loss": 6.7771,
"mean_token_accuracy": 0.09473630785942078,
"num_tokens": 1217000.0,
"step": 630
},
{
"entropy": 7.342123746871948,
"epoch": 0.03690036900369004,
"grad_norm": 1.1796875,
"learning_rate": 0.000317,
"loss": 6.8715,
"mean_token_accuracy": 0.08738602064549923,
"num_tokens": 1227054.0,
"step": 635
},
{
"entropy": 7.0751423835754395,
"epoch": 0.03719092309033326,
"grad_norm": 1.0625,
"learning_rate": 0.0003195,
"loss": 6.8639,
"mean_token_accuracy": 0.08903967961668968,
"num_tokens": 1237126.0,
"step": 640
},
{
"entropy": 7.132748985290528,
"epoch": 0.037481477176976495,
"grad_norm": 1.140625,
"learning_rate": 0.000322,
"loss": 6.7309,
"mean_token_accuracy": 0.09907565861940384,
"num_tokens": 1247404.0,
"step": 645
},
{
"entropy": 7.105540752410889,
"epoch": 0.03777203126361972,
"grad_norm": 0.97265625,
"learning_rate": 0.00032450000000000003,
"loss": 6.6672,
"mean_token_accuracy": 0.08641588017344475,
"num_tokens": 1257130.0,
"step": 650
},
{
"entropy": 7.073269605636597,
"epoch": 0.038062585350262954,
"grad_norm": 1.0234375,
"learning_rate": 0.00032700000000000003,
"loss": 6.7423,
"mean_token_accuracy": 0.09811322540044784,
"num_tokens": 1266931.0,
"step": 655
},
{
"entropy": 7.157707405090332,
"epoch": 0.03835313943690618,
"grad_norm": 0.8828125,
"learning_rate": 0.00032950000000000004,
"loss": 6.7753,
"mean_token_accuracy": 0.08842945359647274,
"num_tokens": 1277770.0,
"step": 660
},
{
"entropy": 7.074891519546509,
"epoch": 0.03864369352354941,
"grad_norm": 1.1953125,
"learning_rate": 0.00033200000000000005,
"loss": 6.6966,
"mean_token_accuracy": 0.09733218997716904,
"num_tokens": 1287188.0,
"step": 665
},
{
"entropy": 7.035866546630859,
"epoch": 0.03893424761019264,
"grad_norm": 1.046875,
"learning_rate": 0.00033450000000000005,
"loss": 6.7408,
"mean_token_accuracy": 0.09134816229343415,
"num_tokens": 1297038.0,
"step": 670
},
{
"entropy": 7.091120624542237,
"epoch": 0.03922480169683586,
"grad_norm": 0.984375,
"learning_rate": 0.000337,
"loss": 6.6964,
"mean_token_accuracy": 0.09473009631037713,
"num_tokens": 1306860.0,
"step": 675
},
{
"entropy": 7.030598735809326,
"epoch": 0.039515355783479096,
"grad_norm": 0.94140625,
"learning_rate": 0.0003395,
"loss": 6.6668,
"mean_token_accuracy": 0.09435953348875045,
"num_tokens": 1316585.0,
"step": 680
},
{
"entropy": 7.1326805591583256,
"epoch": 0.03980590987012232,
"grad_norm": 1.0546875,
"learning_rate": 0.000342,
"loss": 6.7282,
"mean_token_accuracy": 0.09551571607589722,
"num_tokens": 1325601.0,
"step": 685
},
{
"entropy": 7.101321458816528,
"epoch": 0.040096463956765555,
"grad_norm": 1.0625,
"learning_rate": 0.00034449999999999997,
"loss": 6.7604,
"mean_token_accuracy": 0.09247554913163185,
"num_tokens": 1336305.0,
"step": 690
},
{
"entropy": 7.1049731254577635,
"epoch": 0.04038701804340878,
"grad_norm": 1.140625,
"learning_rate": 0.000347,
"loss": 6.6507,
"mean_token_accuracy": 0.09341847449541092,
"num_tokens": 1344820.0,
"step": 695
},
{
"entropy": 6.997063255310058,
"epoch": 0.040677572130052006,
"grad_norm": 1.125,
"learning_rate": 0.0003495,
"loss": 6.6331,
"mean_token_accuracy": 0.09355669766664505,
"num_tokens": 1353950.0,
"step": 700
},
{
"entropy": 7.01454758644104,
"epoch": 0.04096812621669524,
"grad_norm": 1.0078125,
"learning_rate": 0.000352,
"loss": 6.7545,
"mean_token_accuracy": 0.09254956245422363,
"num_tokens": 1364881.0,
"step": 705
},
{
"entropy": 7.0095212936401365,
"epoch": 0.041258680303338464,
"grad_norm": 1.078125,
"learning_rate": 0.0003545,
"loss": 6.7061,
"mean_token_accuracy": 0.09260506108403206,
"num_tokens": 1374018.0,
"step": 710
},
{
"entropy": 7.11537013053894,
"epoch": 0.0415492343899817,
"grad_norm": 1.0625,
"learning_rate": 0.000357,
"loss": 6.6946,
"mean_token_accuracy": 0.08821133449673653,
"num_tokens": 1384319.0,
"step": 715
},
{
"entropy": 6.958690166473389,
"epoch": 0.04183978847662492,
"grad_norm": 1.0546875,
"learning_rate": 0.0003595,
"loss": 6.5713,
"mean_token_accuracy": 0.09440450817346573,
"num_tokens": 1393753.0,
"step": 720
},
{
"entropy": 6.922836446762085,
"epoch": 0.042130342563268156,
"grad_norm": 1.046875,
"learning_rate": 0.000362,
"loss": 6.6616,
"mean_token_accuracy": 0.09427325800061226,
"num_tokens": 1403599.0,
"step": 725
},
{
"entropy": 7.020907402038574,
"epoch": 0.04242089664991138,
"grad_norm": 1.1484375,
"learning_rate": 0.0003645,
"loss": 6.6611,
"mean_token_accuracy": 0.10043973848223686,
"num_tokens": 1412508.0,
"step": 730
},
{
"entropy": 7.071925306320191,
"epoch": 0.04271145073655461,
"grad_norm": 1.125,
"learning_rate": 0.000367,
"loss": 6.8015,
"mean_token_accuracy": 0.0910523734986782,
"num_tokens": 1422776.0,
"step": 735
},
{
"entropy": 6.998428392410278,
"epoch": 0.04300200482319784,
"grad_norm": 1.140625,
"learning_rate": 0.0003695,
"loss": 6.6414,
"mean_token_accuracy": 0.09633751660585403,
"num_tokens": 1432901.0,
"step": 740
},
{
"entropy": 7.035877513885498,
"epoch": 0.043292558909841065,
"grad_norm": 1.0625,
"learning_rate": 0.000372,
"loss": 6.677,
"mean_token_accuracy": 0.09542910531163215,
"num_tokens": 1442916.0,
"step": 745
},
{
"entropy": 6.878139925003052,
"epoch": 0.0435831129964843,
"grad_norm": 1.0078125,
"learning_rate": 0.0003745,
"loss": 6.5395,
"mean_token_accuracy": 0.09616116657853127,
"num_tokens": 1453037.0,
"step": 750
},
{
"entropy": 6.96289029121399,
"epoch": 0.043873667083127524,
"grad_norm": 1.0703125,
"learning_rate": 0.000377,
"loss": 6.6196,
"mean_token_accuracy": 0.10786209627985954,
"num_tokens": 1461963.0,
"step": 755
},
{
"entropy": 7.00122447013855,
"epoch": 0.04416422116977075,
"grad_norm": 1.15625,
"learning_rate": 0.0003795,
"loss": 6.7012,
"mean_token_accuracy": 0.09169812574982643,
"num_tokens": 1471521.0,
"step": 760
},
{
"entropy": 6.930304098129272,
"epoch": 0.04445477525641398,
"grad_norm": 1.21875,
"learning_rate": 0.000382,
"loss": 6.5366,
"mean_token_accuracy": 0.0987947553396225,
"num_tokens": 1481438.0,
"step": 765
},
{
"entropy": 6.89730920791626,
"epoch": 0.04474532934305721,
"grad_norm": 1.1171875,
"learning_rate": 0.0003845,
"loss": 6.5654,
"mean_token_accuracy": 0.09912522435188294,
"num_tokens": 1490522.0,
"step": 770
},
{
"entropy": 6.994078540802002,
"epoch": 0.04503588342970044,
"grad_norm": 0.96875,
"learning_rate": 0.00038700000000000003,
"loss": 6.7343,
"mean_token_accuracy": 0.09250347167253495,
"num_tokens": 1501034.0,
"step": 775
},
{
"entropy": 6.894172525405883,
"epoch": 0.045326437516343666,
"grad_norm": 1.1796875,
"learning_rate": 0.00038950000000000003,
"loss": 6.5391,
"mean_token_accuracy": 0.10528326034545898,
"num_tokens": 1510390.0,
"step": 780
},
{
"entropy": 6.992980337142944,
"epoch": 0.0456169916029869,
"grad_norm": 1.21875,
"learning_rate": 0.00039200000000000004,
"loss": 6.6468,
"mean_token_accuracy": 0.09232402816414834,
"num_tokens": 1520048.0,
"step": 785
},
{
"entropy": 6.977211618423462,
"epoch": 0.045907545689630125,
"grad_norm": 1.2578125,
"learning_rate": 0.00039450000000000005,
"loss": 6.5275,
"mean_token_accuracy": 0.10221462920308114,
"num_tokens": 1529113.0,
"step": 790
},
{
"entropy": 6.760094785690308,
"epoch": 0.04619809977627335,
"grad_norm": 1.09375,
"learning_rate": 0.00039700000000000005,
"loss": 6.6057,
"mean_token_accuracy": 0.09887640923261642,
"num_tokens": 1538573.0,
"step": 795
},
{
"entropy": 6.975562715530396,
"epoch": 0.04648865386291658,
"grad_norm": 1.1640625,
"learning_rate": 0.0003995,
"loss": 6.6064,
"mean_token_accuracy": 0.10373581051826478,
"num_tokens": 1547471.0,
"step": 800
},
{
"entropy": 6.8805656909942625,
"epoch": 0.04677920794955981,
"grad_norm": 1.015625,
"learning_rate": 0.000402,
"loss": 6.5641,
"mean_token_accuracy": 0.10285315811634063,
"num_tokens": 1557259.0,
"step": 805
},
{
"entropy": 7.063277673721314,
"epoch": 0.04706976203620304,
"grad_norm": 0.98046875,
"learning_rate": 0.0004045,
"loss": 6.7921,
"mean_token_accuracy": 0.09200607016682624,
"num_tokens": 1567383.0,
"step": 810
},
{
"entropy": 6.87684121131897,
"epoch": 0.04736031612284627,
"grad_norm": 1.078125,
"learning_rate": 0.00040699999999999997,
"loss": 6.4826,
"mean_token_accuracy": 0.11064840331673623,
"num_tokens": 1577106.0,
"step": 815
},
{
"entropy": 6.807673025131225,
"epoch": 0.04765087020948949,
"grad_norm": 1.0703125,
"learning_rate": 0.0004095,
"loss": 6.5393,
"mean_token_accuracy": 0.10080247670412064,
"num_tokens": 1586100.0,
"step": 820
},
{
"entropy": 6.877712535858154,
"epoch": 0.047941424296132726,
"grad_norm": 1.0859375,
"learning_rate": 0.000412,
"loss": 6.6279,
"mean_token_accuracy": 0.09564873427152634,
"num_tokens": 1596950.0,
"step": 825
},
{
"entropy": 6.891899585723877,
"epoch": 0.04823197838277595,
"grad_norm": 1.0859375,
"learning_rate": 0.0004145,
"loss": 6.5837,
"mean_token_accuracy": 0.09832958057522774,
"num_tokens": 1606001.0,
"step": 830
},
{
"entropy": 6.978082180023193,
"epoch": 0.048522532469419184,
"grad_norm": 1.0390625,
"learning_rate": 0.000417,
"loss": 6.6825,
"mean_token_accuracy": 0.0975476372987032,
"num_tokens": 1616498.0,
"step": 835
},
{
"entropy": 6.831979036331177,
"epoch": 0.04881308655606241,
"grad_norm": 1.2109375,
"learning_rate": 0.0004195,
"loss": 6.5199,
"mean_token_accuracy": 0.10347988307476044,
"num_tokens": 1625195.0,
"step": 840
},
{
"entropy": 6.784482002258301,
"epoch": 0.04910364064270564,
"grad_norm": 1.0,
"learning_rate": 0.000422,
"loss": 6.4476,
"mean_token_accuracy": 0.10162880271673203,
"num_tokens": 1635176.0,
"step": 845
},
{
"entropy": 6.806185960769653,
"epoch": 0.04939419472934887,
"grad_norm": 1.09375,
"learning_rate": 0.0004245,
"loss": 6.553,
"mean_token_accuracy": 0.1015662670135498,
"num_tokens": 1645183.0,
"step": 850
},
{
"entropy": 6.801709985733032,
"epoch": 0.049684748815992094,
"grad_norm": 1.046875,
"learning_rate": 0.000427,
"loss": 6.5479,
"mean_token_accuracy": 0.10148834735155106,
"num_tokens": 1654226.0,
"step": 855
},
{
"entropy": 6.834500074386597,
"epoch": 0.04997530290263533,
"grad_norm": 1.0078125,
"learning_rate": 0.0004295,
"loss": 6.5426,
"mean_token_accuracy": 0.10362305790185929,
"num_tokens": 1664572.0,
"step": 860
},
{
"entropy": 6.950858306884766,
"epoch": 0.05026585698927855,
"grad_norm": 1.0234375,
"learning_rate": 0.000432,
"loss": 6.6472,
"mean_token_accuracy": 0.09981537386775016,
"num_tokens": 1674070.0,
"step": 865
},
{
"entropy": 6.791647720336914,
"epoch": 0.050556411075921785,
"grad_norm": 1.1171875,
"learning_rate": 0.0004345,
"loss": 6.4773,
"mean_token_accuracy": 0.09943379536271095,
"num_tokens": 1683473.0,
"step": 870
},
{
"entropy": 6.777591514587402,
"epoch": 0.05084696516256501,
"grad_norm": 1.1328125,
"learning_rate": 0.000437,
"loss": 6.4869,
"mean_token_accuracy": 0.10118941962718964,
"num_tokens": 1693171.0,
"step": 875
},
{
"entropy": 6.898639726638794,
"epoch": 0.05113751924920824,
"grad_norm": 0.953125,
"learning_rate": 0.0004395,
"loss": 6.606,
"mean_token_accuracy": 0.09705074802041054,
"num_tokens": 1703023.0,
"step": 880
},
{
"entropy": 6.73418025970459,
"epoch": 0.05142807333585147,
"grad_norm": 1.0703125,
"learning_rate": 0.000442,
"loss": 6.4984,
"mean_token_accuracy": 0.1019330695271492,
"num_tokens": 1712698.0,
"step": 885
},
{
"entropy": 6.906363248825073,
"epoch": 0.051718627422494695,
"grad_norm": 1.1171875,
"learning_rate": 0.0004445,
"loss": 6.6098,
"mean_token_accuracy": 0.09838435426354408,
"num_tokens": 1721502.0,
"step": 890
},
{
"entropy": 6.7474723815917965,
"epoch": 0.05200918150913793,
"grad_norm": 1.0546875,
"learning_rate": 0.000447,
"loss": 6.4942,
"mean_token_accuracy": 0.10594057068228721,
"num_tokens": 1730551.0,
"step": 895
},
{
"entropy": 6.808920383453369,
"epoch": 0.052299735595781154,
"grad_norm": 1.015625,
"learning_rate": 0.00044950000000000003,
"loss": 6.5645,
"mean_token_accuracy": 0.10622440055012702,
"num_tokens": 1739368.0,
"step": 900
},
{
"entropy": 6.827513933181763,
"epoch": 0.052590289682424386,
"grad_norm": 1.140625,
"learning_rate": 0.00045200000000000004,
"loss": 6.4176,
"mean_token_accuracy": 0.11146403327584267,
"num_tokens": 1748528.0,
"step": 905
},
{
"entropy": 6.713736248016358,
"epoch": 0.05288084376906761,
"grad_norm": 0.9765625,
"learning_rate": 0.00045450000000000004,
"loss": 6.5739,
"mean_token_accuracy": 0.09899114519357681,
"num_tokens": 1759569.0,
"step": 910
},
{
"entropy": 6.80773286819458,
"epoch": 0.05317139785571084,
"grad_norm": 1.15625,
"learning_rate": 0.00045700000000000005,
"loss": 6.5099,
"mean_token_accuracy": 0.10788461863994599,
"num_tokens": 1769366.0,
"step": 915
},
{
"entropy": 6.76817569732666,
"epoch": 0.05346195194235407,
"grad_norm": 1.046875,
"learning_rate": 0.00045950000000000006,
"loss": 6.6024,
"mean_token_accuracy": 0.09936894476413727,
"num_tokens": 1780155.0,
"step": 920
},
{
"entropy": 6.755830335617065,
"epoch": 0.053752506028997296,
"grad_norm": 1.1484375,
"learning_rate": 0.000462,
"loss": 6.4233,
"mean_token_accuracy": 0.10512633025646209,
"num_tokens": 1789436.0,
"step": 925
},
{
"entropy": 6.823408889770508,
"epoch": 0.05404306011564053,
"grad_norm": 1.1640625,
"learning_rate": 0.0004645,
"loss": 6.5652,
"mean_token_accuracy": 0.0998048096895218,
"num_tokens": 1798836.0,
"step": 930
},
{
"entropy": 6.751146364212036,
"epoch": 0.054333614202283755,
"grad_norm": 1.03125,
"learning_rate": 0.000467,
"loss": 6.444,
"mean_token_accuracy": 0.10532717406749725,
"num_tokens": 1808666.0,
"step": 935
},
{
"entropy": 6.8108867645263675,
"epoch": 0.05462416828892699,
"grad_norm": 1.1171875,
"learning_rate": 0.0004695,
"loss": 6.5972,
"mean_token_accuracy": 0.09496863186359406,
"num_tokens": 1820001.0,
"step": 940
},
{
"entropy": 6.751294231414795,
"epoch": 0.05491472237557021,
"grad_norm": 1.03125,
"learning_rate": 0.000472,
"loss": 6.4693,
"mean_token_accuracy": 0.10566612035036087,
"num_tokens": 1830284.0,
"step": 945
},
{
"entropy": 6.820448493957519,
"epoch": 0.05520527646221344,
"grad_norm": 1.1171875,
"learning_rate": 0.0004745,
"loss": 6.4794,
"mean_token_accuracy": 0.10577797368168831,
"num_tokens": 1839930.0,
"step": 950
},
{
"entropy": 6.629036235809326,
"epoch": 0.05549583054885667,
"grad_norm": 0.98046875,
"learning_rate": 0.000477,
"loss": 6.5675,
"mean_token_accuracy": 0.10090194195508957,
"num_tokens": 1850697.0,
"step": 955
},
{
"entropy": 6.817226839065552,
"epoch": 0.0557863846354999,
"grad_norm": 1.0234375,
"learning_rate": 0.0004795,
"loss": 6.497,
"mean_token_accuracy": 0.10740380734205246,
"num_tokens": 1860196.0,
"step": 960
},
{
"entropy": 6.774875259399414,
"epoch": 0.05607693872214313,
"grad_norm": 1.0546875,
"learning_rate": 0.000482,
"loss": 6.47,
"mean_token_accuracy": 0.1075842596590519,
"num_tokens": 1869000.0,
"step": 965
},
{
"entropy": 6.722468996047974,
"epoch": 0.056367492808786356,
"grad_norm": 1.1875,
"learning_rate": 0.0004845,
"loss": 6.469,
"mean_token_accuracy": 0.10600791200995445,
"num_tokens": 1878687.0,
"step": 970
},
{
"entropy": 6.728367662429809,
"epoch": 0.05665804689542958,
"grad_norm": 1.1640625,
"learning_rate": 0.000487,
"loss": 6.3467,
"mean_token_accuracy": 0.10569515079259872,
"num_tokens": 1886914.0,
"step": 975
},
{
"entropy": 6.671978425979614,
"epoch": 0.056948600982072814,
"grad_norm": 0.97265625,
"learning_rate": 0.0004895,
"loss": 6.5321,
"mean_token_accuracy": 0.10422437414526939,
"num_tokens": 1897392.0,
"step": 980
},
{
"entropy": 6.805356025695801,
"epoch": 0.05723915506871604,
"grad_norm": 1.109375,
"learning_rate": 0.000492,
"loss": 6.488,
"mean_token_accuracy": 0.10600305423140526,
"num_tokens": 1906215.0,
"step": 985
},
{
"entropy": 6.8313037872314455,
"epoch": 0.05752970915535927,
"grad_norm": 1.0859375,
"learning_rate": 0.0004945,
"loss": 6.5017,
"mean_token_accuracy": 0.10730748698115349,
"num_tokens": 1915376.0,
"step": 990
},
{
"entropy": 6.659111022949219,
"epoch": 0.0578202632420025,
"grad_norm": 1.03125,
"learning_rate": 0.000497,
"loss": 6.465,
"mean_token_accuracy": 0.10440039038658142,
"num_tokens": 1925558.0,
"step": 995
},
{
"entropy": 6.676358318328857,
"epoch": 0.05811081732864573,
"grad_norm": 1.0625,
"learning_rate": 0.0004995,
"loss": 6.4301,
"mean_token_accuracy": 0.10430914014577866,
"num_tokens": 1935176.0,
"step": 1000
},
{
"entropy": 6.770152616500854,
"epoch": 0.05840137141528896,
"grad_norm": 0.9921875,
"learning_rate": 0.000499998026082006,
"loss": 6.4924,
"mean_token_accuracy": 0.10445862039923667,
"num_tokens": 1945135.0,
"step": 1005
},
{
"entropy": 6.597527885437012,
"epoch": 0.05869192550193218,
"grad_norm": 1.1875,
"learning_rate": 0.0004999900070995136,
"loss": 6.4838,
"mean_token_accuracy": 0.10765932872891426,
"num_tokens": 1955585.0,
"step": 1010
},
{
"entropy": 6.867468976974488,
"epoch": 0.058982479588575415,
"grad_norm": 1.125,
"learning_rate": 0.0004999758199023239,
"loss": 6.4687,
"mean_token_accuracy": 0.10314074084162712,
"num_tokens": 1964750.0,
"step": 1015
},
{
"entropy": 6.624800300598144,
"epoch": 0.05927303367521864,
"grad_norm": 1.0,
"learning_rate": 0.0004999554648793858,
"loss": 6.5436,
"mean_token_accuracy": 0.10335941463708878,
"num_tokens": 1974697.0,
"step": 1020
},
{
"entropy": 6.7362236976623535,
"epoch": 0.05956358776186187,
"grad_norm": 1.09375,
"learning_rate": 0.0004999289425887425,
"loss": 6.4934,
"mean_token_accuracy": 0.10554013177752494,
"num_tokens": 1983384.0,
"step": 1025
},
{
"entropy": 6.754078722000122,
"epoch": 0.0598541418485051,
"grad_norm": 0.98828125,
"learning_rate": 0.0004998962537575161,
"loss": 6.5229,
"mean_token_accuracy": 0.11017107889056206,
"num_tokens": 1993790.0,
"step": 1030
},
{
"entropy": 6.697407197952271,
"epoch": 0.060144695935148325,
"grad_norm": 1.046875,
"learning_rate": 0.0004998573992818874,
"loss": 6.4027,
"mean_token_accuracy": 0.10623413920402527,
"num_tokens": 2003296.0,
"step": 1035
},
{
"entropy": 6.585323095321655,
"epoch": 0.06043525002179156,
"grad_norm": 1.0625,
"learning_rate": 0.0004998123802270715,
"loss": 6.3345,
"mean_token_accuracy": 0.11027837991714477,
"num_tokens": 2012481.0,
"step": 1040
},
{
"entropy": 6.705205965042114,
"epoch": 0.06072580410843478,
"grad_norm": 1.1796875,
"learning_rate": 0.0004997611978272886,
"loss": 6.4994,
"mean_token_accuracy": 0.10490612536668778,
"num_tokens": 2022382.0,
"step": 1045
},
{
"entropy": 6.638956928253174,
"epoch": 0.061016358195078016,
"grad_norm": 1.0546875,
"learning_rate": 0.0004997038534857298,
"loss": 6.4097,
"mean_token_accuracy": 0.11042128577828407,
"num_tokens": 2032290.0,
"step": 1050
},
{
"entropy": 6.6624797821044925,
"epoch": 0.06130691228172124,
"grad_norm": 0.984375,
"learning_rate": 0.0004996403487745194,
"loss": 6.3594,
"mean_token_accuracy": 0.10972521901130676,
"num_tokens": 2041094.0,
"step": 1055
},
{
"entropy": 6.609392881393433,
"epoch": 0.061597466368364474,
"grad_norm": 1.109375,
"learning_rate": 0.000499570685434671,
"loss": 6.5125,
"mean_token_accuracy": 0.10544388592243195,
"num_tokens": 2051169.0,
"step": 1060
},
{
"entropy": 6.6946526050567625,
"epoch": 0.0618880204550077,
"grad_norm": 1.03125,
"learning_rate": 0.0004994948653760405,
"loss": 6.3966,
"mean_token_accuracy": 0.1103939101099968,
"num_tokens": 2061310.0,
"step": 1065
},
{
"entropy": 6.619559907913208,
"epoch": 0.062178574541650926,
"grad_norm": 1.0390625,
"learning_rate": 0.0004994128906772729,
"loss": 6.3829,
"mean_token_accuracy": 0.10736953839659691,
"num_tokens": 2071537.0,
"step": 1070
},
{
"entropy": 6.6101906299591064,
"epoch": 0.06246912862829416,
"grad_norm": 0.9296875,
"learning_rate": 0.000499324763585746,
"loss": 6.4507,
"mean_token_accuracy": 0.10780780464410782,
"num_tokens": 2082540.0,
"step": 1075
},
{
"entropy": 6.621304225921631,
"epoch": 0.06275968271493738,
"grad_norm": 1.1328125,
"learning_rate": 0.0004992304865175085,
"loss": 6.4413,
"mean_token_accuracy": 0.11023736447095871,
"num_tokens": 2091313.0,
"step": 1080
},
{
"entropy": 6.691177225112915,
"epoch": 0.06305023680158062,
"grad_norm": 1.0234375,
"learning_rate": 0.0004991300620572138,
"loss": 6.4862,
"mean_token_accuracy": 0.10716225057840348,
"num_tokens": 2100826.0,
"step": 1085
},
{
"entropy": 6.671515083312988,
"epoch": 0.06334079088822385,
"grad_norm": 1.0625,
"learning_rate": 0.0004990234929580494,
"loss": 6.4177,
"mean_token_accuracy": 0.10876795202493668,
"num_tokens": 2109798.0,
"step": 1090
},
{
"entropy": 6.640522909164429,
"epoch": 0.06363134497486707,
"grad_norm": 0.9765625,
"learning_rate": 0.0004989107821416609,
"loss": 6.3138,
"mean_token_accuracy": 0.11188038140535354,
"num_tokens": 2119641.0,
"step": 1095
},
{
"entropy": 6.565330696105957,
"epoch": 0.0639218990615103,
"grad_norm": 1.140625,
"learning_rate": 0.0004987919326980723,
"loss": 6.3525,
"mean_token_accuracy": 0.11164129376411439,
"num_tokens": 2128724.0,
"step": 1100
},
{
"entropy": 6.521946573257447,
"epoch": 0.06421245314815353,
"grad_norm": 1.109375,
"learning_rate": 0.0004986669478856011,
"loss": 6.2737,
"mean_token_accuracy": 0.11544388085603714,
"num_tokens": 2137251.0,
"step": 1105
},
{
"entropy": 6.6156073093414305,
"epoch": 0.06450300723479675,
"grad_norm": 1.0,
"learning_rate": 0.0004985358311307688,
"loss": 6.3821,
"mean_token_accuracy": 0.118138437718153,
"num_tokens": 2146978.0,
"step": 1110
},
{
"entropy": 6.669202089309692,
"epoch": 0.06479356132143999,
"grad_norm": 0.98046875,
"learning_rate": 0.0004983985860282081,
"loss": 6.4636,
"mean_token_accuracy": 0.10260412320494652,
"num_tokens": 2157153.0,
"step": 1115
},
{
"entropy": 6.475356149673462,
"epoch": 0.06508411540808322,
"grad_norm": 0.9609375,
"learning_rate": 0.0004982552163405623,
"loss": 6.3599,
"mean_token_accuracy": 0.11348235085606576,
"num_tokens": 2166946.0,
"step": 1120
},
{
"entropy": 6.657857656478882,
"epoch": 0.06537466949472644,
"grad_norm": 1.0703125,
"learning_rate": 0.0004981057259983839,
"loss": 6.3772,
"mean_token_accuracy": 0.11038358807563782,
"num_tokens": 2177249.0,
"step": 1125
},
{
"entropy": 6.466132879257202,
"epoch": 0.06566522358136967,
"grad_norm": 0.99609375,
"learning_rate": 0.0004979501191000262,
"loss": 6.3098,
"mean_token_accuracy": 0.11056527942419052,
"num_tokens": 2187240.0,
"step": 1130
},
{
"entropy": 6.6453643321990965,
"epoch": 0.0659557776680129,
"grad_norm": 1.0625,
"learning_rate": 0.0004977883999115311,
"loss": 6.3145,
"mean_token_accuracy": 0.11672020331025124,
"num_tokens": 2196199.0,
"step": 1135
},
{
"entropy": 6.595391893386841,
"epoch": 0.06624633175465613,
"grad_norm": 1.0703125,
"learning_rate": 0.0004976205728665113,
"loss": 6.2689,
"mean_token_accuracy": 0.11631305515766144,
"num_tokens": 2205726.0,
"step": 1140
},
{
"entropy": 6.587292861938477,
"epoch": 0.06653688584129935,
"grad_norm": 0.9765625,
"learning_rate": 0.0004974466425660307,
"loss": 6.4457,
"mean_token_accuracy": 0.10664665251970291,
"num_tokens": 2216552.0,
"step": 1145
},
{
"entropy": 6.597306776046753,
"epoch": 0.06682743992794259,
"grad_norm": 0.953125,
"learning_rate": 0.0004972666137784759,
"loss": 6.3034,
"mean_token_accuracy": 0.11342373788356781,
"num_tokens": 2225935.0,
"step": 1150
},
{
"entropy": 6.644480466842651,
"epoch": 0.06711799401458582,
"grad_norm": 0.953125,
"learning_rate": 0.0004970804914394271,
"loss": 6.4604,
"mean_token_accuracy": 0.11499964445829391,
"num_tokens": 2235907.0,
"step": 1155
},
{
"entropy": 6.599408388137817,
"epoch": 0.06740854810122904,
"grad_norm": 1.1328125,
"learning_rate": 0.0004968882806515225,
"loss": 6.3881,
"mean_token_accuracy": 0.10959212481975555,
"num_tokens": 2244473.0,
"step": 1160
},
{
"entropy": 6.641416931152344,
"epoch": 0.06769910218787227,
"grad_norm": 1.1875,
"learning_rate": 0.0004966899866843177,
"loss": 6.4123,
"mean_token_accuracy": 0.1027280792593956,
"num_tokens": 2253834.0,
"step": 1165
},
{
"entropy": 6.5416028022766115,
"epoch": 0.0679896562745155,
"grad_norm": 1.015625,
"learning_rate": 0.000496485614974142,
"loss": 6.3413,
"mean_token_accuracy": 0.11207354813814163,
"num_tokens": 2263243.0,
"step": 1170
},
{
"entropy": 6.6198502540588375,
"epoch": 0.06828021036115874,
"grad_norm": 1.0859375,
"learning_rate": 0.0004962751711239492,
"loss": 6.3035,
"mean_token_accuracy": 0.11463942378759384,
"num_tokens": 2273008.0,
"step": 1175
},
{
"entropy": 6.430229234695434,
"epoch": 0.06857076444780195,
"grad_norm": 1.0078125,
"learning_rate": 0.0004960586609031636,
"loss": 6.3457,
"mean_token_accuracy": 0.1155870608985424,
"num_tokens": 2282522.0,
"step": 1180
},
{
"entropy": 6.601986408233643,
"epoch": 0.06886131853444519,
"grad_norm": 1.0625,
"learning_rate": 0.0004958360902475224,
"loss": 6.2529,
"mean_token_accuracy": 0.12027783617377281,
"num_tokens": 2292114.0,
"step": 1185
},
{
"entropy": 6.400939083099365,
"epoch": 0.06915187262108842,
"grad_norm": 0.94921875,
"learning_rate": 0.0004956074652589125,
"loss": 6.1978,
"mean_token_accuracy": 0.12538810446858406,
"num_tokens": 2301592.0,
"step": 1190
},
{
"entropy": 6.51713194847107,
"epoch": 0.06944242670773164,
"grad_norm": 0.9921875,
"learning_rate": 0.0004953727922052035,
"loss": 6.3201,
"mean_token_accuracy": 0.11454231590032578,
"num_tokens": 2310940.0,
"step": 1195
},
{
"entropy": 6.463452672958374,
"epoch": 0.06973298079437487,
"grad_norm": 1.0703125,
"learning_rate": 0.0004951320775200756,
"loss": 6.3959,
"mean_token_accuracy": 0.1151392012834549,
"num_tokens": 2320535.0,
"step": 1200
},
{
"entropy": 6.596390962600708,
"epoch": 0.0700235348810181,
"grad_norm": 0.96875,
"learning_rate": 0.0004948853278028436,
"loss": 6.2563,
"mean_token_accuracy": 0.12523823976516724,
"num_tokens": 2330431.0,
"step": 1205
},
{
"entropy": 6.3869446277618405,
"epoch": 0.07031408896766134,
"grad_norm": 1.0546875,
"learning_rate": 0.0004946325498182755,
"loss": 6.2036,
"mean_token_accuracy": 0.12079060897231102,
"num_tokens": 2339323.0,
"step": 1210
},
{
"entropy": 6.510322713851929,
"epoch": 0.07060464305430456,
"grad_norm": 1.0390625,
"learning_rate": 0.0004943737504964076,
"loss": 6.2992,
"mean_token_accuracy": 0.11487918049097061,
"num_tokens": 2349750.0,
"step": 1215
},
{
"entropy": 6.503530073165893,
"epoch": 0.07089519714094779,
"grad_norm": 1.1171875,
"learning_rate": 0.000494108936932354,
"loss": 6.2558,
"mean_token_accuracy": 0.1210679478943348,
"num_tokens": 2359147.0,
"step": 1220
},
{
"entropy": 6.520279359817505,
"epoch": 0.07118575122759102,
"grad_norm": 0.953125,
"learning_rate": 0.0004938381163861124,
"loss": 6.2786,
"mean_token_accuracy": 0.11829182729125023,
"num_tokens": 2368762.0,
"step": 1225
},
{
"entropy": 6.391372203826904,
"epoch": 0.07147630531423424,
"grad_norm": 0.9765625,
"learning_rate": 0.0004935612962823645,
"loss": 6.1568,
"mean_token_accuracy": 0.12013374790549278,
"num_tokens": 2378060.0,
"step": 1230
},
{
"entropy": 6.465664291381836,
"epoch": 0.07176685940087747,
"grad_norm": 1.0625,
"learning_rate": 0.0004932784842102739,
"loss": 6.2575,
"mean_token_accuracy": 0.12200002744793892,
"num_tokens": 2386997.0,
"step": 1235
},
{
"entropy": 6.6493157863616945,
"epoch": 0.0720574134875207,
"grad_norm": 1.2578125,
"learning_rate": 0.0004929896879232758,
"loss": 6.4026,
"mean_token_accuracy": 0.11086667999625206,
"num_tokens": 2396980.0,
"step": 1240
},
{
"entropy": 6.435001850128174,
"epoch": 0.07234796757416392,
"grad_norm": 1.0703125,
"learning_rate": 0.0004926949153388668,
"loss": 6.2556,
"mean_token_accuracy": 0.1203616626560688,
"num_tokens": 2406450.0,
"step": 1245
},
{
"entropy": 6.519892168045044,
"epoch": 0.07263852166080716,
"grad_norm": 1.03125,
"learning_rate": 0.0004923941745383859,
"loss": 6.2632,
"mean_token_accuracy": 0.11274134442210197,
"num_tokens": 2415985.0,
"step": 1250
},
{
"entropy": 6.457003879547119,
"epoch": 0.07292907574745039,
"grad_norm": 0.94921875,
"learning_rate": 0.000492087473766794,
"loss": 6.2928,
"mean_token_accuracy": 0.11486212983727455,
"num_tokens": 2425676.0,
"step": 1255
},
{
"entropy": 6.508018493652344,
"epoch": 0.07321962983409362,
"grad_norm": 1.0,
"learning_rate": 0.000491774821432448,
"loss": 6.2922,
"mean_token_accuracy": 0.10985862240195274,
"num_tokens": 2435918.0,
"step": 1260
},
{
"entropy": 6.5097509860992435,
"epoch": 0.07351018392073684,
"grad_norm": 1.0703125,
"learning_rate": 0.0004914562261068693,
"loss": 6.3562,
"mean_token_accuracy": 0.11788229197263718,
"num_tokens": 2445267.0,
"step": 1265
},
{
"entropy": 6.599736261367798,
"epoch": 0.07380073800738007,
"grad_norm": 1.140625,
"learning_rate": 0.0004911316965245098,
"loss": 6.3224,
"mean_token_accuracy": 0.11191006749868393,
"num_tokens": 2455885.0,
"step": 1270
},
{
"entropy": 6.489064168930054,
"epoch": 0.0740912920940233,
"grad_norm": 1.0234375,
"learning_rate": 0.000490801241582512,
"loss": 6.3483,
"mean_token_accuracy": 0.11579938605427742,
"num_tokens": 2465604.0,
"step": 1275
},
{
"entropy": 6.5532605171203615,
"epoch": 0.07438184618066652,
"grad_norm": 1.1015625,
"learning_rate": 0.000490464870340465,
"loss": 6.4458,
"mean_token_accuracy": 0.10784725919365883,
"num_tokens": 2475168.0,
"step": 1280
},
{
"entropy": 6.473039054870606,
"epoch": 0.07467240026730976,
"grad_norm": 1.1796875,
"learning_rate": 0.0004901225920201563,
"loss": 6.2243,
"mean_token_accuracy": 0.12185250818729401,
"num_tokens": 2484185.0,
"step": 1285
},
{
"entropy": 6.583461809158325,
"epoch": 0.07496295435395299,
"grad_norm": 1.1171875,
"learning_rate": 0.000489774416005319,
"loss": 6.3387,
"mean_token_accuracy": 0.11904568299651146,
"num_tokens": 2492992.0,
"step": 1290
},
{
"entropy": 6.418948078155518,
"epoch": 0.07525350844059622,
"grad_norm": 1.03125,
"learning_rate": 0.0004894203518413742,
"loss": 6.2065,
"mean_token_accuracy": 0.119369375705719,
"num_tokens": 2502541.0,
"step": 1295
},
{
"entropy": 6.468045377731324,
"epoch": 0.07554406252723944,
"grad_norm": 1.0546875,
"learning_rate": 0.0004890604092351701,
"loss": 6.2364,
"mean_token_accuracy": 0.11862708181142807,
"num_tokens": 2511947.0,
"step": 1300
},
{
"entropy": 6.385909509658814,
"epoch": 0.07583461661388267,
"grad_norm": 1.0703125,
"learning_rate": 0.000488694598054715,
"loss": 6.2525,
"mean_token_accuracy": 0.12124920263886452,
"num_tokens": 2521727.0,
"step": 1305
},
{
"entropy": 6.531244993209839,
"epoch": 0.07612517070052591,
"grad_norm": 1.0625,
"learning_rate": 0.0004883229283289071,
"loss": 6.2694,
"mean_token_accuracy": 0.1218131199479103,
"num_tokens": 2530680.0,
"step": 1310
},
{
"entropy": 6.422513055801391,
"epoch": 0.07641572478716913,
"grad_norm": 1.0703125,
"learning_rate": 0.00048794541024725993,
"loss": 6.1542,
"mean_token_accuracy": 0.12266649156808854,
"num_tokens": 2539414.0,
"step": 1315
},
{
"entropy": 6.491461181640625,
"epoch": 0.07670627887381236,
"grad_norm": 1.0390625,
"learning_rate": 0.0004875620541596221,
"loss": 6.3072,
"mean_token_accuracy": 0.1141884945333004,
"num_tokens": 2549609.0,
"step": 1320
},
{
"entropy": 6.4648158073425295,
"epoch": 0.07699683296045559,
"grad_norm": 1.0625,
"learning_rate": 0.00048717287057589454,
"loss": 6.2773,
"mean_token_accuracy": 0.11799687221646309,
"num_tokens": 2560081.0,
"step": 1325
},
{
"entropy": 6.400183534622192,
"epoch": 0.07728738704709882,
"grad_norm": 1.09375,
"learning_rate": 0.0004867778701657417,
"loss": 6.2328,
"mean_token_accuracy": 0.11631238982081413,
"num_tokens": 2569995.0,
"step": 1330
},
{
"entropy": 6.37140007019043,
"epoch": 0.07757794113374204,
"grad_norm": 1.046875,
"learning_rate": 0.00048637706375829955,
"loss": 6.1738,
"mean_token_accuracy": 0.1213558554649353,
"num_tokens": 2579502.0,
"step": 1335
},
{
"entropy": 6.476347970962524,
"epoch": 0.07786849522038528,
"grad_norm": 0.9921875,
"learning_rate": 0.000485970462341878,
"loss": 6.2553,
"mean_token_accuracy": 0.12006450816988945,
"num_tokens": 2589515.0,
"step": 1340
},
{
"entropy": 6.434140920639038,
"epoch": 0.07815904930702851,
"grad_norm": 1.0859375,
"learning_rate": 0.00048555807706366044,
"loss": 6.1897,
"mean_token_accuracy": 0.12782623916864394,
"num_tokens": 2598822.0,
"step": 1345
},
{
"entropy": 6.443134021759033,
"epoch": 0.07844960339367173,
"grad_norm": 0.93359375,
"learning_rate": 0.00048513991922939756,
"loss": 6.315,
"mean_token_accuracy": 0.11421679928898812,
"num_tokens": 2609169.0,
"step": 1350
},
{
"entropy": 6.484804105758667,
"epoch": 0.07874015748031496,
"grad_norm": 0.98046875,
"learning_rate": 0.00048471600030309744,
"loss": 6.2716,
"mean_token_accuracy": 0.11644304916262627,
"num_tokens": 2618683.0,
"step": 1355
},
{
"entropy": 6.466926431655883,
"epoch": 0.07903071156695819,
"grad_norm": 1.140625,
"learning_rate": 0.00048428633190671186,
"loss": 6.2371,
"mean_token_accuracy": 0.12091248780488968,
"num_tokens": 2627976.0,
"step": 1360
},
{
"entropy": 6.505730533599854,
"epoch": 0.07932126565360141,
"grad_norm": 1.0703125,
"learning_rate": 0.0004838509258198167,
"loss": 6.294,
"mean_token_accuracy": 0.11860666498541832,
"num_tokens": 2637235.0,
"step": 1365
},
{
"entropy": 6.393795537948608,
"epoch": 0.07961181974024464,
"grad_norm": 0.984375,
"learning_rate": 0.00048340979397929,
"loss": 6.2951,
"mean_token_accuracy": 0.11754858568310737,
"num_tokens": 2646698.0,
"step": 1370
},
{
"entropy": 6.505375099182129,
"epoch": 0.07990237382688788,
"grad_norm": 1.125,
"learning_rate": 0.00048296294847898386,
"loss": 6.2788,
"mean_token_accuracy": 0.12090856656432152,
"num_tokens": 2656357.0,
"step": 1375
},
{
"entropy": 6.434703159332275,
"epoch": 0.08019292791353111,
"grad_norm": 1.0859375,
"learning_rate": 0.0004825104015693934,
"loss": 6.1776,
"mean_token_accuracy": 0.11764631941914558,
"num_tokens": 2665561.0,
"step": 1380
},
{
"entropy": 6.437805318832398,
"epoch": 0.08048348200017433,
"grad_norm": 1.0859375,
"learning_rate": 0.0004820521656573208,
"loss": 6.1909,
"mean_token_accuracy": 0.12296778410673141,
"num_tokens": 2674600.0,
"step": 1385
},
{
"entropy": 6.368801641464233,
"epoch": 0.08077403608681756,
"grad_norm": 1.0234375,
"learning_rate": 0.00048158825330553505,
"loss": 6.1838,
"mean_token_accuracy": 0.12880179584026336,
"num_tokens": 2684944.0,
"step": 1390
},
{
"entropy": 6.461294555664063,
"epoch": 0.0810645901734608,
"grad_norm": 1.0078125,
"learning_rate": 0.00048111867723242763,
"loss": 6.1342,
"mean_token_accuracy": 0.12006727010011672,
"num_tokens": 2694467.0,
"step": 1395
},
{
"entropy": 6.442787504196167,
"epoch": 0.08135514426010401,
"grad_norm": 1.0546875,
"learning_rate": 0.0004806434503116637,
"loss": 6.2769,
"mean_token_accuracy": 0.11950750723481178,
"num_tokens": 2704499.0,
"step": 1400
},
{
"entropy": 6.378614997863769,
"epoch": 0.08164569834674724,
"grad_norm": 1.0,
"learning_rate": 0.0004801625855718296,
"loss": 6.1896,
"mean_token_accuracy": 0.11940810978412628,
"num_tokens": 2715424.0,
"step": 1405
},
{
"entropy": 6.41011266708374,
"epoch": 0.08193625243339048,
"grad_norm": 1.09375,
"learning_rate": 0.00047967609619607477,
"loss": 6.1788,
"mean_token_accuracy": 0.12036006227135658,
"num_tokens": 2724805.0,
"step": 1410
},
{
"entropy": 6.3130451202392575,
"epoch": 0.08222680652003371,
"grad_norm": 1.0234375,
"learning_rate": 0.0004791839955217513,
"loss": 6.1481,
"mean_token_accuracy": 0.12863539010286332,
"num_tokens": 2734216.0,
"step": 1415
},
{
"entropy": 6.424062490463257,
"epoch": 0.08251736060667693,
"grad_norm": 1.0234375,
"learning_rate": 0.00047868629704004786,
"loss": 6.2572,
"mean_token_accuracy": 0.11476619765162469,
"num_tokens": 2744146.0,
"step": 1420
},
{
"entropy": 6.422879314422607,
"epoch": 0.08280791469332016,
"grad_norm": 1.046875,
"learning_rate": 0.00047818301439561965,
"loss": 6.2419,
"mean_token_accuracy": 0.12102322354912758,
"num_tokens": 2754000.0,
"step": 1425
},
{
"entropy": 6.637474250793457,
"epoch": 0.0830984687799634,
"grad_norm": 1.046875,
"learning_rate": 0.00047767416138621454,
"loss": 6.288,
"mean_token_accuracy": 0.11775907129049301,
"num_tokens": 2763185.0,
"step": 1430
},
{
"entropy": 6.372423696517944,
"epoch": 0.08338902286660661,
"grad_norm": 1.078125,
"learning_rate": 0.000477159751962295,
"loss": 6.2381,
"mean_token_accuracy": 0.11884959116578102,
"num_tokens": 2773324.0,
"step": 1435
},
{
"entropy": 6.485676908493042,
"epoch": 0.08367957695324985,
"grad_norm": 1.0546875,
"learning_rate": 0.00047663980022665507,
"loss": 6.2207,
"mean_token_accuracy": 0.11649533435702324,
"num_tokens": 2783184.0,
"step": 1440
},
{
"entropy": 6.396980142593383,
"epoch": 0.08397013103989308,
"grad_norm": 0.9296875,
"learning_rate": 0.00047611432043403437,
"loss": 6.2223,
"mean_token_accuracy": 0.11544240266084671,
"num_tokens": 2793278.0,
"step": 1445
},
{
"entropy": 6.366146802902222,
"epoch": 0.08426068512653631,
"grad_norm": 1.0625,
"learning_rate": 0.0004755833269907267,
"loss": 6.1262,
"mean_token_accuracy": 0.12203074395656585,
"num_tokens": 2802164.0,
"step": 1450
},
{
"entropy": 6.457718706130981,
"epoch": 0.08455123921317953,
"grad_norm": 1.0078125,
"learning_rate": 0.0004750468344541857,
"loss": 6.1891,
"mean_token_accuracy": 0.11854342371225357,
"num_tokens": 2811537.0,
"step": 1455
},
{
"entropy": 6.381798458099365,
"epoch": 0.08484179329982276,
"grad_norm": 1.0546875,
"learning_rate": 0.00047450485753262525,
"loss": 6.2965,
"mean_token_accuracy": 0.11684540212154389,
"num_tokens": 2821861.0,
"step": 1460
},
{
"entropy": 6.412109518051148,
"epoch": 0.085132347386466,
"grad_norm": 0.98046875,
"learning_rate": 0.00047395741108461633,
"loss": 6.1718,
"mean_token_accuracy": 0.12374548763036727,
"num_tokens": 2831916.0,
"step": 1465
},
{
"entropy": 6.33392972946167,
"epoch": 0.08542290147310921,
"grad_norm": 1.0546875,
"learning_rate": 0.00047340451011867985,
"loss": 6.1604,
"mean_token_accuracy": 0.12683377638459206,
"num_tokens": 2840979.0,
"step": 1470
},
{
"entropy": 6.418259906768799,
"epoch": 0.08571345555975245,
"grad_norm": 1.1015625,
"learning_rate": 0.00047284616979287515,
"loss": 6.1782,
"mean_token_accuracy": 0.11932171955704689,
"num_tokens": 2851332.0,
"step": 1475
},
{
"entropy": 6.265405559539795,
"epoch": 0.08600400964639568,
"grad_norm": 1.03125,
"learning_rate": 0.00047228240541438433,
"loss": 6.073,
"mean_token_accuracy": 0.12999156266450881,
"num_tokens": 2860134.0,
"step": 1480
},
{
"entropy": 6.458755302429199,
"epoch": 0.08629456373303891,
"grad_norm": 1.1171875,
"learning_rate": 0.00047171323243909257,
"loss": 6.2126,
"mean_token_accuracy": 0.11848914325237274,
"num_tokens": 2869218.0,
"step": 1485
},
{
"entropy": 6.345139837265014,
"epoch": 0.08658511781968213,
"grad_norm": 0.98828125,
"learning_rate": 0.00047113866647116457,
"loss": 6.1426,
"mean_token_accuracy": 0.12274593263864517,
"num_tokens": 2878529.0,
"step": 1490
},
{
"entropy": 6.426075124740601,
"epoch": 0.08687567190632536,
"grad_norm": 1.0625,
"learning_rate": 0.0004705587232626164,
"loss": 6.1579,
"mean_token_accuracy": 0.11727055683732032,
"num_tokens": 2888149.0,
"step": 1495
},
{
"entropy": 6.3561450958251955,
"epoch": 0.0871662259929686,
"grad_norm": 1.03125,
"learning_rate": 0.00046997341871288424,
"loss": 6.1347,
"mean_token_accuracy": 0.12332948073744773,
"num_tokens": 2897790.0,
"step": 1500
},
{
"entropy": 6.316312408447265,
"epoch": 0.08745678007961181,
"grad_norm": 0.87890625,
"learning_rate": 0.0004693827688683879,
"loss": 6.2274,
"mean_token_accuracy": 0.12053183913230896,
"num_tokens": 2908168.0,
"step": 1505
},
{
"entropy": 6.390694427490234,
"epoch": 0.08774733416625505,
"grad_norm": 1.0,
"learning_rate": 0.0004687867899220914,
"loss": 6.116,
"mean_token_accuracy": 0.12294506877660752,
"num_tokens": 2918734.0,
"step": 1510
},
{
"entropy": 6.296877431869507,
"epoch": 0.08803788825289828,
"grad_norm": 1.0078125,
"learning_rate": 0.00046818549821305846,
"loss": 6.0839,
"mean_token_accuracy": 0.1293163001537323,
"num_tokens": 2927599.0,
"step": 1515
},
{
"entropy": 6.2974005222320555,
"epoch": 0.0883284423395415,
"grad_norm": 1.0234375,
"learning_rate": 0.00046757891022600494,
"loss": 6.1189,
"mean_token_accuracy": 0.12587246671319008,
"num_tokens": 2936707.0,
"step": 1520
},
{
"entropy": 6.4475304126739506,
"epoch": 0.08861899642618473,
"grad_norm": 1.0078125,
"learning_rate": 0.0004669670425908471,
"loss": 6.187,
"mean_token_accuracy": 0.12100831568241119,
"num_tokens": 2945607.0,
"step": 1525
},
{
"entropy": 6.347147464752197,
"epoch": 0.08890955051282796,
"grad_norm": 1.09375,
"learning_rate": 0.0004663499120822451,
"loss": 6.0989,
"mean_token_accuracy": 0.12438113316893577,
"num_tokens": 2954836.0,
"step": 1530
},
{
"entropy": 6.321421432495117,
"epoch": 0.0892001045994712,
"grad_norm": 1.0234375,
"learning_rate": 0.0004657275356191437,
"loss": 6.1061,
"mean_token_accuracy": 0.12466374784708023,
"num_tokens": 2964338.0,
"step": 1535
},
{
"entropy": 6.353213739395142,
"epoch": 0.08949065868611442,
"grad_norm": 1.0546875,
"learning_rate": 0.00046509993026430804,
"loss": 6.1634,
"mean_token_accuracy": 0.12038285210728646,
"num_tokens": 2973943.0,
"step": 1540
},
{
"entropy": 6.339908075332642,
"epoch": 0.08978121277275765,
"grad_norm": 1.0546875,
"learning_rate": 0.0004644671132238558,
"loss": 6.0839,
"mean_token_accuracy": 0.12774784490466118,
"num_tokens": 2983315.0,
"step": 1545
},
{
"entropy": 6.300918197631836,
"epoch": 0.09007176685940088,
"grad_norm": 1.1640625,
"learning_rate": 0.00046382910184678585,
"loss": 6.0278,
"mean_token_accuracy": 0.12856598794460297,
"num_tokens": 2992039.0,
"step": 1550
},
{
"entropy": 6.178817892074585,
"epoch": 0.0903623209460441,
"grad_norm": 0.9765625,
"learning_rate": 0.0004631859136245025,
"loss": 6.0594,
"mean_token_accuracy": 0.12656542137265206,
"num_tokens": 3001428.0,
"step": 1555
},
{
"entropy": 6.426393222808838,
"epoch": 0.09065287503268733,
"grad_norm": 0.96875,
"learning_rate": 0.0004625375661903357,
"loss": 6.1823,
"mean_token_accuracy": 0.12130758315324783,
"num_tokens": 3012060.0,
"step": 1560
},
{
"entropy": 6.289572381973267,
"epoch": 0.09094342911933057,
"grad_norm": 1.03125,
"learning_rate": 0.00046188407731905787,
"loss": 6.133,
"mean_token_accuracy": 0.11978519856929778,
"num_tokens": 3021371.0,
"step": 1565
},
{
"entropy": 6.355306005477905,
"epoch": 0.0912339832059738,
"grad_norm": 1.0390625,
"learning_rate": 0.00046122546492639643,
"loss": 6.1783,
"mean_token_accuracy": 0.12279156744480133,
"num_tokens": 3030934.0,
"step": 1570
},
{
"entropy": 6.265284681320191,
"epoch": 0.09152453729261702,
"grad_norm": 1.0546875,
"learning_rate": 0.000460561747068543,
"loss": 6.1286,
"mean_token_accuracy": 0.12520743757486344,
"num_tokens": 3041182.0,
"step": 1575
},
{
"entropy": 6.356278705596924,
"epoch": 0.09181509137926025,
"grad_norm": 1.0859375,
"learning_rate": 0.0004598929419416578,
"loss": 6.0982,
"mean_token_accuracy": 0.12530012279748917,
"num_tokens": 3050086.0,
"step": 1580
},
{
"entropy": 6.332121706008911,
"epoch": 0.09210564546590348,
"grad_norm": 1.078125,
"learning_rate": 0.00045921906788137123,
"loss": 6.2171,
"mean_token_accuracy": 0.12314857169985771,
"num_tokens": 3061403.0,
"step": 1585
},
{
"entropy": 6.35912766456604,
"epoch": 0.0923961995525467,
"grad_norm": 1.078125,
"learning_rate": 0.00045854014336228115,
"loss": 6.1708,
"mean_token_accuracy": 0.12304715439677238,
"num_tokens": 3070942.0,
"step": 1590
},
{
"entropy": 6.291305208206177,
"epoch": 0.09268675363918993,
"grad_norm": 1.0859375,
"learning_rate": 0.00045785618699744615,
"loss": 6.0504,
"mean_token_accuracy": 0.12217177525162697,
"num_tokens": 3079526.0,
"step": 1595
},
{
"entropy": 6.269596576690674,
"epoch": 0.09297730772583317,
"grad_norm": 1.0546875,
"learning_rate": 0.00045716721753787543,
"loss": 6.0384,
"mean_token_accuracy": 0.12933970913290976,
"num_tokens": 3090977.0,
"step": 1600
},
{
"entropy": 6.290025997161865,
"epoch": 0.0932678618124764,
"grad_norm": 1.0859375,
"learning_rate": 0.0004564732538720148,
"loss": 6.1565,
"mean_token_accuracy": 0.1253731794655323,
"num_tokens": 3100830.0,
"step": 1605
},
{
"entropy": 6.346334457397461,
"epoch": 0.09355841589911962,
"grad_norm": 0.921875,
"learning_rate": 0.00045577431502522877,
"loss": 6.1792,
"mean_token_accuracy": 0.12612521946430205,
"num_tokens": 3110285.0,
"step": 1610
},
{
"entropy": 6.386321640014648,
"epoch": 0.09384896998576285,
"grad_norm": 0.9921875,
"learning_rate": 0.0004550704201592787,
"loss": 6.0621,
"mean_token_accuracy": 0.12808025181293486,
"num_tokens": 3119690.0,
"step": 1615
},
{
"entropy": 6.274943828582764,
"epoch": 0.09413952407240608,
"grad_norm": 1.0,
"learning_rate": 0.0004543615885717981,
"loss": 6.1145,
"mean_token_accuracy": 0.12201056703925132,
"num_tokens": 3129656.0,
"step": 1620
},
{
"entropy": 6.2934671401977536,
"epoch": 0.0944300781590493,
"grad_norm": 1.0078125,
"learning_rate": 0.00045364783969576296,
"loss": 6.0519,
"mean_token_accuracy": 0.12800228744745254,
"num_tokens": 3140083.0,
"step": 1625
},
{
"entropy": 6.261609125137329,
"epoch": 0.09472063224569253,
"grad_norm": 1.078125,
"learning_rate": 0.0004529291930989592,
"loss": 6.0483,
"mean_token_accuracy": 0.13036949634552003,
"num_tokens": 3149747.0,
"step": 1630
},
{
"entropy": 6.2602826118469235,
"epoch": 0.09501118633233577,
"grad_norm": 0.9296875,
"learning_rate": 0.0004522056684834464,
"loss": 6.019,
"mean_token_accuracy": 0.12770563066005708,
"num_tokens": 3160367.0,
"step": 1635
},
{
"entropy": 6.249707126617432,
"epoch": 0.09530174041897899,
"grad_norm": 1.0625,
"learning_rate": 0.0004514772856850173,
"loss": 6.0068,
"mean_token_accuracy": 0.12763984724879265,
"num_tokens": 3169375.0,
"step": 1640
},
{
"entropy": 6.236002111434937,
"epoch": 0.09559229450562222,
"grad_norm": 1.0625,
"learning_rate": 0.0004507440646726542,
"loss": 6.0794,
"mean_token_accuracy": 0.13096466660499573,
"num_tokens": 3178907.0,
"step": 1645
},
{
"entropy": 6.366798305511475,
"epoch": 0.09588284859226545,
"grad_norm": 1.0,
"learning_rate": 0.0004500060255479818,
"loss": 6.0808,
"mean_token_accuracy": 0.12382574900984764,
"num_tokens": 3189336.0,
"step": 1650
},
{
"entropy": 6.227943420410156,
"epoch": 0.09617340267890868,
"grad_norm": 1.1015625,
"learning_rate": 0.0004492631885447151,
"loss": 6.1707,
"mean_token_accuracy": 0.12618450224399566,
"num_tokens": 3198787.0,
"step": 1655
},
{
"entropy": 6.291205787658692,
"epoch": 0.0964639567655519,
"grad_norm": 1.0625,
"learning_rate": 0.00044851557402810616,
"loss": 6.0351,
"mean_token_accuracy": 0.1262456052005291,
"num_tokens": 3208161.0,
"step": 1660
},
{
"entropy": 6.287449550628662,
"epoch": 0.09675451085219514,
"grad_norm": 1.015625,
"learning_rate": 0.00044776320249438444,
"loss": 6.095,
"mean_token_accuracy": 0.1295604422688484,
"num_tokens": 3217589.0,
"step": 1665
},
{
"entropy": 6.171545219421387,
"epoch": 0.09704506493883837,
"grad_norm": 0.9296875,
"learning_rate": 0.00044700609457019565,
"loss": 6.0335,
"mean_token_accuracy": 0.12443587705492973,
"num_tokens": 3227159.0,
"step": 1670
},
{
"entropy": 6.260297155380249,
"epoch": 0.09733561902548159,
"grad_norm": 0.984375,
"learning_rate": 0.0004462442710120359,
"loss": 6.0323,
"mean_token_accuracy": 0.13212064653635025,
"num_tokens": 3236765.0,
"step": 1675
},
{
"entropy": 6.34990553855896,
"epoch": 0.09762617311212482,
"grad_norm": 0.9140625,
"learning_rate": 0.000445477752705683,
"loss": 6.1196,
"mean_token_accuracy": 0.12271321415901185,
"num_tokens": 3247136.0,
"step": 1680
},
{
"entropy": 6.242118835449219,
"epoch": 0.09791672719876805,
"grad_norm": 1.1171875,
"learning_rate": 0.00044470656066562336,
"loss": 6.1049,
"mean_token_accuracy": 0.12386861220002174,
"num_tokens": 3256880.0,
"step": 1685
},
{
"entropy": 6.314453554153443,
"epoch": 0.09820728128541129,
"grad_norm": 1.015625,
"learning_rate": 0.0004439307160344765,
"loss": 6.138,
"mean_token_accuracy": 0.12304992526769638,
"num_tokens": 3267104.0,
"step": 1690
},
{
"entropy": 6.257071495056152,
"epoch": 0.0984978353720545,
"grad_norm": 1.0703125,
"learning_rate": 0.00044315024008241473,
"loss": 6.0882,
"mean_token_accuracy": 0.12182446792721749,
"num_tokens": 3276165.0,
"step": 1695
},
{
"entropy": 6.447886228561401,
"epoch": 0.09878838945869774,
"grad_norm": 1.015625,
"learning_rate": 0.0004423651542065806,
"loss": 6.2112,
"mean_token_accuracy": 0.12100318372249604,
"num_tokens": 3285600.0,
"step": 1700
},
{
"entropy": 6.263007783889771,
"epoch": 0.09907894354534097,
"grad_norm": 1.03125,
"learning_rate": 0.00044157547993050006,
"loss": 6.1135,
"mean_token_accuracy": 0.12620161846280098,
"num_tokens": 3295654.0,
"step": 1705
},
{
"entropy": 6.253702402114868,
"epoch": 0.09936949763198419,
"grad_norm": 1.0703125,
"learning_rate": 0.00044078123890349227,
"loss": 6.0644,
"mean_token_accuracy": 0.134315574914217,
"num_tokens": 3304743.0,
"step": 1710
},
{
"entropy": 6.289787006378174,
"epoch": 0.09966005171862742,
"grad_norm": 1.0390625,
"learning_rate": 0.00043998245290007606,
"loss": 6.0324,
"mean_token_accuracy": 0.12361097186803818,
"num_tokens": 3313951.0,
"step": 1715
},
{
"entropy": 6.332495594024659,
"epoch": 0.09995060580527065,
"grad_norm": 0.98828125,
"learning_rate": 0.00043917914381937323,
"loss": 6.0995,
"mean_token_accuracy": 0.1251884751021862,
"num_tokens": 3324508.0,
"step": 1720
},
{
"entropy": 6.194294214248657,
"epoch": 0.10024115989191389,
"grad_norm": 0.94140625,
"learning_rate": 0.00043837133368450815,
"loss": 6.054,
"mean_token_accuracy": 0.12405704930424691,
"num_tokens": 3335373.0,
"step": 1725
},
{
"entropy": 6.333467721939087,
"epoch": 0.1005317139785571,
"grad_norm": 1.0234375,
"learning_rate": 0.0004375590446420037,
"loss": 6.0678,
"mean_token_accuracy": 0.12813965305685998,
"num_tokens": 3345242.0,
"step": 1730
},
{
"entropy": 6.268236112594605,
"epoch": 0.10082226806520034,
"grad_norm": 1.125,
"learning_rate": 0.0004367422989611743,
"loss": 6.0504,
"mean_token_accuracy": 0.13582077920436858,
"num_tokens": 3354783.0,
"step": 1735
},
{
"entropy": 6.266420888900757,
"epoch": 0.10111282215184357,
"grad_norm": 0.9921875,
"learning_rate": 0.0004359211190335153,
"loss": 6.0742,
"mean_token_accuracy": 0.13280235901474952,
"num_tokens": 3363705.0,
"step": 1740
},
{
"entropy": 6.345703363418579,
"epoch": 0.10140337623848679,
"grad_norm": 1.0625,
"learning_rate": 0.00043509552737208923,
"loss": 6.1009,
"mean_token_accuracy": 0.12972408011555672,
"num_tokens": 3372331.0,
"step": 1745
},
{
"entropy": 6.221278953552246,
"epoch": 0.10169393032513002,
"grad_norm": 0.984375,
"learning_rate": 0.00043426554661090853,
"loss": 6.0122,
"mean_token_accuracy": 0.13363172858953476,
"num_tokens": 3380986.0,
"step": 1750
},
{
"entropy": 6.314662122726441,
"epoch": 0.10198448441177325,
"grad_norm": 1.046875,
"learning_rate": 0.00043343119950431516,
"loss": 6.0681,
"mean_token_accuracy": 0.12935666590929032,
"num_tokens": 3390852.0,
"step": 1755
},
{
"entropy": 6.2016339778900145,
"epoch": 0.10227503849841647,
"grad_norm": 1.03125,
"learning_rate": 0.00043259250892635644,
"loss": 6.0835,
"mean_token_accuracy": 0.1321997858583927,
"num_tokens": 3399916.0,
"step": 1760
},
{
"entropy": 6.3428630352020265,
"epoch": 0.1025655925850597,
"grad_norm": 0.92578125,
"learning_rate": 0.0004317494978701582,
"loss": 6.0995,
"mean_token_accuracy": 0.13536889478564262,
"num_tokens": 3409913.0,
"step": 1765
},
{
"entropy": 6.2461179256439205,
"epoch": 0.10285614667170294,
"grad_norm": 0.953125,
"learning_rate": 0.0004309021894472943,
"loss": 6.1217,
"mean_token_accuracy": 0.12532801926136017,
"num_tokens": 3420817.0,
"step": 1770
},
{
"entropy": 6.281768560409546,
"epoch": 0.10314670075834617,
"grad_norm": 0.9453125,
"learning_rate": 0.0004300506068871534,
"loss": 6.0642,
"mean_token_accuracy": 0.13035471364855766,
"num_tokens": 3430873.0,
"step": 1775
},
{
"entropy": 6.25596866607666,
"epoch": 0.10343725484498939,
"grad_norm": 1.0859375,
"learning_rate": 0.00042919477353630135,
"loss": 5.9967,
"mean_token_accuracy": 0.13541611135005951,
"num_tokens": 3440078.0,
"step": 1780
},
{
"entropy": 6.170544290542603,
"epoch": 0.10372780893163262,
"grad_norm": 0.9609375,
"learning_rate": 0.000428334712857842,
"loss": 5.9563,
"mean_token_accuracy": 0.1369057409465313,
"num_tokens": 3449029.0,
"step": 1785
},
{
"entropy": 6.206426477432251,
"epoch": 0.10401836301827586,
"grad_norm": 0.91015625,
"learning_rate": 0.00042747044843077304,
"loss": 6.0783,
"mean_token_accuracy": 0.13255516290664673,
"num_tokens": 3458880.0,
"step": 1790
},
{
"entropy": 6.415725946426392,
"epoch": 0.10430891710491907,
"grad_norm": 1.0234375,
"learning_rate": 0.00042660200394934047,
"loss": 6.1575,
"mean_token_accuracy": 0.1243210181593895,
"num_tokens": 3468132.0,
"step": 1795
},
{
"entropy": 6.199592542648316,
"epoch": 0.10459947119156231,
"grad_norm": 1.1015625,
"learning_rate": 0.00042572940322238844,
"loss": 6.0499,
"mean_token_accuracy": 0.1273614466190338,
"num_tokens": 3477429.0,
"step": 1800
},
{
"entropy": 6.205861282348633,
"epoch": 0.10489002527820554,
"grad_norm": 1.0859375,
"learning_rate": 0.00042485267017270664,
"loss": 6.0663,
"mean_token_accuracy": 0.12217539176344872,
"num_tokens": 3487526.0,
"step": 1805
},
{
"entropy": 6.246215867996216,
"epoch": 0.10518057936484877,
"grad_norm": 1.109375,
"learning_rate": 0.0004239718288363745,
"loss": 6.0049,
"mean_token_accuracy": 0.14162934869527816,
"num_tokens": 3496280.0,
"step": 1810
},
{
"entropy": 6.257954835891724,
"epoch": 0.10547113345149199,
"grad_norm": 1.109375,
"learning_rate": 0.0004230869033621023,
"loss": 6.0072,
"mean_token_accuracy": 0.13223105296492577,
"num_tokens": 3505871.0,
"step": 1815
},
{
"entropy": 6.254146718978882,
"epoch": 0.10576168753813522,
"grad_norm": 0.91015625,
"learning_rate": 0.0004221979180105688,
"loss": 5.9791,
"mean_token_accuracy": 0.13909292891621589,
"num_tokens": 3515846.0,
"step": 1820
},
{
"entropy": 6.229612159729004,
"epoch": 0.10605224162477846,
"grad_norm": 1.078125,
"learning_rate": 0.00042130489715375645,
"loss": 6.0716,
"mean_token_accuracy": 0.12691670581698417,
"num_tokens": 3525358.0,
"step": 1825
},
{
"entropy": 6.216009950637817,
"epoch": 0.10634279571142168,
"grad_norm": 1.046875,
"learning_rate": 0.00042040786527428335,
"loss": 5.985,
"mean_token_accuracy": 0.13652188181877137,
"num_tokens": 3534459.0,
"step": 1830
},
{
"entropy": 6.155793523788452,
"epoch": 0.10663334979806491,
"grad_norm": 1.0390625,
"learning_rate": 0.0004195068469647315,
"loss": 5.9546,
"mean_token_accuracy": 0.1332765720784664,
"num_tokens": 3545268.0,
"step": 1835
},
{
"entropy": 6.335662126541138,
"epoch": 0.10692390388470814,
"grad_norm": 1.078125,
"learning_rate": 0.00041860186692697297,
"loss": 6.0853,
"mean_token_accuracy": 0.13031049072742462,
"num_tokens": 3554281.0,
"step": 1840
},
{
"entropy": 6.235852289199829,
"epoch": 0.10721445797135137,
"grad_norm": 1.09375,
"learning_rate": 0.00041769294997149264,
"loss": 6.0309,
"mean_token_accuracy": 0.13206790015101433,
"num_tokens": 3563505.0,
"step": 1845
},
{
"entropy": 6.190238428115845,
"epoch": 0.10750501205799459,
"grad_norm": 1.0703125,
"learning_rate": 0.0004167801210167081,
"loss": 5.9761,
"mean_token_accuracy": 0.14046704694628714,
"num_tokens": 3573288.0,
"step": 1850
},
{
"entropy": 6.100773715972901,
"epoch": 0.10779556614463782,
"grad_norm": 1.0859375,
"learning_rate": 0.0004158634050882861,
"loss": 5.898,
"mean_token_accuracy": 0.14386857226490973,
"num_tokens": 3582156.0,
"step": 1855
},
{
"entropy": 6.168911600112915,
"epoch": 0.10808612023128106,
"grad_norm": 0.93359375,
"learning_rate": 0.0004149428273184569,
"loss": 6.0786,
"mean_token_accuracy": 0.13001196533441545,
"num_tokens": 3592708.0,
"step": 1860
},
{
"entropy": 6.220205640792846,
"epoch": 0.10837667431792428,
"grad_norm": 0.99609375,
"learning_rate": 0.0004140184129453253,
"loss": 5.9618,
"mean_token_accuracy": 0.13206626623868942,
"num_tokens": 3602983.0,
"step": 1865
},
{
"entropy": 6.231088352203369,
"epoch": 0.10866722840456751,
"grad_norm": 1.0234375,
"learning_rate": 0.000413090187312178,
"loss": 6.031,
"mean_token_accuracy": 0.13375057205557822,
"num_tokens": 3612779.0,
"step": 1870
},
{
"entropy": 6.18984317779541,
"epoch": 0.10895778249121074,
"grad_norm": 1.0,
"learning_rate": 0.0004121581758667898,
"loss": 6.0085,
"mean_token_accuracy": 0.1313602216541767,
"num_tokens": 3622850.0,
"step": 1875
},
{
"entropy": 6.1243922233581545,
"epoch": 0.10924833657785397,
"grad_norm": 0.9609375,
"learning_rate": 0.00041122240416072533,
"loss": 6.0192,
"mean_token_accuracy": 0.1339510276913643,
"num_tokens": 3632673.0,
"step": 1880
},
{
"entropy": 6.293065547943115,
"epoch": 0.1095388906644972,
"grad_norm": 1.1328125,
"learning_rate": 0.0004102828978486385,
"loss": 6.0195,
"mean_token_accuracy": 0.1283419005572796,
"num_tokens": 3642571.0,
"step": 1885
},
{
"entropy": 6.16258282661438,
"epoch": 0.10982944475114043,
"grad_norm": 1.03125,
"learning_rate": 0.0004093396826875695,
"loss": 6.0009,
"mean_token_accuracy": 0.13664330318570136,
"num_tokens": 3651864.0,
"step": 1890
},
{
"entropy": 6.2043415069580075,
"epoch": 0.11011999883778366,
"grad_norm": 0.99609375,
"learning_rate": 0.00040839278453623837,
"loss": 5.9716,
"mean_token_accuracy": 0.1314692884683609,
"num_tokens": 3662410.0,
"step": 1895
},
{
"entropy": 6.146759462356568,
"epoch": 0.11041055292442688,
"grad_norm": 1.0234375,
"learning_rate": 0.0004074422293543363,
"loss": 5.9287,
"mean_token_accuracy": 0.13767404705286027,
"num_tokens": 3672340.0,
"step": 1900
},
{
"entropy": 6.102005672454834,
"epoch": 0.11070110701107011,
"grad_norm": 0.92578125,
"learning_rate": 0.0004064880432018137,
"loss": 6.0753,
"mean_token_accuracy": 0.1314219541847706,
"num_tokens": 3682745.0,
"step": 1905
},
{
"entropy": 6.236651849746704,
"epoch": 0.11099166109771334,
"grad_norm": 1.0,
"learning_rate": 0.00040553025223816615,
"loss": 5.9814,
"mean_token_accuracy": 0.13747138530015945,
"num_tokens": 3692075.0,
"step": 1910
},
{
"entropy": 6.1932165145874025,
"epoch": 0.11128221518435656,
"grad_norm": 0.9921875,
"learning_rate": 0.00040456888272171653,
"loss": 5.9772,
"mean_token_accuracy": 0.13977260813117026,
"num_tokens": 3701639.0,
"step": 1915
},
{
"entropy": 6.29685697555542,
"epoch": 0.1115727692709998,
"grad_norm": 1.09375,
"learning_rate": 0.00040360396100889577,
"loss": 6.0266,
"mean_token_accuracy": 0.13467289954423906,
"num_tokens": 3711103.0,
"step": 1920
},
{
"entropy": 6.2153466701507565,
"epoch": 0.11186332335764303,
"grad_norm": 0.98046875,
"learning_rate": 0.0004026355135535202,
"loss": 6.0563,
"mean_token_accuracy": 0.1295161299407482,
"num_tokens": 3720229.0,
"step": 1925
},
{
"entropy": 6.287035751342773,
"epoch": 0.11215387744428626,
"grad_norm": 0.92578125,
"learning_rate": 0.000401663566906066,
"loss": 6.0289,
"mean_token_accuracy": 0.1336451180279255,
"num_tokens": 3730616.0,
"step": 1930
},
{
"entropy": 6.164524412155151,
"epoch": 0.11244443153092948,
"grad_norm": 1.0234375,
"learning_rate": 0.00040068814771294134,
"loss": 5.8945,
"mean_token_accuracy": 0.13720172494649888,
"num_tokens": 3739829.0,
"step": 1935
},
{
"entropy": 6.15112156867981,
"epoch": 0.11273498561757271,
"grad_norm": 1.046875,
"learning_rate": 0.0003997092827157562,
"loss": 6.0658,
"mean_token_accuracy": 0.12816951870918275,
"num_tokens": 3749830.0,
"step": 1940
},
{
"entropy": 6.2455854415893555,
"epoch": 0.11302553970421594,
"grad_norm": 1.0390625,
"learning_rate": 0.000398726998750589,
"loss": 5.9084,
"mean_token_accuracy": 0.14013779759407044,
"num_tokens": 3759583.0,
"step": 1945
},
{
"entropy": 6.15108060836792,
"epoch": 0.11331609379085916,
"grad_norm": 1.046875,
"learning_rate": 0.00039774132274725076,
"loss": 5.9655,
"mean_token_accuracy": 0.13013281747698785,
"num_tokens": 3769219.0,
"step": 1950
},
{
"entropy": 6.253466367721558,
"epoch": 0.1136066478775024,
"grad_norm": 0.94921875,
"learning_rate": 0.00039675228172854707,
"loss": 5.9664,
"mean_token_accuracy": 0.1329497739672661,
"num_tokens": 3778913.0,
"step": 1955
},
{
"entropy": 6.288890600204468,
"epoch": 0.11389720196414563,
"grad_norm": 1.0,
"learning_rate": 0.0003957599028095371,
"loss": 6.053,
"mean_token_accuracy": 0.133541439473629,
"num_tokens": 3788544.0,
"step": 1960
},
{
"entropy": 6.138053035736084,
"epoch": 0.11418775605078886,
"grad_norm": 1.109375,
"learning_rate": 0.00039476421319679017,
"loss": 5.8634,
"mean_token_accuracy": 0.1413568802177906,
"num_tokens": 3797921.0,
"step": 1965
},
{
"entropy": 6.105274248123169,
"epoch": 0.11447831013743208,
"grad_norm": 1.0078125,
"learning_rate": 0.00039376524018764,
"loss": 5.9334,
"mean_token_accuracy": 0.13669840842485428,
"num_tokens": 3807442.0,
"step": 1970
},
{
"entropy": 6.157075214385986,
"epoch": 0.11476886422407531,
"grad_norm": 1.0234375,
"learning_rate": 0.00039276301116943616,
"loss": 5.9183,
"mean_token_accuracy": 0.1388249270617962,
"num_tokens": 3817875.0,
"step": 1975
},
{
"entropy": 6.148885679244995,
"epoch": 0.11505941831071854,
"grad_norm": 1.0859375,
"learning_rate": 0.0003917575536187936,
"loss": 6.0358,
"mean_token_accuracy": 0.1268165521323681,
"num_tokens": 3826925.0,
"step": 1980
},
{
"entropy": 6.279354047775269,
"epoch": 0.11534997239736176,
"grad_norm": 1.0,
"learning_rate": 0.00039074889510083894,
"loss": 6.0141,
"mean_token_accuracy": 0.1367575228214264,
"num_tokens": 3836047.0,
"step": 1985
},
{
"entropy": 6.161008596420288,
"epoch": 0.115640526484005,
"grad_norm": 1.078125,
"learning_rate": 0.00038973706326845495,
"loss": 5.969,
"mean_token_accuracy": 0.1333487443625927,
"num_tokens": 3845874.0,
"step": 1990
},
{
"entropy": 6.21445050239563,
"epoch": 0.11593108057064823,
"grad_norm": 1.03125,
"learning_rate": 0.0003887220858615225,
"loss": 5.9627,
"mean_token_accuracy": 0.13459742665290833,
"num_tokens": 3855967.0,
"step": 1995
},
{
"entropy": 6.1095618724823,
"epoch": 0.11622163465729146,
"grad_norm": 0.98828125,
"learning_rate": 0.0003877039907061597,
"loss": 5.9908,
"mean_token_accuracy": 0.13831030651926995,
"num_tokens": 3866467.0,
"step": 2000
},
{
"entropy": 6.203917360305786,
"epoch": 0.11651218874393468,
"grad_norm": 1.09375,
"learning_rate": 0.0003866828057139598,
"loss": 5.9744,
"mean_token_accuracy": 0.13815115690231322,
"num_tokens": 3875916.0,
"step": 2005
},
{
"entropy": 6.18111400604248,
"epoch": 0.11680274283057791,
"grad_norm": 1.0546875,
"learning_rate": 0.00038565855888122503,
"loss": 5.9594,
"mean_token_accuracy": 0.1330759234726429,
"num_tokens": 3885987.0,
"step": 2010
},
{
"entropy": 6.2428779125213625,
"epoch": 0.11709329691722115,
"grad_norm": 1.109375,
"learning_rate": 0.00038463127828819975,
"loss": 6.0059,
"mean_token_accuracy": 0.13700252026319504,
"num_tokens": 3895809.0,
"step": 2015
},
{
"entropy": 6.195304489135742,
"epoch": 0.11738385100386436,
"grad_norm": 1.0078125,
"learning_rate": 0.00038360099209830043,
"loss": 6.0109,
"mean_token_accuracy": 0.132937653362751,
"num_tokens": 3905491.0,
"step": 2020
},
{
"entropy": 6.115702676773071,
"epoch": 0.1176744050905076,
"grad_norm": 0.921875,
"learning_rate": 0.0003825677285573433,
"loss": 5.8753,
"mean_token_accuracy": 0.14159394055604935,
"num_tokens": 3915073.0,
"step": 2025
},
{
"entropy": 6.103581476211548,
"epoch": 0.11796495917715083,
"grad_norm": 1.0625,
"learning_rate": 0.00038153151599277027,
"loss": 5.9516,
"mean_token_accuracy": 0.1373932972550392,
"num_tokens": 3924786.0,
"step": 2030
},
{
"entropy": 6.296129131317139,
"epoch": 0.11825551326379405,
"grad_norm": 0.97265625,
"learning_rate": 0.0003804923828128723,
"loss": 6.1096,
"mean_token_accuracy": 0.12966496869921684,
"num_tokens": 3934745.0,
"step": 2035
},
{
"entropy": 6.2405133724212645,
"epoch": 0.11854606735043728,
"grad_norm": 1.0234375,
"learning_rate": 0.0003794503575060104,
"loss": 5.928,
"mean_token_accuracy": 0.1365241065621376,
"num_tokens": 3945328.0,
"step": 2040
},
{
"entropy": 6.151012706756592,
"epoch": 0.11883662143708051,
"grad_norm": 0.93359375,
"learning_rate": 0.00037840546863983484,
"loss": 6.0549,
"mean_token_accuracy": 0.12878239378333092,
"num_tokens": 3955894.0,
"step": 2045
},
{
"entropy": 6.219704437255859,
"epoch": 0.11912717552372375,
"grad_norm": 1.078125,
"learning_rate": 0.0003773577448605015,
"loss": 5.9845,
"mean_token_accuracy": 0.13799333572387695,
"num_tokens": 3964895.0,
"step": 2050
},
{
"entropy": 6.154746055603027,
"epoch": 0.11941772961036697,
"grad_norm": 1.078125,
"learning_rate": 0.0003763072148918872,
"loss": 6.0396,
"mean_token_accuracy": 0.12946364358067514,
"num_tokens": 3974681.0,
"step": 2055
},
{
"entropy": 6.248775148391724,
"epoch": 0.1197082836970102,
"grad_norm": 1.046875,
"learning_rate": 0.0003752539075348017,
"loss": 6.0252,
"mean_token_accuracy": 0.13851658627390862,
"num_tokens": 3984402.0,
"step": 2060
},
{
"entropy": 6.127353715896606,
"epoch": 0.11999883778365343,
"grad_norm": 1.0703125,
"learning_rate": 0.00037419785166619817,
"loss": 6.0268,
"mean_token_accuracy": 0.1265586420893669,
"num_tokens": 3995279.0,
"step": 2065
},
{
"entropy": 6.17973141670227,
"epoch": 0.12028939187029665,
"grad_norm": 1.015625,
"learning_rate": 0.0003731390762383818,
"loss": 5.8617,
"mean_token_accuracy": 0.15186458677053452,
"num_tokens": 4003525.0,
"step": 2070
},
{
"entropy": 6.115002965927124,
"epoch": 0.12057994595693988,
"grad_norm": 1.0546875,
"learning_rate": 0.0003720776102782158,
"loss": 5.8387,
"mean_token_accuracy": 0.13773723766207696,
"num_tokens": 4012373.0,
"step": 2075
},
{
"entropy": 6.103297424316406,
"epoch": 0.12087050004358312,
"grad_norm": 0.9609375,
"learning_rate": 0.00037101348288632555,
"loss": 5.9031,
"mean_token_accuracy": 0.13564639389514924,
"num_tokens": 4021972.0,
"step": 2080
},
{
"entropy": 6.180972766876221,
"epoch": 0.12116105413022635,
"grad_norm": 0.921875,
"learning_rate": 0.0003699467232363012,
"loss": 5.9966,
"mean_token_accuracy": 0.134269118309021,
"num_tokens": 4032384.0,
"step": 2085
},
{
"entropy": 6.158924150466919,
"epoch": 0.12145160821686957,
"grad_norm": 1.0078125,
"learning_rate": 0.0003688773605738973,
"loss": 5.8791,
"mean_token_accuracy": 0.13695783466100692,
"num_tokens": 4041974.0,
"step": 2090
},
{
"entropy": 6.068432807922363,
"epoch": 0.1217421623035128,
"grad_norm": 1.0625,
"learning_rate": 0.00036780542421623134,
"loss": 5.9396,
"mean_token_accuracy": 0.13688302487134935,
"num_tokens": 4051694.0,
"step": 2095
},
{
"entropy": 6.212389612197876,
"epoch": 0.12203271639015603,
"grad_norm": 1.078125,
"learning_rate": 0.0003667309435509802,
"loss": 6.002,
"mean_token_accuracy": 0.13414775878190993,
"num_tokens": 4062828.0,
"step": 2100
},
{
"entropy": 6.241210889816284,
"epoch": 0.12232327047679925,
"grad_norm": 1.1015625,
"learning_rate": 0.0003656539480355741,
"loss": 5.9742,
"mean_token_accuracy": 0.13490709364414216,
"num_tokens": 4072012.0,
"step": 2105
},
{
"entropy": 6.0934614658355715,
"epoch": 0.12261382456344248,
"grad_norm": 0.99609375,
"learning_rate": 0.0003645744671963891,
"loss": 5.9166,
"mean_token_accuracy": 0.14179718866944313,
"num_tokens": 4081587.0,
"step": 2110
},
{
"entropy": 6.224443626403809,
"epoch": 0.12290437865008572,
"grad_norm": 1.1171875,
"learning_rate": 0.0003634925306279376,
"loss": 5.9655,
"mean_token_accuracy": 0.13849997371435166,
"num_tokens": 4091374.0,
"step": 2115
},
{
"entropy": 6.211995363235474,
"epoch": 0.12319493273672895,
"grad_norm": 1.15625,
"learning_rate": 0.0003624081679920574,
"loss": 5.9553,
"mean_token_accuracy": 0.1392621487379074,
"num_tokens": 4100532.0,
"step": 2120
},
{
"entropy": 6.090556001663208,
"epoch": 0.12348548682337217,
"grad_norm": 1.046875,
"learning_rate": 0.0003613214090170977,
"loss": 5.9123,
"mean_token_accuracy": 0.13530985191464423,
"num_tokens": 4110194.0,
"step": 2125
},
{
"entropy": 6.171910429000855,
"epoch": 0.1237760409100154,
"grad_norm": 0.98828125,
"learning_rate": 0.0003602322834971048,
"loss": 5.9322,
"mean_token_accuracy": 0.136457958817482,
"num_tokens": 4119816.0,
"step": 2130
},
{
"entropy": 6.220505046844482,
"epoch": 0.12406659499665863,
"grad_norm": 1.0078125,
"learning_rate": 0.0003591408212910051,
"loss": 6.0141,
"mean_token_accuracy": 0.13071410208940507,
"num_tokens": 4130072.0,
"step": 2135
},
{
"entropy": 6.171974468231201,
"epoch": 0.12435714908330185,
"grad_norm": 1.15625,
"learning_rate": 0.0003580470523217863,
"loss": 5.9574,
"mean_token_accuracy": 0.13431628346443175,
"num_tokens": 4139101.0,
"step": 2140
},
{
"entropy": 6.139108896255493,
"epoch": 0.12464770316994508,
"grad_norm": 0.921875,
"learning_rate": 0.0003569510065756771,
"loss": 5.8817,
"mean_token_accuracy": 0.13240241631865501,
"num_tokens": 4149500.0,
"step": 2145
},
{
"entropy": 6.066398859024048,
"epoch": 0.12493825725658832,
"grad_norm": 1.09375,
"learning_rate": 0.0003558527141013254,
"loss": 5.8244,
"mean_token_accuracy": 0.14197371304035186,
"num_tokens": 4158642.0,
"step": 2150
},
{
"entropy": 6.045716571807861,
"epoch": 0.12522881134323155,
"grad_norm": 1.0390625,
"learning_rate": 0.0003547522050089742,
"loss": 5.8963,
"mean_token_accuracy": 0.13848227709531785,
"num_tokens": 4167911.0,
"step": 2155
},
{
"entropy": 6.142486429214477,
"epoch": 0.12551936542987477,
"grad_norm": 1.0625,
"learning_rate": 0.00035364950946963606,
"loss": 5.8062,
"mean_token_accuracy": 0.14425584971904754,
"num_tokens": 4177589.0,
"step": 2160
},
{
"entropy": 6.1435057640075685,
"epoch": 0.125809919516518,
"grad_norm": 1.0,
"learning_rate": 0.0003525446577142663,
"loss": 5.9855,
"mean_token_accuracy": 0.13806044012308122,
"num_tokens": 4187332.0,
"step": 2165
},
{
"entropy": 6.165029859542846,
"epoch": 0.12610047360316123,
"grad_norm": 1.109375,
"learning_rate": 0.00035143768003293395,
"loss": 5.9359,
"mean_token_accuracy": 0.1438089445233345,
"num_tokens": 4196686.0,
"step": 2170
},
{
"entropy": 6.102525806427002,
"epoch": 0.12639102768980445,
"grad_norm": 1.046875,
"learning_rate": 0.0003503286067739913,
"loss": 5.8595,
"mean_token_accuracy": 0.14088326916098595,
"num_tokens": 4205908.0,
"step": 2175
},
{
"entropy": 6.056342458724975,
"epoch": 0.1266815817764477,
"grad_norm": 0.99609375,
"learning_rate": 0.00034921746834324193,
"loss": 5.8166,
"mean_token_accuracy": 0.14242705181241036,
"num_tokens": 4215992.0,
"step": 2180
},
{
"entropy": 6.085980653762817,
"epoch": 0.12697213586309092,
"grad_norm": 0.96875,
"learning_rate": 0.0003481042952031072,
"loss": 5.8801,
"mean_token_accuracy": 0.14103155285120011,
"num_tokens": 4227094.0,
"step": 2185
},
{
"entropy": 6.166931819915772,
"epoch": 0.12726268994973414,
"grad_norm": 0.9296875,
"learning_rate": 0.0003469891178717911,
"loss": 5.9574,
"mean_token_accuracy": 0.14575981721282005,
"num_tokens": 4236432.0,
"step": 2190
},
{
"entropy": 6.066125965118408,
"epoch": 0.12755324403637738,
"grad_norm": 1.1015625,
"learning_rate": 0.0003458719669224436,
"loss": 5.7683,
"mean_token_accuracy": 0.14632787331938743,
"num_tokens": 4245305.0,
"step": 2195
},
{
"entropy": 6.0759042263031,
"epoch": 0.1278437981230206,
"grad_norm": 1.0546875,
"learning_rate": 0.0003447528729823221,
"loss": 5.9274,
"mean_token_accuracy": 0.13991366624832152,
"num_tokens": 4255445.0,
"step": 2200
},
{
"entropy": 6.161522722244262,
"epoch": 0.12813435220966382,
"grad_norm": 1.0625,
"learning_rate": 0.0003436318667319525,
"loss": 5.9095,
"mean_token_accuracy": 0.14544984251260756,
"num_tokens": 4265114.0,
"step": 2205
},
{
"entropy": 6.1537879467010494,
"epoch": 0.12842490629630707,
"grad_norm": 1.0078125,
"learning_rate": 0.00034250897890428716,
"loss": 5.9025,
"mean_token_accuracy": 0.13503851667046546,
"num_tokens": 4274787.0,
"step": 2210
},
{
"entropy": 6.1523271083831785,
"epoch": 0.1287154603829503,
"grad_norm": 1.0703125,
"learning_rate": 0.0003413842402838633,
"loss": 5.9789,
"mean_token_accuracy": 0.13642423674464227,
"num_tokens": 4284509.0,
"step": 2215
},
{
"entropy": 6.217884969711304,
"epoch": 0.1290060144695935,
"grad_norm": 1.03125,
"learning_rate": 0.00034025768170595834,
"loss": 5.9438,
"mean_token_accuracy": 0.1450071580708027,
"num_tokens": 4294128.0,
"step": 2220
},
{
"entropy": 6.103723478317261,
"epoch": 0.12929656855623675,
"grad_norm": 1.0,
"learning_rate": 0.0003391293340557446,
"loss": 5.9493,
"mean_token_accuracy": 0.13352228179574013,
"num_tokens": 4303779.0,
"step": 2225
},
{
"entropy": 6.194416570663452,
"epoch": 0.12958712264287997,
"grad_norm": 0.921875,
"learning_rate": 0.0003379992282674431,
"loss": 5.9859,
"mean_token_accuracy": 0.14135119765996934,
"num_tokens": 4314520.0,
"step": 2230
},
{
"entropy": 6.184441709518433,
"epoch": 0.1298776767295232,
"grad_norm": 0.99609375,
"learning_rate": 0.0003368673953234749,
"loss": 5.9214,
"mean_token_accuracy": 0.13849867284297943,
"num_tokens": 4324477.0,
"step": 2235
},
{
"entropy": 6.059356164932251,
"epoch": 0.13016823081616644,
"grad_norm": 1.0390625,
"learning_rate": 0.00033573386625361176,
"loss": 5.8373,
"mean_token_accuracy": 0.14615851789712905,
"num_tokens": 4334009.0,
"step": 2240
},
{
"entropy": 6.182690954208374,
"epoch": 0.13045878490280965,
"grad_norm": 0.94140625,
"learning_rate": 0.00033459867213412567,
"loss": 5.9892,
"mean_token_accuracy": 0.13937190547585487,
"num_tokens": 4343748.0,
"step": 2245
},
{
"entropy": 6.105481243133545,
"epoch": 0.13074933898945287,
"grad_norm": 1.0234375,
"learning_rate": 0.000333461844086937,
"loss": 5.9097,
"mean_token_accuracy": 0.13728254288434982,
"num_tokens": 4352957.0,
"step": 2250
},
{
"entropy": 6.125918388366699,
"epoch": 0.13103989307609612,
"grad_norm": 1.0703125,
"learning_rate": 0.00033232341327876097,
"loss": 5.9005,
"mean_token_accuracy": 0.1423856124281883,
"num_tokens": 4362505.0,
"step": 2255
},
{
"entropy": 6.138764905929565,
"epoch": 0.13133044716273934,
"grad_norm": 1.015625,
"learning_rate": 0.0003311834109202531,
"loss": 5.9093,
"mean_token_accuracy": 0.14300098568201064,
"num_tokens": 4371664.0,
"step": 2260
},
{
"entropy": 6.16502833366394,
"epoch": 0.13162100124938259,
"grad_norm": 0.953125,
"learning_rate": 0.00033004186826515416,
"loss": 6.0271,
"mean_token_accuracy": 0.13194756507873534,
"num_tokens": 4382600.0,
"step": 2265
},
{
"entropy": 6.100395345687867,
"epoch": 0.1319115553360258,
"grad_norm": 0.9453125,
"learning_rate": 0.0003288988166094324,
"loss": 5.927,
"mean_token_accuracy": 0.13698131814599038,
"num_tokens": 4393672.0,
"step": 2270
},
{
"entropy": 6.156081247329712,
"epoch": 0.13220210942266902,
"grad_norm": 0.98828125,
"learning_rate": 0.00032775428729042656,
"loss": 5.8873,
"mean_token_accuracy": 0.13874078989028932,
"num_tokens": 4403156.0,
"step": 2275
},
{
"entropy": 6.1929707527160645,
"epoch": 0.13249266350931227,
"grad_norm": 1.1640625,
"learning_rate": 0.000326608311685986,
"loss": 5.9783,
"mean_token_accuracy": 0.13139391839504241,
"num_tokens": 4412742.0,
"step": 2280
},
{
"entropy": 6.086524152755738,
"epoch": 0.1327832175959555,
"grad_norm": 0.99609375,
"learning_rate": 0.0003254609212136108,
"loss": 5.8274,
"mean_token_accuracy": 0.15008396059274673,
"num_tokens": 4422232.0,
"step": 2285
},
{
"entropy": 6.099150848388672,
"epoch": 0.1330737716825987,
"grad_norm": 1.015625,
"learning_rate": 0.00032431214732959036,
"loss": 5.8815,
"mean_token_accuracy": 0.13752613663673402,
"num_tokens": 4432405.0,
"step": 2290
},
{
"entropy": 6.194040107727051,
"epoch": 0.13336432576924195,
"grad_norm": 1.1328125,
"learning_rate": 0.000323162021528141,
"loss": 5.9082,
"mean_token_accuracy": 0.1354072481393814,
"num_tokens": 4441504.0,
"step": 2295
},
{
"entropy": 6.136425590515136,
"epoch": 0.13365487985588517,
"grad_norm": 0.9375,
"learning_rate": 0.00032201057534054264,
"loss": 5.9503,
"mean_token_accuracy": 0.1408660188317299,
"num_tokens": 4452478.0,
"step": 2300
},
{
"entropy": 6.132717275619507,
"epoch": 0.1339454339425284,
"grad_norm": 0.96875,
"learning_rate": 0.00032085784033427414,
"loss": 5.8967,
"mean_token_accuracy": 0.13943730369210244,
"num_tokens": 4462267.0,
"step": 2305
},
{
"entropy": 6.183125257492065,
"epoch": 0.13423598802917164,
"grad_norm": 1.0390625,
"learning_rate": 0.0003197038481121478,
"loss": 5.9465,
"mean_token_accuracy": 0.14744184017181397,
"num_tokens": 4472580.0,
"step": 2310
},
{
"entropy": 6.118056774139404,
"epoch": 0.13452654211581486,
"grad_norm": 1.0625,
"learning_rate": 0.0003185486303114436,
"loss": 5.9433,
"mean_token_accuracy": 0.13671476170420646,
"num_tokens": 4481768.0,
"step": 2315
},
{
"entropy": 6.0453300952911375,
"epoch": 0.13481709620245808,
"grad_norm": 0.96484375,
"learning_rate": 0.0003173922186030409,
"loss": 5.8219,
"mean_token_accuracy": 0.14138804078102113,
"num_tokens": 4491269.0,
"step": 2320
},
{
"entropy": 6.065739154815674,
"epoch": 0.13510765028910132,
"grad_norm": 1.015625,
"learning_rate": 0.000316234644690551,
"loss": 5.7896,
"mean_token_accuracy": 0.14193628579378129,
"num_tokens": 4501357.0,
"step": 2325
},
{
"entropy": 6.165984678268432,
"epoch": 0.13539820437574454,
"grad_norm": 0.921875,
"learning_rate": 0.0003150759403094473,
"loss": 5.829,
"mean_token_accuracy": 0.14373186007142066,
"num_tokens": 4510972.0,
"step": 2330
},
{
"entropy": 6.087504100799561,
"epoch": 0.13568875846238776,
"grad_norm": 1.015625,
"learning_rate": 0.00031391613722619587,
"loss": 5.8799,
"mean_token_accuracy": 0.1405518189072609,
"num_tokens": 4520887.0,
"step": 2335
},
{
"entropy": 6.0864016056060795,
"epoch": 0.135979312549031,
"grad_norm": 1.1015625,
"learning_rate": 0.000312755267237384,
"loss": 5.8226,
"mean_token_accuracy": 0.14198774620890617,
"num_tokens": 4529971.0,
"step": 2340
},
{
"entropy": 6.121823167800903,
"epoch": 0.13626986663567422,
"grad_norm": 0.9765625,
"learning_rate": 0.0003115933621688488,
"loss": 5.9209,
"mean_token_accuracy": 0.1370498724281788,
"num_tokens": 4540375.0,
"step": 2345
},
{
"entropy": 6.068475914001465,
"epoch": 0.13656042072231747,
"grad_norm": 1.109375,
"learning_rate": 0.00031043045387480487,
"loss": 5.8503,
"mean_token_accuracy": 0.13427165821194648,
"num_tokens": 4549554.0,
"step": 2350
},
{
"entropy": 6.043267726898193,
"epoch": 0.1368509748089607,
"grad_norm": 0.953125,
"learning_rate": 0.0003092665742369703,
"loss": 5.7866,
"mean_token_accuracy": 0.14580736979842185,
"num_tokens": 4558697.0,
"step": 2355
},
{
"entropy": 6.053312063217163,
"epoch": 0.1371415288956039,
"grad_norm": 1.0703125,
"learning_rate": 0.00030810175516369343,
"loss": 5.8247,
"mean_token_accuracy": 0.14898887798190116,
"num_tokens": 4567592.0,
"step": 2360
},
{
"entropy": 6.1225522518157955,
"epoch": 0.13743208298224716,
"grad_norm": 1.03125,
"learning_rate": 0.0003069360285890775,
"loss": 5.8661,
"mean_token_accuracy": 0.14503989070653917,
"num_tokens": 4576594.0,
"step": 2365
},
{
"entropy": 6.086222171783447,
"epoch": 0.13772263706889037,
"grad_norm": 1.078125,
"learning_rate": 0.00030576942647210547,
"loss": 5.8143,
"mean_token_accuracy": 0.14734100848436354,
"num_tokens": 4585317.0,
"step": 2370
},
{
"entropy": 6.101655912399292,
"epoch": 0.1380131911555336,
"grad_norm": 1.15625,
"learning_rate": 0.00030460198079576355,
"loss": 5.8265,
"mean_token_accuracy": 0.15099092870950698,
"num_tokens": 4593621.0,
"step": 2375
},
{
"entropy": 6.059905099868774,
"epoch": 0.13830374524217684,
"grad_norm": 1.2890625,
"learning_rate": 0.0003034337235661648,
"loss": 5.8365,
"mean_token_accuracy": 0.13829366117715836,
"num_tokens": 4603537.0,
"step": 2380
},
{
"entropy": 6.074821090698242,
"epoch": 0.13859429932882006,
"grad_norm": 1.15625,
"learning_rate": 0.0003022646868116714,
"loss": 5.8688,
"mean_token_accuracy": 0.14191555976867676,
"num_tokens": 4613085.0,
"step": 2385
},
{
"entropy": 6.087924957275391,
"epoch": 0.13888485341546328,
"grad_norm": 1.015625,
"learning_rate": 0.0003010949025820163,
"loss": 5.8978,
"mean_token_accuracy": 0.13870447725057602,
"num_tokens": 4622750.0,
"step": 2390
},
{
"entropy": 6.221824693679809,
"epoch": 0.13917540750210652,
"grad_norm": 1.0625,
"learning_rate": 0.0002999244029474252,
"loss": 5.9973,
"mean_token_accuracy": 0.13631478250026702,
"num_tokens": 4632787.0,
"step": 2395
},
{
"entropy": 6.084641647338867,
"epoch": 0.13946596158874974,
"grad_norm": 1.046875,
"learning_rate": 0.00029875321999773684,
"loss": 5.8022,
"mean_token_accuracy": 0.14561834558844566,
"num_tokens": 4642277.0,
"step": 2400
},
{
"entropy": 6.126396894454956,
"epoch": 0.13975651567539296,
"grad_norm": 1.03125,
"learning_rate": 0.00029758138584152333,
"loss": 5.8342,
"mean_token_accuracy": 0.14577654898166656,
"num_tokens": 4651764.0,
"step": 2405
},
{
"entropy": 6.0264387130737305,
"epoch": 0.1400470697620362,
"grad_norm": 1.0625,
"learning_rate": 0.0002964089326052102,
"loss": 5.8166,
"mean_token_accuracy": 0.14705195873975754,
"num_tokens": 4661938.0,
"step": 2410
},
{
"entropy": 6.151280355453491,
"epoch": 0.14033762384867943,
"grad_norm": 1.140625,
"learning_rate": 0.0002952358924321949,
"loss": 5.8146,
"mean_token_accuracy": 0.14316702708601953,
"num_tokens": 4670960.0,
"step": 2415
},
{
"entropy": 6.228445291519165,
"epoch": 0.14062817793532267,
"grad_norm": 1.0703125,
"learning_rate": 0.00029406229748196657,
"loss": 5.9368,
"mean_token_accuracy": 0.1354992315173149,
"num_tokens": 4680777.0,
"step": 2420
},
{
"entropy": 6.084039545059204,
"epoch": 0.1409187320219659,
"grad_norm": 1.1015625,
"learning_rate": 0.0002928881799292235,
"loss": 5.7482,
"mean_token_accuracy": 0.15117157846689225,
"num_tokens": 4690390.0,
"step": 2425
},
{
"entropy": 6.010620021820069,
"epoch": 0.1412092861086091,
"grad_norm": 1.015625,
"learning_rate": 0.00029171357196299154,
"loss": 5.9686,
"mean_token_accuracy": 0.14216312393546104,
"num_tokens": 4701133.0,
"step": 2430
},
{
"entropy": 6.182425451278687,
"epoch": 0.14149984019525236,
"grad_norm": 0.9609375,
"learning_rate": 0.0002905385057857414,
"loss": 5.9243,
"mean_token_accuracy": 0.14200448989868164,
"num_tokens": 4711962.0,
"step": 2435
},
{
"entropy": 6.141026830673217,
"epoch": 0.14179039428189558,
"grad_norm": 1.1171875,
"learning_rate": 0.0002893630136125058,
"loss": 5.8835,
"mean_token_accuracy": 0.14305603951215745,
"num_tokens": 4721748.0,
"step": 2440
},
{
"entropy": 6.0170793533325195,
"epoch": 0.1420809483685388,
"grad_norm": 0.98828125,
"learning_rate": 0.0002881871276699967,
"loss": 5.766,
"mean_token_accuracy": 0.14974772036075593,
"num_tokens": 4731178.0,
"step": 2445
},
{
"entropy": 5.9737021923065186,
"epoch": 0.14237150245518204,
"grad_norm": 1.046875,
"learning_rate": 0.00028701088019572114,
"loss": 5.7396,
"mean_token_accuracy": 0.148803973197937,
"num_tokens": 4739590.0,
"step": 2450
},
{
"entropy": 6.12122106552124,
"epoch": 0.14266205654182526,
"grad_norm": 1.03125,
"learning_rate": 0.0002858343034370977,
"loss": 5.9511,
"mean_token_accuracy": 0.14803745746612548,
"num_tokens": 4749840.0,
"step": 2455
},
{
"entropy": 6.207825994491577,
"epoch": 0.14295261062846848,
"grad_norm": 1.109375,
"learning_rate": 0.00028465742965057267,
"loss": 5.9567,
"mean_token_accuracy": 0.13596878871321677,
"num_tokens": 4759347.0,
"step": 2460
},
{
"entropy": 6.198162078857422,
"epoch": 0.14324316471511173,
"grad_norm": 1.0546875,
"learning_rate": 0.00028348029110073533,
"loss": 5.8925,
"mean_token_accuracy": 0.14506246596574784,
"num_tokens": 4769911.0,
"step": 2465
},
{
"entropy": 6.0526893615722654,
"epoch": 0.14353371880175494,
"grad_norm": 1.0234375,
"learning_rate": 0.00028230292005943365,
"loss": 5.8162,
"mean_token_accuracy": 0.1422274589538574,
"num_tokens": 4780775.0,
"step": 2470
},
{
"entropy": 6.0685014724731445,
"epoch": 0.14382427288839816,
"grad_norm": 1.0625,
"learning_rate": 0.00028112534880488945,
"loss": 5.8628,
"mean_token_accuracy": 0.1423807591199875,
"num_tokens": 4790845.0,
"step": 2475
},
{
"entropy": 6.090683555603027,
"epoch": 0.1441148269750414,
"grad_norm": 0.95703125,
"learning_rate": 0.0002799476096208137,
"loss": 5.8106,
"mean_token_accuracy": 0.1501375898718834,
"num_tokens": 4801214.0,
"step": 2480
},
{
"entropy": 6.07487530708313,
"epoch": 0.14440538106168463,
"grad_norm": 0.9765625,
"learning_rate": 0.00027876973479552087,
"loss": 5.7633,
"mean_token_accuracy": 0.1493755668401718,
"num_tokens": 4810720.0,
"step": 2485
},
{
"entropy": 6.07555193901062,
"epoch": 0.14469593514832785,
"grad_norm": 1.1171875,
"learning_rate": 0.00027759175662104424,
"loss": 5.7415,
"mean_token_accuracy": 0.14737216681241988,
"num_tokens": 4820078.0,
"step": 2490
},
{
"entropy": 6.0236917495727536,
"epoch": 0.1449864892349711,
"grad_norm": 1.015625,
"learning_rate": 0.0002764137073922508,
"loss": 5.8242,
"mean_token_accuracy": 0.14522194787859916,
"num_tokens": 4830561.0,
"step": 2495
},
{
"entropy": 6.035146951675415,
"epoch": 0.1452770433216143,
"grad_norm": 1.078125,
"learning_rate": 0.00027523561940595505,
"loss": 5.8653,
"mean_token_accuracy": 0.1423221454024315,
"num_tokens": 4839849.0,
"step": 2500
},
{
"entropy": 6.047043657302856,
"epoch": 0.14556759740825756,
"grad_norm": 0.984375,
"learning_rate": 0.0002740575249600342,
"loss": 5.809,
"mean_token_accuracy": 0.14349082857370377,
"num_tokens": 4850113.0,
"step": 2505
},
{
"entropy": 6.110679817199707,
"epoch": 0.14585815149490078,
"grad_norm": 1.171875,
"learning_rate": 0.00027287945635254263,
"loss": 5.7927,
"mean_token_accuracy": 0.13830389603972434,
"num_tokens": 4859572.0,
"step": 2510
},
{
"entropy": 6.028803539276123,
"epoch": 0.146148705581544,
"grad_norm": 1.03125,
"learning_rate": 0.00027170144588082635,
"loss": 5.8538,
"mean_token_accuracy": 0.14364985525608062,
"num_tokens": 4870238.0,
"step": 2515
},
{
"entropy": 6.05920820236206,
"epoch": 0.14643925966818724,
"grad_norm": 0.96875,
"learning_rate": 0.00027052352584063763,
"loss": 5.7673,
"mean_token_accuracy": 0.14797088503837585,
"num_tokens": 4879743.0,
"step": 2520
},
{
"entropy": 6.063130807876587,
"epoch": 0.14672981375483046,
"grad_norm": 1.03125,
"learning_rate": 0.00026934572852524907,
"loss": 5.8141,
"mean_token_accuracy": 0.1519768014550209,
"num_tokens": 4888874.0,
"step": 2525
},
{
"entropy": 6.081371688842774,
"epoch": 0.14702036784147368,
"grad_norm": 1.0390625,
"learning_rate": 0.00026816808622456937,
"loss": 5.8422,
"mean_token_accuracy": 0.14229626208543777,
"num_tokens": 4898244.0,
"step": 2530
},
{
"entropy": 6.169855117797852,
"epoch": 0.14731092192811693,
"grad_norm": 1.015625,
"learning_rate": 0.0002669906312242569,
"loss": 5.8431,
"mean_token_accuracy": 0.1464947611093521,
"num_tokens": 4907978.0,
"step": 2535
},
{
"entropy": 6.089100074768067,
"epoch": 0.14760147601476015,
"grad_norm": 0.953125,
"learning_rate": 0.00026581339580483525,
"loss": 5.8544,
"mean_token_accuracy": 0.14092473834753036,
"num_tokens": 4917647.0,
"step": 2540
},
{
"entropy": 6.051334381103516,
"epoch": 0.14789203010140337,
"grad_norm": 1.09375,
"learning_rate": 0.0002646364122408082,
"loss": 5.8866,
"mean_token_accuracy": 0.1406030498445034,
"num_tokens": 4927322.0,
"step": 2545
},
{
"entropy": 6.089036989212036,
"epoch": 0.1481825841880466,
"grad_norm": 0.90234375,
"learning_rate": 0.0002634597127997749,
"loss": 5.8912,
"mean_token_accuracy": 0.14071550071239472,
"num_tokens": 4938095.0,
"step": 2550
},
{
"entropy": 6.179394674301148,
"epoch": 0.14847313827468983,
"grad_norm": 0.98828125,
"learning_rate": 0.0002622833297415445,
"loss": 5.8789,
"mean_token_accuracy": 0.14479369372129441,
"num_tokens": 4947768.0,
"step": 2555
},
{
"entropy": 6.177506828308106,
"epoch": 0.14876369236133305,
"grad_norm": 1.0,
"learning_rate": 0.0002611072953172531,
"loss": 5.8719,
"mean_token_accuracy": 0.14385495483875274,
"num_tokens": 4957776.0,
"step": 2560
},
{
"entropy": 6.029763984680176,
"epoch": 0.1490542464479763,
"grad_norm": 1.09375,
"learning_rate": 0.00025993164176847845,
"loss": 5.8281,
"mean_token_accuracy": 0.15120696425437927,
"num_tokens": 4966992.0,
"step": 2565
},
{
"entropy": 6.071042394638061,
"epoch": 0.14934480053461952,
"grad_norm": 1.03125,
"learning_rate": 0.0002587564013263564,
"loss": 5.8395,
"mean_token_accuracy": 0.14219800606369973,
"num_tokens": 4976216.0,
"step": 2570
},
{
"entropy": 6.099312973022461,
"epoch": 0.14963535462126276,
"grad_norm": 1.109375,
"learning_rate": 0.0002575816062106974,
"loss": 5.7508,
"mean_token_accuracy": 0.14777441769838334,
"num_tokens": 4985143.0,
"step": 2575
},
{
"entropy": 6.13524899482727,
"epoch": 0.14992590870790598,
"grad_norm": 1.1015625,
"learning_rate": 0.00025640728862910293,
"loss": 5.9437,
"mean_token_accuracy": 0.13997391015291213,
"num_tokens": 4995058.0,
"step": 2580
},
{
"entropy": 6.081535530090332,
"epoch": 0.1502164627945492,
"grad_norm": 1.0625,
"learning_rate": 0.00025523348077608285,
"loss": 5.7767,
"mean_token_accuracy": 0.14847566336393356,
"num_tokens": 5003930.0,
"step": 2585
},
{
"entropy": 6.133546257019043,
"epoch": 0.15050701688119245,
"grad_norm": 0.98828125,
"learning_rate": 0.00025406021483217225,
"loss": 5.8917,
"mean_token_accuracy": 0.14307338669896125,
"num_tokens": 5013907.0,
"step": 2590
},
{
"entropy": 6.05073037147522,
"epoch": 0.15079757096783566,
"grad_norm": 1.03125,
"learning_rate": 0.00025288752296304963,
"loss": 5.7465,
"mean_token_accuracy": 0.1435894712805748,
"num_tokens": 5024028.0,
"step": 2595
},
{
"entropy": 6.054743099212646,
"epoch": 0.15108812505447888,
"grad_norm": 1.0546875,
"learning_rate": 0.000251715437318655,
"loss": 5.8135,
"mean_token_accuracy": 0.14531584978103637,
"num_tokens": 5033555.0,
"step": 2600
},
{
"entropy": 6.053986740112305,
"epoch": 0.15137867914112213,
"grad_norm": 1.0078125,
"learning_rate": 0.0002505439900323084,
"loss": 5.862,
"mean_token_accuracy": 0.14667272865772246,
"num_tokens": 5043180.0,
"step": 2605
},
{
"entropy": 6.088306331634522,
"epoch": 0.15166923322776535,
"grad_norm": 0.9921875,
"learning_rate": 0.00024937321321982894,
"loss": 5.7691,
"mean_token_accuracy": 0.14470289498567582,
"num_tokens": 5052220.0,
"step": 2610
},
{
"entropy": 6.0878173351287845,
"epoch": 0.15195978731440857,
"grad_norm": 1.03125,
"learning_rate": 0.00024820313897865433,
"loss": 5.7726,
"mean_token_accuracy": 0.150884909927845,
"num_tokens": 5061544.0,
"step": 2615
},
{
"entropy": 6.131782722473145,
"epoch": 0.15225034140105181,
"grad_norm": 1.09375,
"learning_rate": 0.00024703379938696105,
"loss": 5.9184,
"mean_token_accuracy": 0.1400494635105133,
"num_tokens": 5070611.0,
"step": 2620
},
{
"entropy": 6.153847122192383,
"epoch": 0.15254089548769503,
"grad_norm": 1.0703125,
"learning_rate": 0.00024586522650278447,
"loss": 5.874,
"mean_token_accuracy": 0.1386608324944973,
"num_tokens": 5080750.0,
"step": 2625
},
{
"entropy": 6.137750577926636,
"epoch": 0.15283144957433825,
"grad_norm": 0.99609375,
"learning_rate": 0.00024469745236314064,
"loss": 5.8592,
"mean_token_accuracy": 0.13961437940597535,
"num_tokens": 5090067.0,
"step": 2630
},
{
"entropy": 6.123914098739624,
"epoch": 0.1531220036609815,
"grad_norm": 1.0546875,
"learning_rate": 0.00024353050898314767,
"loss": 5.8592,
"mean_token_accuracy": 0.14175378978252412,
"num_tokens": 5100053.0,
"step": 2635
},
{
"entropy": 6.1782163143157955,
"epoch": 0.15341255774762472,
"grad_norm": 1.0,
"learning_rate": 0.00024236442835514743,
"loss": 5.8117,
"mean_token_accuracy": 0.1458034932613373,
"num_tokens": 5109296.0,
"step": 2640
},
{
"entropy": 6.071597719192505,
"epoch": 0.15370311183426794,
"grad_norm": 1.1328125,
"learning_rate": 0.00024119924244782965,
"loss": 5.8673,
"mean_token_accuracy": 0.14649384766817092,
"num_tokens": 5118744.0,
"step": 2645
},
{
"entropy": 6.130202531814575,
"epoch": 0.15399366592091118,
"grad_norm": 1.046875,
"learning_rate": 0.00024003498320535462,
"loss": 5.8775,
"mean_token_accuracy": 0.1437153235077858,
"num_tokens": 5127763.0,
"step": 2650
},
{
"entropy": 6.148328161239624,
"epoch": 0.1542842200075544,
"grad_norm": 1.171875,
"learning_rate": 0.00023887168254647727,
"loss": 5.9019,
"mean_token_accuracy": 0.14456916153430938,
"num_tokens": 5138067.0,
"step": 2655
},
{
"entropy": 6.150043106079101,
"epoch": 0.15457477409419765,
"grad_norm": 0.98046875,
"learning_rate": 0.00023770937236367308,
"loss": 5.8983,
"mean_token_accuracy": 0.1399595282971859,
"num_tokens": 5148280.0,
"step": 2660
},
{
"entropy": 6.082090711593628,
"epoch": 0.15486532818084087,
"grad_norm": 1.046875,
"learning_rate": 0.00023654808452226278,
"loss": 5.7799,
"mean_token_accuracy": 0.15123223662376403,
"num_tokens": 5158182.0,
"step": 2665
},
{
"entropy": 6.0570969581604,
"epoch": 0.15515588226748409,
"grad_norm": 1.0078125,
"learning_rate": 0.00023538785085953912,
"loss": 5.7383,
"mean_token_accuracy": 0.14949096888303756,
"num_tokens": 5167524.0,
"step": 2670
},
{
"entropy": 6.062791872024536,
"epoch": 0.15544643635412733,
"grad_norm": 0.98046875,
"learning_rate": 0.00023422870318389404,
"loss": 5.7904,
"mean_token_accuracy": 0.13950854763388634,
"num_tokens": 5177581.0,
"step": 2675
},
{
"entropy": 6.112806892395019,
"epoch": 0.15573699044077055,
"grad_norm": 0.97265625,
"learning_rate": 0.0002330706732739468,
"loss": 5.783,
"mean_token_accuracy": 0.14393220096826553,
"num_tokens": 5187156.0,
"step": 2680
},
{
"entropy": 6.065037822723388,
"epoch": 0.15602754452741377,
"grad_norm": 0.9765625,
"learning_rate": 0.00023191379287767211,
"loss": 5.8843,
"mean_token_accuracy": 0.14143779054284095,
"num_tokens": 5198015.0,
"step": 2685
},
{
"entropy": 6.149574279785156,
"epoch": 0.15631809861405702,
"grad_norm": 0.98828125,
"learning_rate": 0.0002307580937115305,
"loss": 5.8311,
"mean_token_accuracy": 0.14658329337835313,
"num_tokens": 5207961.0,
"step": 2690
},
{
"entropy": 6.041101455688477,
"epoch": 0.15660865270070023,
"grad_norm": 1.0,
"learning_rate": 0.00022960360745959846,
"loss": 5.8328,
"mean_token_accuracy": 0.14369555339217185,
"num_tokens": 5217318.0,
"step": 2695
},
{
"entropy": 6.05631422996521,
"epoch": 0.15689920678734345,
"grad_norm": 1.0234375,
"learning_rate": 0.00022845036577269972,
"loss": 5.6925,
"mean_token_accuracy": 0.1581657573580742,
"num_tokens": 5226393.0,
"step": 2700
},
{
"entropy": 5.980499124526977,
"epoch": 0.1571897608739867,
"grad_norm": 1.03125,
"learning_rate": 0.00022729840026753777,
"loss": 5.6844,
"mean_token_accuracy": 0.14944447427988053,
"num_tokens": 5236003.0,
"step": 2705
},
{
"entropy": 6.103996896743775,
"epoch": 0.15748031496062992,
"grad_norm": 1.0703125,
"learning_rate": 0.0002261477425258287,
"loss": 5.853,
"mean_token_accuracy": 0.1508561223745346,
"num_tokens": 5246472.0,
"step": 2710
},
{
"entropy": 6.170705938339234,
"epoch": 0.15777086904727314,
"grad_norm": 1.046875,
"learning_rate": 0.0002249984240934358,
"loss": 5.931,
"mean_token_accuracy": 0.1417808599770069,
"num_tokens": 5256921.0,
"step": 2715
},
{
"entropy": 5.98744764328003,
"epoch": 0.15806142313391638,
"grad_norm": 1.0234375,
"learning_rate": 0.00022385047647950464,
"loss": 5.7333,
"mean_token_accuracy": 0.15196397304534912,
"num_tokens": 5266832.0,
"step": 2720
},
{
"entropy": 5.991374492645264,
"epoch": 0.1583519772205596,
"grad_norm": 1.09375,
"learning_rate": 0.0002227039311555986,
"loss": 5.7585,
"mean_token_accuracy": 0.1483888141810894,
"num_tokens": 5276386.0,
"step": 2725
},
{
"entropy": 6.100140237808228,
"epoch": 0.15864253130720282,
"grad_norm": 1.015625,
"learning_rate": 0.0002215588195548372,
"loss": 5.7618,
"mean_token_accuracy": 0.14937272816896438,
"num_tokens": 5285959.0,
"step": 2730
},
{
"entropy": 6.154450845718384,
"epoch": 0.15893308539384607,
"grad_norm": 1.0078125,
"learning_rate": 0.00022041517307103337,
"loss": 5.7947,
"mean_token_accuracy": 0.14946697056293487,
"num_tokens": 5295457.0,
"step": 2735
},
{
"entropy": 6.096340894699097,
"epoch": 0.1592236394804893,
"grad_norm": 0.8984375,
"learning_rate": 0.0002192730230578331,
"loss": 5.8109,
"mean_token_accuracy": 0.14488886743783952,
"num_tokens": 5306092.0,
"step": 2740
},
{
"entropy": 6.052392101287841,
"epoch": 0.15951419356713253,
"grad_norm": 1.140625,
"learning_rate": 0.0002181324008278559,
"loss": 5.8358,
"mean_token_accuracy": 0.14705842584371567,
"num_tokens": 5314960.0,
"step": 2745
},
{
"entropy": 5.998585510253906,
"epoch": 0.15980474765377575,
"grad_norm": 1.0625,
"learning_rate": 0.00021699333765183655,
"loss": 5.7745,
"mean_token_accuracy": 0.15136635154485703,
"num_tokens": 5324390.0,
"step": 2750
},
{
"entropy": 6.0869420051574705,
"epoch": 0.16009530174041897,
"grad_norm": 0.98046875,
"learning_rate": 0.0002158558647577673,
"loss": 5.812,
"mean_token_accuracy": 0.14605457559227944,
"num_tokens": 5334650.0,
"step": 2755
},
{
"entropy": 6.197345113754272,
"epoch": 0.16038585582706222,
"grad_norm": 0.93359375,
"learning_rate": 0.00021472001333004215,
"loss": 5.8713,
"mean_token_accuracy": 0.1437445230782032,
"num_tokens": 5343958.0,
"step": 2760
},
{
"entropy": 6.1040606021881105,
"epoch": 0.16067640991370544,
"grad_norm": 1.0078125,
"learning_rate": 0.00021358581450860186,
"loss": 5.8254,
"mean_token_accuracy": 0.14942396581172943,
"num_tokens": 5353428.0,
"step": 2765
},
{
"entropy": 6.14132776260376,
"epoch": 0.16096696400034866,
"grad_norm": 0.9375,
"learning_rate": 0.0002124532993880799,
"loss": 5.8757,
"mean_token_accuracy": 0.14288587495684624,
"num_tokens": 5364132.0,
"step": 2770
},
{
"entropy": 6.148387336730957,
"epoch": 0.1612575180869919,
"grad_norm": 0.94921875,
"learning_rate": 0.00021132249901695044,
"loss": 5.853,
"mean_token_accuracy": 0.14395386576652527,
"num_tokens": 5374066.0,
"step": 2775
},
{
"entropy": 6.0949657440185545,
"epoch": 0.16154807217363512,
"grad_norm": 1.1015625,
"learning_rate": 0.00021019344439667705,
"loss": 5.8179,
"mean_token_accuracy": 0.1493311658501625,
"num_tokens": 5383479.0,
"step": 2780
},
{
"entropy": 5.99338116645813,
"epoch": 0.16183862626027834,
"grad_norm": 0.9921875,
"learning_rate": 0.00020906616648086213,
"loss": 5.683,
"mean_token_accuracy": 0.15754484832286836,
"num_tokens": 5392894.0,
"step": 2785
},
{
"entropy": 6.032156229019165,
"epoch": 0.1621291803469216,
"grad_norm": 0.984375,
"learning_rate": 0.00020794069617439942,
"loss": 5.8144,
"mean_token_accuracy": 0.1486166849732399,
"num_tokens": 5402886.0,
"step": 2790
},
{
"entropy": 6.081609296798706,
"epoch": 0.1624197344335648,
"grad_norm": 1.015625,
"learning_rate": 0.00020681706433262593,
"loss": 5.7009,
"mean_token_accuracy": 0.1564814940094948,
"num_tokens": 5411656.0,
"step": 2795
},
{
"entropy": 6.125734090805054,
"epoch": 0.16271028852020802,
"grad_norm": 1.046875,
"learning_rate": 0.00020569530176047602,
"loss": 5.8456,
"mean_token_accuracy": 0.14672704488039018,
"num_tokens": 5421650.0,
"step": 2800
},
{
"entropy": 6.053952217102051,
"epoch": 0.16300084260685127,
"grad_norm": 0.97265625,
"learning_rate": 0.0002045754392116374,
"loss": 5.7943,
"mean_token_accuracy": 0.153436142206192,
"num_tokens": 5431154.0,
"step": 2805
},
{
"entropy": 6.064789199829102,
"epoch": 0.1632913966934945,
"grad_norm": 1.0234375,
"learning_rate": 0.00020345750738770757,
"loss": 5.8505,
"mean_token_accuracy": 0.14092413783073426,
"num_tokens": 5441464.0,
"step": 2810
},
{
"entropy": 6.110472869873047,
"epoch": 0.16358195078013774,
"grad_norm": 0.9765625,
"learning_rate": 0.00020234153693735214,
"loss": 5.8195,
"mean_token_accuracy": 0.14966847896575927,
"num_tokens": 5452077.0,
"step": 2815
},
{
"entropy": 6.1211137771606445,
"epoch": 0.16387250486678095,
"grad_norm": 1.0625,
"learning_rate": 0.0002012275584554647,
"loss": 5.8616,
"mean_token_accuracy": 0.14003771468997,
"num_tokens": 5461792.0,
"step": 2820
},
{
"entropy": 6.047807073593139,
"epoch": 0.16416305895342417,
"grad_norm": 1.0234375,
"learning_rate": 0.00020011560248232803,
"loss": 5.7094,
"mean_token_accuracy": 0.15667359828948973,
"num_tokens": 5471637.0,
"step": 2825
},
{
"entropy": 6.167580080032349,
"epoch": 0.16445361304006742,
"grad_norm": 1.015625,
"learning_rate": 0.00019900569950277692,
"loss": 5.9432,
"mean_token_accuracy": 0.14090102761983872,
"num_tokens": 5482341.0,
"step": 2830
},
{
"entropy": 6.063886308670044,
"epoch": 0.16474416712671064,
"grad_norm": 1.078125,
"learning_rate": 0.00019789787994536228,
"loss": 5.7621,
"mean_token_accuracy": 0.15223144590854645,
"num_tokens": 5492335.0,
"step": 2835
},
{
"entropy": 6.057334852218628,
"epoch": 0.16503472121335386,
"grad_norm": 0.97265625,
"learning_rate": 0.00019679217418151667,
"loss": 5.7486,
"mean_token_accuracy": 0.15512095093727113,
"num_tokens": 5501879.0,
"step": 2840
},
{
"entropy": 6.074555587768555,
"epoch": 0.1653252752999971,
"grad_norm": 0.91015625,
"learning_rate": 0.00019568861252472236,
"loss": 5.7906,
"mean_token_accuracy": 0.15294522792100906,
"num_tokens": 5512419.0,
"step": 2845
},
{
"entropy": 6.052340698242188,
"epoch": 0.16561582938664032,
"grad_norm": 1.2578125,
"learning_rate": 0.00019458722522967952,
"loss": 5.6966,
"mean_token_accuracy": 0.15103003978729249,
"num_tokens": 5521017.0,
"step": 2850
},
{
"entropy": 6.01881628036499,
"epoch": 0.16590638347328354,
"grad_norm": 1.1015625,
"learning_rate": 0.00019348804249147723,
"loss": 5.7061,
"mean_token_accuracy": 0.14818918108940124,
"num_tokens": 5530916.0,
"step": 2855
},
{
"entropy": 6.014477682113648,
"epoch": 0.1661969375599268,
"grad_norm": 0.97265625,
"learning_rate": 0.0001923910944447655,
"loss": 5.6511,
"mean_token_accuracy": 0.16009259968996048,
"num_tokens": 5540228.0,
"step": 2860
},
{
"entropy": 6.070650434494018,
"epoch": 0.16648749164657,
"grad_norm": 1.03125,
"learning_rate": 0.00019129641116292928,
"loss": 5.7298,
"mean_token_accuracy": 0.15569742172956466,
"num_tokens": 5549921.0,
"step": 2865
},
{
"entropy": 6.1058845043182375,
"epoch": 0.16677804573321323,
"grad_norm": 1.078125,
"learning_rate": 0.00019020402265726343,
"loss": 5.8318,
"mean_token_accuracy": 0.143881855905056,
"num_tokens": 5560308.0,
"step": 2870
},
{
"entropy": 6.0743451595306395,
"epoch": 0.16706859981985647,
"grad_norm": 1.125,
"learning_rate": 0.0001891139588761509,
"loss": 5.7595,
"mean_token_accuracy": 0.1477736845612526,
"num_tokens": 5569026.0,
"step": 2875
},
{
"entropy": 6.015086269378662,
"epoch": 0.1673591539064997,
"grad_norm": 1.03125,
"learning_rate": 0.00018802624970424076,
"loss": 5.725,
"mean_token_accuracy": 0.15312366485595702,
"num_tokens": 5578812.0,
"step": 2880
},
{
"entropy": 6.130430459976196,
"epoch": 0.1676497079931429,
"grad_norm": 1.015625,
"learning_rate": 0.00018694092496162945,
"loss": 5.831,
"mean_token_accuracy": 0.14988720864057542,
"num_tokens": 5588763.0,
"step": 2885
},
{
"entropy": 6.03325777053833,
"epoch": 0.16794026207978616,
"grad_norm": 1.0859375,
"learning_rate": 0.00018585801440304306,
"loss": 5.6702,
"mean_token_accuracy": 0.15091593116521834,
"num_tokens": 5597719.0,
"step": 2890
},
{
"entropy": 6.071933698654175,
"epoch": 0.16823081616642938,
"grad_norm": 1.0390625,
"learning_rate": 0.00018477754771702165,
"loss": 5.7461,
"mean_token_accuracy": 0.14670687392354012,
"num_tokens": 5607376.0,
"step": 2895
},
{
"entropy": 6.061179494857788,
"epoch": 0.16852137025307262,
"grad_norm": 1.0078125,
"learning_rate": 0.00018369955452510506,
"loss": 5.757,
"mean_token_accuracy": 0.14725697934627532,
"num_tokens": 5617227.0,
"step": 2900
},
{
"entropy": 5.983553457260132,
"epoch": 0.16881192433971584,
"grad_norm": 1.0078125,
"learning_rate": 0.0001826240643810212,
"loss": 5.6661,
"mean_token_accuracy": 0.1538853704929352,
"num_tokens": 5626241.0,
"step": 2905
},
{
"entropy": 6.049111032485962,
"epoch": 0.16910247842635906,
"grad_norm": 0.984375,
"learning_rate": 0.0001815511067698758,
"loss": 5.8158,
"mean_token_accuracy": 0.14741248339414598,
"num_tokens": 5636969.0,
"step": 2910
},
{
"entropy": 5.976936292648316,
"epoch": 0.1693930325130023,
"grad_norm": 0.984375,
"learning_rate": 0.0001804807111073436,
"loss": 5.6947,
"mean_token_accuracy": 0.14919188469648362,
"num_tokens": 5646430.0,
"step": 2915
},
{
"entropy": 6.0697746753692625,
"epoch": 0.16968358659964553,
"grad_norm": 1.046875,
"learning_rate": 0.0001794129067388625,
"loss": 5.7424,
"mean_token_accuracy": 0.15479477643966674,
"num_tokens": 5656049.0,
"step": 2920
},
{
"entropy": 6.103779983520508,
"epoch": 0.16997414068628874,
"grad_norm": 1.0625,
"learning_rate": 0.00017834772293882868,
"loss": 5.7761,
"mean_token_accuracy": 0.14724364280700683,
"num_tokens": 5665657.0,
"step": 2925
},
{
"entropy": 6.004815101623535,
"epoch": 0.170264694772932,
"grad_norm": 1.078125,
"learning_rate": 0.000177285188909794,
"loss": 5.6713,
"mean_token_accuracy": 0.14558330550789833,
"num_tokens": 5675306.0,
"step": 2930
},
{
"entropy": 6.077218818664551,
"epoch": 0.1705552488595752,
"grad_norm": 1.0625,
"learning_rate": 0.0001762253337816656,
"loss": 5.8962,
"mean_token_accuracy": 0.14256232976913452,
"num_tokens": 5685295.0,
"step": 2935
},
{
"entropy": 5.960014724731446,
"epoch": 0.17084580294621843,
"grad_norm": 1.046875,
"learning_rate": 0.00017516818661090738,
"loss": 5.5849,
"mean_token_accuracy": 0.153633750975132,
"num_tokens": 5694626.0,
"step": 2940
},
{
"entropy": 6.0667417526245115,
"epoch": 0.17113635703286167,
"grad_norm": 0.98046875,
"learning_rate": 0.0001741137763797428,
"loss": 5.7658,
"mean_token_accuracy": 0.1459375351667404,
"num_tokens": 5704247.0,
"step": 2945
},
{
"entropy": 6.021193981170654,
"epoch": 0.1714269111195049,
"grad_norm": 1.1484375,
"learning_rate": 0.00017306213199536115,
"loss": 5.6436,
"mean_token_accuracy": 0.16296780705451966,
"num_tokens": 5712536.0,
"step": 2950
},
{
"entropy": 6.047375822067261,
"epoch": 0.1717174652061481,
"grad_norm": 1.046875,
"learning_rate": 0.0001720132822891243,
"loss": 5.7144,
"mean_token_accuracy": 0.1537844330072403,
"num_tokens": 5721641.0,
"step": 2955
},
{
"entropy": 5.961157178878784,
"epoch": 0.17200801929279136,
"grad_norm": 1.0234375,
"learning_rate": 0.0001709672560157769,
"loss": 5.6832,
"mean_token_accuracy": 0.15097189098596572,
"num_tokens": 5731404.0,
"step": 2960
},
{
"entropy": 6.061686420440674,
"epoch": 0.17229857337943458,
"grad_norm": 0.984375,
"learning_rate": 0.00016992408185265758,
"loss": 5.7934,
"mean_token_accuracy": 0.14903522282838821,
"num_tokens": 5741006.0,
"step": 2965
},
{
"entropy": 6.143270587921142,
"epoch": 0.17258912746607782,
"grad_norm": 1.0,
"learning_rate": 0.00016888378839891298,
"loss": 5.7955,
"mean_token_accuracy": 0.14605200439691543,
"num_tokens": 5751124.0,
"step": 2970
},
{
"entropy": 6.10756402015686,
"epoch": 0.17287968155272104,
"grad_norm": 1.0703125,
"learning_rate": 0.0001678464041747137,
"loss": 5.7795,
"mean_token_accuracy": 0.15010488629341126,
"num_tokens": 5761261.0,
"step": 2975
},
{
"entropy": 6.070679616928101,
"epoch": 0.17317023563936426,
"grad_norm": 1.0546875,
"learning_rate": 0.00016681195762047223,
"loss": 5.7228,
"mean_token_accuracy": 0.1600602760910988,
"num_tokens": 5769855.0,
"step": 2980
},
{
"entropy": 6.0531364440917965,
"epoch": 0.1734607897260075,
"grad_norm": 1.1015625,
"learning_rate": 0.00016578047709606337,
"loss": 5.7577,
"mean_token_accuracy": 0.14312802702188493,
"num_tokens": 5779394.0,
"step": 2985
},
{
"entropy": 6.0887946605682375,
"epoch": 0.17375134381265073,
"grad_norm": 0.98046875,
"learning_rate": 0.00016475199088004678,
"loss": 5.8185,
"mean_token_accuracy": 0.15020564645528794,
"num_tokens": 5789442.0,
"step": 2990
},
{
"entropy": 6.0648823261260985,
"epoch": 0.17404189789929395,
"grad_norm": 1.015625,
"learning_rate": 0.00016372652716889163,
"loss": 5.7086,
"mean_token_accuracy": 0.15479273200035096,
"num_tokens": 5798269.0,
"step": 2995
},
{
"entropy": 6.102453422546387,
"epoch": 0.1743324519859372,
"grad_norm": 1.015625,
"learning_rate": 0.0001627041140762035,
"loss": 5.7227,
"mean_token_accuracy": 0.14934398829936982,
"num_tokens": 5808608.0,
"step": 3000
},
{
"entropy": 6.100079727172852,
"epoch": 0.1746230060725804,
"grad_norm": 1.1484375,
"learning_rate": 0.00016168477963195382,
"loss": 5.777,
"mean_token_accuracy": 0.14889512956142426,
"num_tokens": 5818134.0,
"step": 3005
},
{
"entropy": 6.064034843444825,
"epoch": 0.17491356015922363,
"grad_norm": 0.984375,
"learning_rate": 0.0001606685517817114,
"loss": 5.7285,
"mean_token_accuracy": 0.15235000252723693,
"num_tokens": 5828237.0,
"step": 3010
},
{
"entropy": 6.020033311843872,
"epoch": 0.17520411424586688,
"grad_norm": 1.015625,
"learning_rate": 0.00015965545838587592,
"loss": 5.7206,
"mean_token_accuracy": 0.1511564001441002,
"num_tokens": 5837597.0,
"step": 3015
},
{
"entropy": 6.070970773696899,
"epoch": 0.1754946683325101,
"grad_norm": 1.0546875,
"learning_rate": 0.00015864552721891467,
"loss": 5.6728,
"mean_token_accuracy": 0.1517861396074295,
"num_tokens": 5846728.0,
"step": 3020
},
{
"entropy": 6.065035820007324,
"epoch": 0.17578522241915331,
"grad_norm": 0.91796875,
"learning_rate": 0.00015763878596860076,
"loss": 5.831,
"mean_token_accuracy": 0.14596770107746124,
"num_tokens": 5857655.0,
"step": 3025
},
{
"entropy": 5.956533288955688,
"epoch": 0.17607577650579656,
"grad_norm": 0.94921875,
"learning_rate": 0.00015663526223525412,
"loss": 5.6927,
"mean_token_accuracy": 0.1539006546139717,
"num_tokens": 5868854.0,
"step": 3030
},
{
"entropy": 6.1010034561157225,
"epoch": 0.17636633059243978,
"grad_norm": 0.9609375,
"learning_rate": 0.0001556349835309848,
"loss": 5.8291,
"mean_token_accuracy": 0.1526801697909832,
"num_tokens": 5879362.0,
"step": 3035
},
{
"entropy": 6.131938362121582,
"epoch": 0.176656884679083,
"grad_norm": 1.0,
"learning_rate": 0.0001546379772789389,
"loss": 5.8241,
"mean_token_accuracy": 0.15398107543587686,
"num_tokens": 5888753.0,
"step": 3040
},
{
"entropy": 6.086278247833252,
"epoch": 0.17694743876572624,
"grad_norm": 1.078125,
"learning_rate": 0.00015364427081254622,
"loss": 5.7149,
"mean_token_accuracy": 0.15152743011713027,
"num_tokens": 5898860.0,
"step": 3045
},
{
"entropy": 6.081228303909302,
"epoch": 0.17723799285236946,
"grad_norm": 1.09375,
"learning_rate": 0.00015265389137477165,
"loss": 5.7008,
"mean_token_accuracy": 0.14895583540201188,
"num_tokens": 5908191.0,
"step": 3050
},
{
"entropy": 5.977576351165771,
"epoch": 0.1775285469390127,
"grad_norm": 1.046875,
"learning_rate": 0.00015166686611736786,
"loss": 5.6745,
"mean_token_accuracy": 0.15492946803569793,
"num_tokens": 5918266.0,
"step": 3055
},
{
"entropy": 5.968826675415039,
"epoch": 0.17781910102565593,
"grad_norm": 0.99609375,
"learning_rate": 0.00015068322210013064,
"loss": 5.6773,
"mean_token_accuracy": 0.15108609497547149,
"num_tokens": 5927628.0,
"step": 3060
},
{
"entropy": 6.01093373298645,
"epoch": 0.17810965511229915,
"grad_norm": 1.0859375,
"learning_rate": 0.0001497029862901578,
"loss": 5.6803,
"mean_token_accuracy": 0.15184399709105492,
"num_tokens": 5937323.0,
"step": 3065
},
{
"entropy": 6.0069934844970705,
"epoch": 0.1784002091989424,
"grad_norm": 1.0,
"learning_rate": 0.00014872618556110905,
"loss": 5.7033,
"mean_token_accuracy": 0.15416699647903442,
"num_tokens": 5946767.0,
"step": 3070
},
{
"entropy": 6.092588567733765,
"epoch": 0.1786907632855856,
"grad_norm": 1.046875,
"learning_rate": 0.00014775284669246992,
"loss": 5.7472,
"mean_token_accuracy": 0.1522138647735119,
"num_tokens": 5956469.0,
"step": 3075
},
{
"entropy": 6.093777990341186,
"epoch": 0.17898131737222883,
"grad_norm": 1.0078125,
"learning_rate": 0.00014678299636881716,
"loss": 5.7564,
"mean_token_accuracy": 0.15174834728240966,
"num_tokens": 5965882.0,
"step": 3080
},
{
"entropy": 6.043925333023071,
"epoch": 0.17927187145887208,
"grad_norm": 1.0703125,
"learning_rate": 0.0001458166611790873,
"loss": 5.6561,
"mean_token_accuracy": 0.15863914489746095,
"num_tokens": 5974696.0,
"step": 3085
},
{
"entropy": 6.098592472076416,
"epoch": 0.1795624255455153,
"grad_norm": 1.0078125,
"learning_rate": 0.00014485386761584773,
"loss": 5.771,
"mean_token_accuracy": 0.15556092113256453,
"num_tokens": 5984525.0,
"step": 3090
},
{
"entropy": 6.02237606048584,
"epoch": 0.17985297963215852,
"grad_norm": 1.046875,
"learning_rate": 0.00014389464207457042,
"loss": 5.7049,
"mean_token_accuracy": 0.1512262910604477,
"num_tokens": 5993382.0,
"step": 3095
},
{
"entropy": 5.985553407669068,
"epoch": 0.18014353371880176,
"grad_norm": 1.1328125,
"learning_rate": 0.00014293901085290795,
"loss": 5.6662,
"mean_token_accuracy": 0.15523958802223206,
"num_tokens": 6003127.0,
"step": 3100
},
{
"entropy": 6.131969976425171,
"epoch": 0.18043408780544498,
"grad_norm": 1.0859375,
"learning_rate": 0.00014198700014997307,
"loss": 5.8075,
"mean_token_accuracy": 0.1487472876906395,
"num_tokens": 6013336.0,
"step": 3105
},
{
"entropy": 6.103825998306275,
"epoch": 0.1807246418920882,
"grad_norm": 0.953125,
"learning_rate": 0.00014103863606562016,
"loss": 5.8092,
"mean_token_accuracy": 0.14103479385375978,
"num_tokens": 6023446.0,
"step": 3110
},
{
"entropy": 6.070917987823487,
"epoch": 0.18101519597873145,
"grad_norm": 0.96875,
"learning_rate": 0.00014009394459972964,
"loss": 5.7103,
"mean_token_accuracy": 0.15287835896015167,
"num_tokens": 6034264.0,
"step": 3115
},
{
"entropy": 6.039470911026001,
"epoch": 0.18130575006537467,
"grad_norm": 1.1484375,
"learning_rate": 0.00013915295165149513,
"loss": 5.7217,
"mean_token_accuracy": 0.15219166725873948,
"num_tokens": 6043048.0,
"step": 3120
},
{
"entropy": 6.063743162155151,
"epoch": 0.18159630415201788,
"grad_norm": 0.96875,
"learning_rate": 0.00013821568301871384,
"loss": 5.7789,
"mean_token_accuracy": 0.14522581547498703,
"num_tokens": 6053255.0,
"step": 3125
},
{
"entropy": 5.9902284145355225,
"epoch": 0.18188685823866113,
"grad_norm": 1.015625,
"learning_rate": 0.00013728216439707862,
"loss": 5.6082,
"mean_token_accuracy": 0.1634930595755577,
"num_tokens": 6062968.0,
"step": 3130
},
{
"entropy": 6.016612720489502,
"epoch": 0.18217741232530435,
"grad_norm": 1.078125,
"learning_rate": 0.00013635242137947419,
"loss": 5.5827,
"mean_token_accuracy": 0.16410193741321563,
"num_tokens": 6072071.0,
"step": 3135
},
{
"entropy": 6.09333872795105,
"epoch": 0.1824679664119476,
"grad_norm": 0.9609375,
"learning_rate": 0.00013542647945527498,
"loss": 5.8434,
"mean_token_accuracy": 0.1482686847448349,
"num_tokens": 6081887.0,
"step": 3140
},
{
"entropy": 6.0756289005279545,
"epoch": 0.18275852049859082,
"grad_norm": 0.97265625,
"learning_rate": 0.0001345043640096465,
"loss": 5.7092,
"mean_token_accuracy": 0.1483364373445511,
"num_tokens": 6092254.0,
"step": 3145
},
{
"entropy": 6.047313165664673,
"epoch": 0.18304907458523403,
"grad_norm": 1.0234375,
"learning_rate": 0.00013358610032284957,
"loss": 5.72,
"mean_token_accuracy": 0.15549504160881042,
"num_tokens": 6102497.0,
"step": 3150
},
{
"entropy": 6.062089967727661,
"epoch": 0.18333962867187728,
"grad_norm": 1.1015625,
"learning_rate": 0.000132671713569547,
"loss": 5.734,
"mean_token_accuracy": 0.1556983083486557,
"num_tokens": 6112277.0,
"step": 3155
},
{
"entropy": 6.076785612106323,
"epoch": 0.1836301827585205,
"grad_norm": 1.1328125,
"learning_rate": 0.0001317612288181136,
"loss": 5.7002,
"mean_token_accuracy": 0.1554645776748657,
"num_tokens": 6121471.0,
"step": 3160
},
{
"entropy": 6.072947454452515,
"epoch": 0.18392073684516372,
"grad_norm": 0.953125,
"learning_rate": 0.00013085467102994864,
"loss": 5.7982,
"mean_token_accuracy": 0.14808910414576532,
"num_tokens": 6131717.0,
"step": 3165
},
{
"entropy": 6.053280687332153,
"epoch": 0.18421129093180696,
"grad_norm": 1.125,
"learning_rate": 0.00012995206505879198,
"loss": 5.708,
"mean_token_accuracy": 0.15172735154628753,
"num_tokens": 6142002.0,
"step": 3170
},
{
"entropy": 6.130218172073365,
"epoch": 0.18450184501845018,
"grad_norm": 0.9921875,
"learning_rate": 0.0001290534356500421,
"loss": 5.7836,
"mean_token_accuracy": 0.14181277081370353,
"num_tokens": 6152034.0,
"step": 3175
},
{
"entropy": 6.088483095169067,
"epoch": 0.1847923991050934,
"grad_norm": 1.0390625,
"learning_rate": 0.00012815880744007827,
"loss": 5.7851,
"mean_token_accuracy": 0.1522969976067543,
"num_tokens": 6161737.0,
"step": 3180
},
{
"entropy": 5.9690474510192875,
"epoch": 0.18508295319173665,
"grad_norm": 1.0703125,
"learning_rate": 0.00012726820495558483,
"loss": 5.6982,
"mean_token_accuracy": 0.153187994658947,
"num_tokens": 6170088.0,
"step": 3185
},
{
"entropy": 6.107737588882446,
"epoch": 0.18537350727837987,
"grad_norm": 1.046875,
"learning_rate": 0.00012638165261287868,
"loss": 5.8119,
"mean_token_accuracy": 0.1520651862025261,
"num_tokens": 6179914.0,
"step": 3190
},
{
"entropy": 6.0809577941894535,
"epoch": 0.1856640613650231,
"grad_norm": 1.0703125,
"learning_rate": 0.0001254991747172402,
"loss": 5.7642,
"mean_token_accuracy": 0.14715377390384674,
"num_tokens": 6191145.0,
"step": 3195
},
{
"entropy": 5.9991779804229735,
"epoch": 0.18595461545166633,
"grad_norm": 0.97265625,
"learning_rate": 0.00012462079546224662,
"loss": 5.6445,
"mean_token_accuracy": 0.15297795236110687,
"num_tokens": 6201587.0,
"step": 3200
},
{
"entropy": 6.071609449386597,
"epoch": 0.18624516953830955,
"grad_norm": 0.98046875,
"learning_rate": 0.00012374653892910896,
"loss": 5.7869,
"mean_token_accuracy": 0.14952635765075684,
"num_tokens": 6210659.0,
"step": 3205
},
{
"entropy": 6.06493353843689,
"epoch": 0.1865357236249528,
"grad_norm": 1.1015625,
"learning_rate": 0.00012287642908601166,
"loss": 5.6995,
"mean_token_accuracy": 0.16741256564855575,
"num_tokens": 6220179.0,
"step": 3210
},
{
"entropy": 6.087040662765503,
"epoch": 0.18682627771159602,
"grad_norm": 1.0078125,
"learning_rate": 0.00012201048978745569,
"loss": 5.8608,
"mean_token_accuracy": 0.15103042125701904,
"num_tokens": 6230568.0,
"step": 3215
},
{
"entropy": 6.1079224109649655,
"epoch": 0.18711683179823924,
"grad_norm": 1.0390625,
"learning_rate": 0.00012114874477360427,
"loss": 5.7732,
"mean_token_accuracy": 0.15811807066202163,
"num_tokens": 6240571.0,
"step": 3220
},
{
"entropy": 6.066962003707886,
"epoch": 0.18740738588488248,
"grad_norm": 1.03125,
"learning_rate": 0.00012029121766963236,
"loss": 5.6959,
"mean_token_accuracy": 0.151812843978405,
"num_tokens": 6250232.0,
"step": 3225
},
{
"entropy": 6.136503267288208,
"epoch": 0.1876979399715257,
"grad_norm": 1.09375,
"learning_rate": 0.00011943793198507858,
"loss": 5.8473,
"mean_token_accuracy": 0.14256954118609427,
"num_tokens": 6260191.0,
"step": 3230
},
{
"entropy": 6.1150651454925535,
"epoch": 0.18798849405816892,
"grad_norm": 0.94921875,
"learning_rate": 0.00011858891111320104,
"loss": 5.7593,
"mean_token_accuracy": 0.15107759982347488,
"num_tokens": 6270348.0,
"step": 3235
},
{
"entropy": 6.040066814422607,
"epoch": 0.18827904814481217,
"grad_norm": 1.046875,
"learning_rate": 0.0001177441783303359,
"loss": 5.7222,
"mean_token_accuracy": 0.14626873433589935,
"num_tokens": 6281212.0,
"step": 3240
},
{
"entropy": 6.0591840744018555,
"epoch": 0.18856960223145539,
"grad_norm": 1.03125,
"learning_rate": 0.00011690375679525896,
"loss": 5.7569,
"mean_token_accuracy": 0.15225170105695723,
"num_tokens": 6290194.0,
"step": 3245
},
{
"entropy": 6.092053413391113,
"epoch": 0.1888601563180986,
"grad_norm": 1.046875,
"learning_rate": 0.00011606766954855124,
"loss": 5.7817,
"mean_token_accuracy": 0.15150906667113304,
"num_tokens": 6299875.0,
"step": 3250
},
{
"entropy": 6.102940845489502,
"epoch": 0.18915071040474185,
"grad_norm": 1.0859375,
"learning_rate": 0.00011523593951196702,
"loss": 5.7722,
"mean_token_accuracy": 0.15258708745241165,
"num_tokens": 6309359.0,
"step": 3255
},
{
"entropy": 6.080491304397583,
"epoch": 0.18944126449138507,
"grad_norm": 0.953125,
"learning_rate": 0.00011440858948780523,
"loss": 5.7347,
"mean_token_accuracy": 0.1517005071043968,
"num_tokens": 6320783.0,
"step": 3260
},
{
"entropy": 5.987936544418335,
"epoch": 0.1897318185780283,
"grad_norm": 1.1015625,
"learning_rate": 0.00011358564215828484,
"loss": 5.5715,
"mean_token_accuracy": 0.1621371328830719,
"num_tokens": 6330150.0,
"step": 3265
},
{
"entropy": 5.964755249023438,
"epoch": 0.19002237266467153,
"grad_norm": 1.0234375,
"learning_rate": 0.00011276712008492254,
"loss": 5.7216,
"mean_token_accuracy": 0.15560902208089827,
"num_tokens": 6340421.0,
"step": 3270
},
{
"entropy": 6.003197145462036,
"epoch": 0.19031292675131475,
"grad_norm": 1.046875,
"learning_rate": 0.00011195304570791451,
"loss": 5.7291,
"mean_token_accuracy": 0.14981625527143477,
"num_tokens": 6351093.0,
"step": 3275
},
{
"entropy": 6.069776487350464,
"epoch": 0.19060348083795797,
"grad_norm": 1.0078125,
"learning_rate": 0.00011114344134552094,
"loss": 5.7178,
"mean_token_accuracy": 0.14927603229880332,
"num_tokens": 6360373.0,
"step": 3280
},
{
"entropy": 6.128297328948975,
"epoch": 0.19089403492460122,
"grad_norm": 1.1015625,
"learning_rate": 0.0001103383291934545,
"loss": 5.8211,
"mean_token_accuracy": 0.15346538573503493,
"num_tokens": 6371291.0,
"step": 3285
},
{
"entropy": 6.147673416137695,
"epoch": 0.19118458901124444,
"grad_norm": 1.125,
"learning_rate": 0.00010953773132427141,
"loss": 5.706,
"mean_token_accuracy": 0.15231086164712906,
"num_tokens": 6380869.0,
"step": 3290
},
{
"entropy": 6.075566673278809,
"epoch": 0.19147514309788768,
"grad_norm": 1.0625,
"learning_rate": 0.00010874166968676677,
"loss": 5.7058,
"mean_token_accuracy": 0.15668356716632842,
"num_tokens": 6390174.0,
"step": 3295
},
{
"entropy": 6.030622339248657,
"epoch": 0.1917656971845309,
"grad_norm": 1.0703125,
"learning_rate": 0.00010795016610537251,
"loss": 5.6748,
"mean_token_accuracy": 0.15518099069595337,
"num_tokens": 6399768.0,
"step": 3300
},
{
"entropy": 6.077046346664429,
"epoch": 0.19205625127117412,
"grad_norm": 1.0625,
"learning_rate": 0.00010716324227955904,
"loss": 5.802,
"mean_token_accuracy": 0.1477528505027294,
"num_tokens": 6409460.0,
"step": 3305
},
{
"entropy": 6.034036684036255,
"epoch": 0.19234680535781737,
"grad_norm": 1.078125,
"learning_rate": 0.0001063809197832406,
"loss": 5.6572,
"mean_token_accuracy": 0.16222479790449143,
"num_tokens": 6417968.0,
"step": 3310
},
{
"entropy": 6.085168313980103,
"epoch": 0.1926373594444606,
"grad_norm": 1.09375,
"learning_rate": 0.00010560322006418368,
"loss": 5.7371,
"mean_token_accuracy": 0.1469581514596939,
"num_tokens": 6427402.0,
"step": 3315
},
{
"entropy": 6.114069175720215,
"epoch": 0.1929279135311038,
"grad_norm": 1.0390625,
"learning_rate": 0.00010483016444341887,
"loss": 5.7702,
"mean_token_accuracy": 0.15116747766733168,
"num_tokens": 6437203.0,
"step": 3320
},
{
"entropy": 6.091079235076904,
"epoch": 0.19321846761774705,
"grad_norm": 1.0234375,
"learning_rate": 0.00010406177411465654,
"loss": 5.6856,
"mean_token_accuracy": 0.15697493702173232,
"num_tokens": 6446440.0,
"step": 3325
},
{
"entropy": 6.068310308456421,
"epoch": 0.19350902170439027,
"grad_norm": 1.0234375,
"learning_rate": 0.00010329807014370562,
"loss": 5.6496,
"mean_token_accuracy": 0.157624289393425,
"num_tokens": 6455842.0,
"step": 3330
},
{
"entropy": 6.069522714614868,
"epoch": 0.1937995757910335,
"grad_norm": 1.109375,
"learning_rate": 0.00010253907346789632,
"loss": 5.6689,
"mean_token_accuracy": 0.15749077796936034,
"num_tokens": 6464538.0,
"step": 3335
},
{
"entropy": 5.971843290328979,
"epoch": 0.19409012987767674,
"grad_norm": 0.984375,
"learning_rate": 0.00010178480489550596,
"loss": 5.6299,
"mean_token_accuracy": 0.15191589742898942,
"num_tokens": 6474646.0,
"step": 3340
},
{
"entropy": 6.010790205001831,
"epoch": 0.19438068396431996,
"grad_norm": 1.109375,
"learning_rate": 0.00010103528510518836,
"loss": 5.7641,
"mean_token_accuracy": 0.14547111392021178,
"num_tokens": 6484397.0,
"step": 3345
},
{
"entropy": 6.044341564178467,
"epoch": 0.19467123805096317,
"grad_norm": 1.046875,
"learning_rate": 0.0001002905346454073,
"loss": 5.7108,
"mean_token_accuracy": 0.14943148642778398,
"num_tokens": 6494254.0,
"step": 3350
},
{
"entropy": 6.041453218460083,
"epoch": 0.19496179213760642,
"grad_norm": 0.9921875,
"learning_rate": 9.955057393387285e-05,
"loss": 5.6536,
"mean_token_accuracy": 0.16039068698883058,
"num_tokens": 6503862.0,
"step": 3355
},
{
"entropy": 6.147222709655762,
"epoch": 0.19525234622424964,
"grad_norm": 1.0703125,
"learning_rate": 9.88154232569816e-05,
"loss": 5.7122,
"mean_token_accuracy": 0.14973994195461274,
"num_tokens": 6513644.0,
"step": 3360
},
{
"entropy": 6.084700441360473,
"epoch": 0.1955429003108929,
"grad_norm": 1.0390625,
"learning_rate": 9.808510276926075e-05,
"loss": 5.7,
"mean_token_accuracy": 0.14390370547771453,
"num_tokens": 6523991.0,
"step": 3365
},
{
"entropy": 6.049271821975708,
"epoch": 0.1958334543975361,
"grad_norm": 1.0390625,
"learning_rate": 9.735963249281549e-05,
"loss": 5.7439,
"mean_token_accuracy": 0.14755677580833435,
"num_tokens": 6533761.0,
"step": 3370
},
{
"entropy": 6.067014598846436,
"epoch": 0.19612400848417932,
"grad_norm": 1.1484375,
"learning_rate": 9.663903231677974e-05,
"loss": 5.7136,
"mean_token_accuracy": 0.1520987056195736,
"num_tokens": 6544115.0,
"step": 3375
},
{
"entropy": 6.102639961242676,
"epoch": 0.19641456257082257,
"grad_norm": 0.9375,
"learning_rate": 9.592332199677145e-05,
"loss": 5.7757,
"mean_token_accuracy": 0.1473358005285263,
"num_tokens": 6555207.0,
"step": 3380
},
{
"entropy": 6.06288423538208,
"epoch": 0.1967051166574658,
"grad_norm": 0.92578125,
"learning_rate": 9.521252115435061e-05,
"loss": 5.7049,
"mean_token_accuracy": 0.15305796936154364,
"num_tokens": 6564725.0,
"step": 3385
},
{
"entropy": 6.05938458442688,
"epoch": 0.196995670744109,
"grad_norm": 0.99609375,
"learning_rate": 9.450664927648126e-05,
"loss": 5.7016,
"mean_token_accuracy": 0.14934950321912766,
"num_tokens": 6575036.0,
"step": 3390
},
{
"entropy": 6.02567343711853,
"epoch": 0.19728622483075225,
"grad_norm": 1.0,
"learning_rate": 9.380572571499758e-05,
"loss": 5.7274,
"mean_token_accuracy": 0.1436111845076084,
"num_tokens": 6585489.0,
"step": 3395
},
{
"entropy": 6.0807962894439695,
"epoch": 0.19757677891739547,
"grad_norm": 0.93359375,
"learning_rate": 9.310976968607307e-05,
"loss": 5.7484,
"mean_token_accuracy": 0.1487852841615677,
"num_tokens": 6594848.0,
"step": 3400
},
{
"entropy": 6.089216613769532,
"epoch": 0.1978673330040387,
"grad_norm": 1.046875,
"learning_rate": 9.241880026969381e-05,
"loss": 5.7464,
"mean_token_accuracy": 0.14339143857359887,
"num_tokens": 6605983.0,
"step": 3405
},
{
"entropy": 6.055511140823365,
"epoch": 0.19815788709068194,
"grad_norm": 1.0703125,
"learning_rate": 9.173283640913537e-05,
"loss": 5.6737,
"mean_token_accuracy": 0.1530997022986412,
"num_tokens": 6615093.0,
"step": 3410
},
{
"entropy": 6.013196659088135,
"epoch": 0.19844844117732516,
"grad_norm": 1.1015625,
"learning_rate": 9.10518969104436e-05,
"loss": 5.6165,
"mean_token_accuracy": 0.15827906131744385,
"num_tokens": 6623551.0,
"step": 3415
},
{
"entropy": 5.9676004409790036,
"epoch": 0.19873899526396838,
"grad_norm": 1.0234375,
"learning_rate": 9.037600044191868e-05,
"loss": 5.7311,
"mean_token_accuracy": 0.1509920448064804,
"num_tokens": 6633611.0,
"step": 3420
},
{
"entropy": 6.004412841796875,
"epoch": 0.19902954935061162,
"grad_norm": 1.0859375,
"learning_rate": 8.970516553360383e-05,
"loss": 5.6986,
"mean_token_accuracy": 0.1615770772099495,
"num_tokens": 6642605.0,
"step": 3425
},
{
"entropy": 6.040467405319214,
"epoch": 0.19932010343725484,
"grad_norm": 1.125,
"learning_rate": 8.903941057677692e-05,
"loss": 5.7086,
"mean_token_accuracy": 0.15024487525224686,
"num_tokens": 6652398.0,
"step": 3430
},
{
"entropy": 6.084765291213989,
"epoch": 0.19961065752389806,
"grad_norm": 1.0625,
"learning_rate": 8.837875382344635e-05,
"loss": 5.6634,
"mean_token_accuracy": 0.15238699167966843,
"num_tokens": 6661667.0,
"step": 3435
},
{
"entropy": 6.060661029815674,
"epoch": 0.1999012116105413,
"grad_norm": 1.0234375,
"learning_rate": 8.772321338585076e-05,
"loss": 5.6415,
"mean_token_accuracy": 0.16024876087903978,
"num_tokens": 6670613.0,
"step": 3440
},
{
"entropy": 6.083744382858276,
"epoch": 0.20019176569718453,
"grad_norm": 1.046875,
"learning_rate": 8.707280723596242e-05,
"loss": 5.7538,
"mean_token_accuracy": 0.1504388488829136,
"num_tokens": 6679543.0,
"step": 3445
},
{
"entropy": 6.077122068405151,
"epoch": 0.20048231978382777,
"grad_norm": 0.95703125,
"learning_rate": 8.64275532049944e-05,
"loss": 5.8425,
"mean_token_accuracy": 0.140785413980484,
"num_tokens": 6689638.0,
"step": 3450
},
{
"entropy": 6.026986122131348,
"epoch": 0.200772873870471,
"grad_norm": 1.0390625,
"learning_rate": 8.578746898291198e-05,
"loss": 5.7096,
"mean_token_accuracy": 0.15388644784688948,
"num_tokens": 6699561.0,
"step": 3455
},
{
"entropy": 6.036118078231811,
"epoch": 0.2010634279571142,
"grad_norm": 1.0625,
"learning_rate": 8.515257211794742e-05,
"loss": 5.7424,
"mean_token_accuracy": 0.1555853232741356,
"num_tokens": 6709542.0,
"step": 3460
},
{
"entropy": 6.055869913101196,
"epoch": 0.20135398204375746,
"grad_norm": 0.99609375,
"learning_rate": 8.452288001611896e-05,
"loss": 5.6998,
"mean_token_accuracy": 0.15517012774944305,
"num_tokens": 6718545.0,
"step": 3465
},
{
"entropy": 6.0092888355255125,
"epoch": 0.20164453613040068,
"grad_norm": 1.1171875,
"learning_rate": 8.389840994075379e-05,
"loss": 5.5914,
"mean_token_accuracy": 0.15963410586118698,
"num_tokens": 6727491.0,
"step": 3470
},
{
"entropy": 6.133456993103027,
"epoch": 0.2019350902170439,
"grad_norm": 1.09375,
"learning_rate": 8.327917901201435e-05,
"loss": 5.862,
"mean_token_accuracy": 0.1492785707116127,
"num_tokens": 6737690.0,
"step": 3475
},
{
"entropy": 6.023900270462036,
"epoch": 0.20222564430368714,
"grad_norm": 1.078125,
"learning_rate": 8.266520420642931e-05,
"loss": 5.638,
"mean_token_accuracy": 0.16246868669986725,
"num_tokens": 6747049.0,
"step": 3480
},
{
"entropy": 6.069623184204102,
"epoch": 0.20251619839033036,
"grad_norm": 1.1796875,
"learning_rate": 8.205650235642828e-05,
"loss": 5.7306,
"mean_token_accuracy": 0.15011803209781646,
"num_tokens": 6756199.0,
"step": 3485
},
{
"entropy": 6.060231304168701,
"epoch": 0.20280675247697358,
"grad_norm": 1.125,
"learning_rate": 8.145309014987978e-05,
"loss": 5.6926,
"mean_token_accuracy": 0.14650730416178703,
"num_tokens": 6765595.0,
"step": 3490
},
{
"entropy": 6.029575967788697,
"epoch": 0.20309730656361683,
"grad_norm": 1.171875,
"learning_rate": 8.085498412963437e-05,
"loss": 5.628,
"mean_token_accuracy": 0.16038369089365007,
"num_tokens": 6775078.0,
"step": 3495
},
{
"entropy": 6.112934684753418,
"epoch": 0.20338786065026004,
"grad_norm": 0.95703125,
"learning_rate": 8.026220069307078e-05,
"loss": 5.7506,
"mean_token_accuracy": 0.15270390585064889,
"num_tokens": 6785419.0,
"step": 3500
},
{
"entropy": 6.068965864181519,
"epoch": 0.20367841473690326,
"grad_norm": 0.9609375,
"learning_rate": 7.967475609164621e-05,
"loss": 5.7165,
"mean_token_accuracy": 0.15362876802682876,
"num_tokens": 6795289.0,
"step": 3505
},
{
"entropy": 5.970501708984375,
"epoch": 0.2039689688235465,
"grad_norm": 0.99609375,
"learning_rate": 7.909266643045124e-05,
"loss": 5.606,
"mean_token_accuracy": 0.16173766702413558,
"num_tokens": 6804425.0,
"step": 3510
},
{
"entropy": 6.029342889785767,
"epoch": 0.20425952291018973,
"grad_norm": 1.0546875,
"learning_rate": 7.851594766776802e-05,
"loss": 5.6689,
"mean_token_accuracy": 0.1656169682741165,
"num_tokens": 6814102.0,
"step": 3515
},
{
"entropy": 6.081440544128418,
"epoch": 0.20455007699683295,
"grad_norm": 0.98828125,
"learning_rate": 7.794461561463265e-05,
"loss": 5.6685,
"mean_token_accuracy": 0.16380728930234909,
"num_tokens": 6824693.0,
"step": 3520
},
{
"entropy": 6.080014085769653,
"epoch": 0.2048406310834762,
"grad_norm": 0.96875,
"learning_rate": 7.7378685934402e-05,
"loss": 5.7272,
"mean_token_accuracy": 0.1497935637831688,
"num_tokens": 6834951.0,
"step": 3525
},
{
"entropy": 6.026291465759277,
"epoch": 0.2051311851701194,
"grad_norm": 1.0703125,
"learning_rate": 7.68181741423242e-05,
"loss": 5.6537,
"mean_token_accuracy": 0.15431195497512817,
"num_tokens": 6843922.0,
"step": 3530
},
{
"entropy": 5.994410610198974,
"epoch": 0.20542173925676266,
"grad_norm": 1.109375,
"learning_rate": 7.626309560511313e-05,
"loss": 5.5725,
"mean_token_accuracy": 0.15747455805540084,
"num_tokens": 6852903.0,
"step": 3535
},
{
"entropy": 6.06242561340332,
"epoch": 0.20571229334340588,
"grad_norm": 1.0859375,
"learning_rate": 7.571346554052724e-05,
"loss": 5.6528,
"mean_token_accuracy": 0.15987856090068817,
"num_tokens": 6862748.0,
"step": 3540
},
{
"entropy": 6.058012628555298,
"epoch": 0.2060028474300491,
"grad_norm": 0.984375,
"learning_rate": 7.516929901695249e-05,
"loss": 5.7014,
"mean_token_accuracy": 0.15260830670595169,
"num_tokens": 6873043.0,
"step": 3545
},
{
"entropy": 6.067829179763794,
"epoch": 0.20629340151669234,
"grad_norm": 1.0,
"learning_rate": 7.463061095298893e-05,
"loss": 5.6857,
"mean_token_accuracy": 0.15442362278699875,
"num_tokens": 6883256.0,
"step": 3550
},
{
"entropy": 6.019204711914062,
"epoch": 0.20658395560333556,
"grad_norm": 0.99609375,
"learning_rate": 7.409741611704198e-05,
"loss": 5.7186,
"mean_token_accuracy": 0.15627539306879043,
"num_tokens": 6893350.0,
"step": 3555
},
{
"entropy": 6.0289897441864015,
"epoch": 0.20687450968997878,
"grad_norm": 1.03125,
"learning_rate": 7.35697291269174e-05,
"loss": 5.7174,
"mean_token_accuracy": 0.15497809946537017,
"num_tokens": 6904378.0,
"step": 3560
},
{
"entropy": 6.094915437698364,
"epoch": 0.20716506377662203,
"grad_norm": 1.0390625,
"learning_rate": 7.304756444942056e-05,
"loss": 5.821,
"mean_token_accuracy": 0.14591436162590982,
"num_tokens": 6913293.0,
"step": 3565
},
{
"entropy": 6.109770011901856,
"epoch": 0.20745561786326525,
"grad_norm": 0.98828125,
"learning_rate": 7.253093639995994e-05,
"loss": 5.6726,
"mean_token_accuracy": 0.1514420121908188,
"num_tokens": 6922950.0,
"step": 3570
},
{
"entropy": 6.070370769500732,
"epoch": 0.20774617194990846,
"grad_norm": 1.0859375,
"learning_rate": 7.20198591421544e-05,
"loss": 5.6393,
"mean_token_accuracy": 0.15796124786138535,
"num_tokens": 6931746.0,
"step": 3575
},
{
"entropy": 6.085724258422852,
"epoch": 0.2080367260365517,
"grad_norm": 1.0390625,
"learning_rate": 7.151434668744517e-05,
"loss": 5.7744,
"mean_token_accuracy": 0.1553097262978554,
"num_tokens": 6941617.0,
"step": 3580
},
{
"entropy": 6.020153665542603,
"epoch": 0.20832728012319493,
"grad_norm": 1.0703125,
"learning_rate": 7.101441289471153e-05,
"loss": 5.6818,
"mean_token_accuracy": 0.1518129140138626,
"num_tokens": 6950743.0,
"step": 3585
},
{
"entropy": 6.0454991340637205,
"epoch": 0.20861783420983815,
"grad_norm": 1.09375,
"learning_rate": 7.052007146989098e-05,
"loss": 5.7299,
"mean_token_accuracy": 0.145194461196661,
"num_tokens": 6960665.0,
"step": 3590
},
{
"entropy": 6.010702991485596,
"epoch": 0.2089083882964814,
"grad_norm": 1.0546875,
"learning_rate": 7.003133596560341e-05,
"loss": 5.6009,
"mean_token_accuracy": 0.15414568036794662,
"num_tokens": 6970316.0,
"step": 3595
},
{
"entropy": 6.042399644851685,
"epoch": 0.20919894238312461,
"grad_norm": 1.109375,
"learning_rate": 6.954821978077952e-05,
"loss": 5.6891,
"mean_token_accuracy": 0.15473626405000687,
"num_tokens": 6980116.0,
"step": 3600
},
{
"entropy": 6.05302414894104,
"epoch": 0.20948949646976786,
"grad_norm": 1.1015625,
"learning_rate": 6.907073616029356e-05,
"loss": 5.7069,
"mean_token_accuracy": 0.15373171120882034,
"num_tokens": 6989901.0,
"step": 3605
},
{
"entropy": 6.045177841186524,
"epoch": 0.20978005055641108,
"grad_norm": 1.015625,
"learning_rate": 6.85988981946002e-05,
"loss": 5.7036,
"mean_token_accuracy": 0.15624198466539382,
"num_tokens": 6999405.0,
"step": 3610
},
{
"entropy": 6.048674821853638,
"epoch": 0.2100706046430543,
"grad_norm": 0.99609375,
"learning_rate": 6.813271881937564e-05,
"loss": 5.8436,
"mean_token_accuracy": 0.14876155257225038,
"num_tokens": 7010198.0,
"step": 3615
},
{
"entropy": 6.046766996383667,
"epoch": 0.21036115872969754,
"grad_norm": 1.0390625,
"learning_rate": 6.767221081516286e-05,
"loss": 5.677,
"mean_token_accuracy": 0.16342443078756333,
"num_tokens": 7019930.0,
"step": 3620
},
{
"entropy": 6.053871488571167,
"epoch": 0.21065171281634076,
"grad_norm": 0.9765625,
"learning_rate": 6.72173868070215e-05,
"loss": 5.6483,
"mean_token_accuracy": 0.15757839530706405,
"num_tokens": 7029485.0,
"step": 3625
},
{
"entropy": 6.064899587631226,
"epoch": 0.21094226690298398,
"grad_norm": 1.0546875,
"learning_rate": 6.676825926418149e-05,
"loss": 5.7667,
"mean_token_accuracy": 0.15013407468795775,
"num_tokens": 7038891.0,
"step": 3630
},
{
"entropy": 6.028255796432495,
"epoch": 0.21123282098962723,
"grad_norm": 1.046875,
"learning_rate": 6.632484049970122e-05,
"loss": 5.6891,
"mean_token_accuracy": 0.1538076549768448,
"num_tokens": 7048477.0,
"step": 3635
},
{
"entropy": 6.019302177429199,
"epoch": 0.21152337507627045,
"grad_norm": 1.0390625,
"learning_rate": 6.588714267013019e-05,
"loss": 5.5945,
"mean_token_accuracy": 0.16957512646913528,
"num_tokens": 7057705.0,
"step": 3640
},
{
"entropy": 6.048618459701538,
"epoch": 0.21181392916291367,
"grad_norm": 1.0703125,
"learning_rate": 6.545517777517544e-05,
"loss": 5.6504,
"mean_token_accuracy": 0.1564537763595581,
"num_tokens": 7067174.0,
"step": 3645
},
{
"entropy": 6.0795738697052,
"epoch": 0.2121044832495569,
"grad_norm": 1.1953125,
"learning_rate": 6.502895765737281e-05,
"loss": 5.7639,
"mean_token_accuracy": 0.15077428221702577,
"num_tokens": 7076250.0,
"step": 3650
},
{
"entropy": 6.046699380874633,
"epoch": 0.21239503733620013,
"grad_norm": 1.125,
"learning_rate": 6.460849400176212e-05,
"loss": 5.7002,
"mean_token_accuracy": 0.1563662827014923,
"num_tokens": 7085347.0,
"step": 3655
},
{
"entropy": 5.965932989120484,
"epoch": 0.21268559142284335,
"grad_norm": 1.0234375,
"learning_rate": 6.419379833556694e-05,
"loss": 5.6007,
"mean_token_accuracy": 0.15481184422969818,
"num_tokens": 7094639.0,
"step": 3660
},
{
"entropy": 6.034517765045166,
"epoch": 0.2129761455094866,
"grad_norm": 1.1484375,
"learning_rate": 6.378488202787835e-05,
"loss": 5.6765,
"mean_token_accuracy": 0.1551177941262722,
"num_tokens": 7103819.0,
"step": 3665
},
{
"entropy": 6.1038103103637695,
"epoch": 0.21326669959612982,
"grad_norm": 1.1484375,
"learning_rate": 6.33817562893435e-05,
"loss": 5.7149,
"mean_token_accuracy": 0.15616475343704223,
"num_tokens": 7113904.0,
"step": 3670
},
{
"entropy": 6.051551055908203,
"epoch": 0.21355725368277303,
"grad_norm": 1.078125,
"learning_rate": 6.29844321718582e-05,
"loss": 5.6672,
"mean_token_accuracy": 0.15475056320428848,
"num_tokens": 7123241.0,
"step": 3675
},
{
"entropy": 5.995166349411011,
"epoch": 0.21384780776941628,
"grad_norm": 1.1171875,
"learning_rate": 6.259292056826383e-05,
"loss": 5.5651,
"mean_token_accuracy": 0.167102213203907,
"num_tokens": 7131676.0,
"step": 3680
},
{
"entropy": 6.0871337890625,
"epoch": 0.2141383618560595,
"grad_norm": 1.015625,
"learning_rate": 6.220723221204873e-05,
"loss": 5.6893,
"mean_token_accuracy": 0.15597724542021751,
"num_tokens": 7141237.0,
"step": 3685
},
{
"entropy": 6.110262441635132,
"epoch": 0.21442891594270275,
"grad_norm": 1.0546875,
"learning_rate": 6.182737767705406e-05,
"loss": 5.7844,
"mean_token_accuracy": 0.15404658690094947,
"num_tokens": 7150696.0,
"step": 3690
},
{
"entropy": 6.048623514175415,
"epoch": 0.21471947002934597,
"grad_norm": 1.0234375,
"learning_rate": 6.145336737718375e-05,
"loss": 5.6907,
"mean_token_accuracy": 0.1541660889983177,
"num_tokens": 7160444.0,
"step": 3695
},
{
"entropy": 6.030835723876953,
"epoch": 0.21501002411598918,
"grad_norm": 1.0390625,
"learning_rate": 6.10852115661191e-05,
"loss": 5.7177,
"mean_token_accuracy": 0.1561812475323677,
"num_tokens": 7170170.0,
"step": 3700
},
{
"entropy": 5.982100868225098,
"epoch": 0.21530057820263243,
"grad_norm": 1.1796875,
"learning_rate": 6.072292033703766e-05,
"loss": 5.6144,
"mean_token_accuracy": 0.15959885716438293,
"num_tokens": 7179594.0,
"step": 3705
},
{
"entropy": 5.983188962936401,
"epoch": 0.21559113228927565,
"grad_norm": 1.125,
"learning_rate": 6.036650362233648e-05,
"loss": 5.5925,
"mean_token_accuracy": 0.15840766131877898,
"num_tokens": 7189139.0,
"step": 3710
},
{
"entropy": 6.049566268920898,
"epoch": 0.21588168637591887,
"grad_norm": 1.0546875,
"learning_rate": 6.0015971193359824e-05,
"loss": 5.7009,
"mean_token_accuracy": 0.1592419296503067,
"num_tokens": 7198259.0,
"step": 3715
},
{
"entropy": 6.067971229553223,
"epoch": 0.21617224046256212,
"grad_norm": 1.0078125,
"learning_rate": 5.9671332660131306e-05,
"loss": 5.6917,
"mean_token_accuracy": 0.15316389575600625,
"num_tokens": 7208420.0,
"step": 3720
},
{
"entropy": 6.0361669063568115,
"epoch": 0.21646279454920533,
"grad_norm": 1.1171875,
"learning_rate": 5.933259747109042e-05,
"loss": 5.7355,
"mean_token_accuracy": 0.15207717418670655,
"num_tokens": 7219045.0,
"step": 3725
},
{
"entropy": 6.09136929512024,
"epoch": 0.21675334863584855,
"grad_norm": 1.0234375,
"learning_rate": 5.899977491283351e-05,
"loss": 5.7985,
"mean_token_accuracy": 0.1460676074028015,
"num_tokens": 7229337.0,
"step": 3730
},
{
"entropy": 6.045521926879883,
"epoch": 0.2170439027224918,
"grad_norm": 1.0703125,
"learning_rate": 5.867287410985908e-05,
"loss": 5.676,
"mean_token_accuracy": 0.15704198330640792,
"num_tokens": 7239217.0,
"step": 3735
},
{
"entropy": 6.094875192642212,
"epoch": 0.21733445680913502,
"grad_norm": 0.9609375,
"learning_rate": 5.835190402431779e-05,
"loss": 5.7444,
"mean_token_accuracy": 0.1542062446475029,
"num_tokens": 7249671.0,
"step": 3740
},
{
"entropy": 6.0608758449554445,
"epoch": 0.21762501089577824,
"grad_norm": 1.0625,
"learning_rate": 5.803687345576673e-05,
"loss": 5.6701,
"mean_token_accuracy": 0.16079277247190477,
"num_tokens": 7259028.0,
"step": 3745
},
{
"entropy": 6.12493200302124,
"epoch": 0.21791556498242148,
"grad_norm": 1.140625,
"learning_rate": 5.7727791040927977e-05,
"loss": 5.8492,
"mean_token_accuracy": 0.1423958010971546,
"num_tokens": 7269519.0,
"step": 3750
},
{
"entropy": 6.104196643829345,
"epoch": 0.2182061190690647,
"grad_norm": 1.0,
"learning_rate": 5.742466525345213e-05,
"loss": 5.7234,
"mean_token_accuracy": 0.15387822464108467,
"num_tokens": 7278822.0,
"step": 3755
},
{
"entropy": 6.136565637588501,
"epoch": 0.21849667315570795,
"grad_norm": 1.0859375,
"learning_rate": 5.7127504403685775e-05,
"loss": 5.7549,
"mean_token_accuracy": 0.159536774456501,
"num_tokens": 7289431.0,
"step": 3760
},
{
"entropy": 6.078890037536621,
"epoch": 0.21878722724235117,
"grad_norm": 0.9453125,
"learning_rate": 5.6836316638443664e-05,
"loss": 5.7342,
"mean_token_accuracy": 0.1513395741581917,
"num_tokens": 7299735.0,
"step": 3765
},
{
"entropy": 6.022913074493408,
"epoch": 0.2190777813289944,
"grad_norm": 1.0625,
"learning_rate": 5.655110994078553e-05,
"loss": 5.64,
"mean_token_accuracy": 0.15833066552877426,
"num_tokens": 7308798.0,
"step": 3770
},
{
"entropy": 6.052260112762451,
"epoch": 0.21936833541563763,
"grad_norm": 1.0859375,
"learning_rate": 5.6271892129797056e-05,
"loss": 5.6916,
"mean_token_accuracy": 0.1567830964922905,
"num_tokens": 7318159.0,
"step": 3775
},
{
"entropy": 5.97794771194458,
"epoch": 0.21965888950228085,
"grad_norm": 1.015625,
"learning_rate": 5.599867086037556e-05,
"loss": 5.631,
"mean_token_accuracy": 0.15575975477695464,
"num_tokens": 7328327.0,
"step": 3780
},
{
"entropy": 6.021618509292603,
"epoch": 0.21994944358892407,
"grad_norm": 1.0546875,
"learning_rate": 5.573145362302012e-05,
"loss": 5.6629,
"mean_token_accuracy": 0.15210975110530853,
"num_tokens": 7337595.0,
"step": 3785
},
{
"entropy": 6.101519775390625,
"epoch": 0.22023999767556732,
"grad_norm": 1.0546875,
"learning_rate": 5.5470247743626404e-05,
"loss": 5.8443,
"mean_token_accuracy": 0.14724614471197128,
"num_tokens": 7347049.0,
"step": 3790
},
{
"entropy": 6.0834808349609375,
"epoch": 0.22053055176221054,
"grad_norm": 1.1328125,
"learning_rate": 5.5215060383285414e-05,
"loss": 5.7311,
"mean_token_accuracy": 0.1475095644593239,
"num_tokens": 7356908.0,
"step": 3795
},
{
"entropy": 6.040810489654541,
"epoch": 0.22082110584885375,
"grad_norm": 1.046875,
"learning_rate": 5.496589853808759e-05,
"loss": 5.6475,
"mean_token_accuracy": 0.1548875778913498,
"num_tokens": 7366823.0,
"step": 3800
},
{
"entropy": 6.104894399642944,
"epoch": 0.221111659935497,
"grad_norm": 1.0859375,
"learning_rate": 5.47227690389308e-05,
"loss": 5.762,
"mean_token_accuracy": 0.1513790875673294,
"num_tokens": 7376502.0,
"step": 3805
},
{
"entropy": 6.138718318939209,
"epoch": 0.22140221402214022,
"grad_norm": 1.0234375,
"learning_rate": 5.448567855133306e-05,
"loss": 5.7874,
"mean_token_accuracy": 0.1471219077706337,
"num_tokens": 7385835.0,
"step": 3810
},
{
"entropy": 6.074849843978882,
"epoch": 0.22169276810878344,
"grad_norm": 1.0078125,
"learning_rate": 5.425463357524986e-05,
"loss": 5.668,
"mean_token_accuracy": 0.16173817664384843,
"num_tokens": 7395524.0,
"step": 3815
},
{
"entropy": 6.055670976638794,
"epoch": 0.22198332219542669,
"grad_norm": 1.0859375,
"learning_rate": 5.402964044489591e-05,
"loss": 5.6701,
"mean_token_accuracy": 0.15527116730809212,
"num_tokens": 7404724.0,
"step": 3820
},
{
"entropy": 6.029075717926025,
"epoch": 0.2222738762820699,
"grad_norm": 1.1640625,
"learning_rate": 5.381070532857153e-05,
"loss": 5.6407,
"mean_token_accuracy": 0.15602124780416488,
"num_tokens": 7413992.0,
"step": 3825
},
{
"entropy": 6.046741342544555,
"epoch": 0.22256443036871312,
"grad_norm": 1.0234375,
"learning_rate": 5.359783422849357e-05,
"loss": 5.6456,
"mean_token_accuracy": 0.15968140810728074,
"num_tokens": 7424449.0,
"step": 3830
},
{
"entropy": 6.006699466705323,
"epoch": 0.22285498445535637,
"grad_norm": 1.046875,
"learning_rate": 5.3391032980630736e-05,
"loss": 5.6126,
"mean_token_accuracy": 0.15793580561876297,
"num_tokens": 7433151.0,
"step": 3835
},
{
"entropy": 6.002806901931763,
"epoch": 0.2231455385419996,
"grad_norm": 1.03125,
"learning_rate": 5.31903072545437e-05,
"loss": 5.6538,
"mean_token_accuracy": 0.15995650440454484,
"num_tokens": 7442773.0,
"step": 3840
},
{
"entropy": 6.006146192550659,
"epoch": 0.22343609262864284,
"grad_norm": 1.1328125,
"learning_rate": 5.29956625532297e-05,
"loss": 5.6508,
"mean_token_accuracy": 0.15787655711174012,
"num_tokens": 7451316.0,
"step": 3845
},
{
"entropy": 6.016618442535401,
"epoch": 0.22372664671528605,
"grad_norm": 1.0390625,
"learning_rate": 5.280710421297146e-05,
"loss": 5.6834,
"mean_token_accuracy": 0.15794799476861954,
"num_tokens": 7459834.0,
"step": 3850
},
{
"entropy": 6.0921612739562985,
"epoch": 0.22401720080192927,
"grad_norm": 1.0234375,
"learning_rate": 5.2624637403191165e-05,
"loss": 5.8168,
"mean_token_accuracy": 0.14339097440242768,
"num_tokens": 7470885.0,
"step": 3855
},
{
"entropy": 6.014931774139404,
"epoch": 0.22430775488857252,
"grad_norm": 1.046875,
"learning_rate": 5.2448267126308605e-05,
"loss": 5.662,
"mean_token_accuracy": 0.1614661380648613,
"num_tokens": 7480045.0,
"step": 3860
},
{
"entropy": 6.109454822540283,
"epoch": 0.22459830897521574,
"grad_norm": 0.9765625,
"learning_rate": 5.2277998217603954e-05,
"loss": 5.7537,
"mean_token_accuracy": 0.1512547492980957,
"num_tokens": 7489734.0,
"step": 3865
},
{
"entropy": 6.119794225692749,
"epoch": 0.22488886306185896,
"grad_norm": 1.1015625,
"learning_rate": 5.211383534508541e-05,
"loss": 5.6229,
"mean_token_accuracy": 0.16408309638500213,
"num_tokens": 7499443.0,
"step": 3870
},
{
"entropy": 6.05659670829773,
"epoch": 0.2251794171485022,
"grad_norm": 0.99609375,
"learning_rate": 5.1955783009361044e-05,
"loss": 5.6906,
"mean_token_accuracy": 0.15638800263404845,
"num_tokens": 7509085.0,
"step": 3875
},
{
"entropy": 6.054824876785278,
"epoch": 0.22546997123514542,
"grad_norm": 1.015625,
"learning_rate": 5.180384554351543e-05,
"loss": 5.6424,
"mean_token_accuracy": 0.15496647357940674,
"num_tokens": 7519510.0,
"step": 3880
},
{
"entropy": 6.047353935241699,
"epoch": 0.22576052532178864,
"grad_norm": 1.1640625,
"learning_rate": 5.1658027112990976e-05,
"loss": 5.6524,
"mean_token_accuracy": 0.1555292695760727,
"num_tokens": 7528004.0,
"step": 3885
},
{
"entropy": 6.07807502746582,
"epoch": 0.2260510794084319,
"grad_norm": 1.078125,
"learning_rate": 5.151833171547365e-05,
"loss": 5.7679,
"mean_token_accuracy": 0.14742455780506133,
"num_tokens": 7538394.0,
"step": 3890
},
{
"entropy": 6.009245443344116,
"epoch": 0.2263416334950751,
"grad_norm": 1.078125,
"learning_rate": 5.1384763180783274e-05,
"loss": 5.6188,
"mean_token_accuracy": 0.1577170416712761,
"num_tokens": 7547998.0,
"step": 3895
},
{
"entropy": 6.032008123397827,
"epoch": 0.22663218758171833,
"grad_norm": 1.0625,
"learning_rate": 5.125732517076876e-05,
"loss": 5.7206,
"mean_token_accuracy": 0.15503439009189607,
"num_tokens": 7557914.0,
"step": 3900
},
{
"entropy": 6.0673810005187985,
"epoch": 0.22692274166836157,
"grad_norm": 1.015625,
"learning_rate": 5.113602117920747e-05,
"loss": 5.7497,
"mean_token_accuracy": 0.14790019690990447,
"num_tokens": 7567606.0,
"step": 3905
},
{
"entropy": 6.01710991859436,
"epoch": 0.2272132957550048,
"grad_norm": 1.0078125,
"learning_rate": 5.102085453170966e-05,
"loss": 5.6722,
"mean_token_accuracy": 0.1605480507016182,
"num_tokens": 7576774.0,
"step": 3910
},
{
"entropy": 6.016513299942017,
"epoch": 0.227503849841648,
"grad_norm": 1.59375,
"learning_rate": 5.091182838562709e-05,
"loss": 5.7362,
"mean_token_accuracy": 0.1543383590877056,
"num_tokens": 7586490.0,
"step": 3915
},
{
"entropy": 6.041366291046143,
"epoch": 0.22779440392829126,
"grad_norm": 1.2265625,
"learning_rate": 5.0808945729966716e-05,
"loss": 5.62,
"mean_token_accuracy": 0.16046071499586106,
"num_tokens": 7595527.0,
"step": 3920
},
{
"entropy": 6.037348937988281,
"epoch": 0.22808495801493447,
"grad_norm": 1.015625,
"learning_rate": 5.071220938530844e-05,
"loss": 5.6927,
"mean_token_accuracy": 0.15463001430034637,
"num_tokens": 7605829.0,
"step": 3925
},
{
"entropy": 6.089096164703369,
"epoch": 0.22837551210157772,
"grad_norm": 1.046875,
"learning_rate": 5.0621622003728026e-05,
"loss": 5.7329,
"mean_token_accuracy": 0.1547334223985672,
"num_tokens": 7615363.0,
"step": 3930
},
{
"entropy": 6.023686742782592,
"epoch": 0.22866606618822094,
"grad_norm": 1.1640625,
"learning_rate": 5.053718606872433e-05,
"loss": 5.6753,
"mean_token_accuracy": 0.15325114876031876,
"num_tokens": 7624752.0,
"step": 3935
},
{
"entropy": 6.073769903182983,
"epoch": 0.22895662027486416,
"grad_norm": 1.0234375,
"learning_rate": 5.0458903895151134e-05,
"loss": 5.714,
"mean_token_accuracy": 0.15586349815130235,
"num_tokens": 7633958.0,
"step": 3940
},
{
"entropy": 6.045828437805175,
"epoch": 0.2292471743615074,
"grad_norm": 1.0390625,
"learning_rate": 5.038677762915381e-05,
"loss": 5.6356,
"mean_token_accuracy": 0.16323170363903045,
"num_tokens": 7643278.0,
"step": 3945
},
{
"entropy": 6.108937072753906,
"epoch": 0.22953772844815062,
"grad_norm": 0.99609375,
"learning_rate": 5.032080924811033e-05,
"loss": 5.7243,
"mean_token_accuracy": 0.15668186247348787,
"num_tokens": 7653378.0,
"step": 3950
},
{
"entropy": 6.043759202957153,
"epoch": 0.22982828253479384,
"grad_norm": 1.1015625,
"learning_rate": 5.026100056057718e-05,
"loss": 5.6752,
"mean_token_accuracy": 0.16054976135492324,
"num_tokens": 7663218.0,
"step": 3955
},
{
"entropy": 6.083030700683594,
"epoch": 0.2301188366214371,
"grad_norm": 1.0703125,
"learning_rate": 5.0207353206239764e-05,
"loss": 5.6591,
"mean_token_accuracy": 0.15775054395198823,
"num_tokens": 7672578.0,
"step": 3960
},
{
"entropy": 6.10399055480957,
"epoch": 0.2304093907080803,
"grad_norm": 1.09375,
"learning_rate": 5.0159868655867436e-05,
"loss": 5.7522,
"mean_token_accuracy": 0.1491328150033951,
"num_tokens": 7682335.0,
"step": 3965
},
{
"entropy": 6.117215824127197,
"epoch": 0.23069994479472353,
"grad_norm": 1.015625,
"learning_rate": 5.011854821127305e-05,
"loss": 5.7766,
"mean_token_accuracy": 0.15085969120264053,
"num_tokens": 7692363.0,
"step": 3970
},
{
"entropy": 6.1109333515167235,
"epoch": 0.23099049888136677,
"grad_norm": 1.125,
"learning_rate": 5.008339300527755e-05,
"loss": 5.6969,
"mean_token_accuracy": 0.16164307296276093,
"num_tokens": 7701880.0,
"step": 3975
},
{
"entropy": 6.090880966186523,
"epoch": 0.23128105296801,
"grad_norm": 1.0,
"learning_rate": 5.005440400167859e-05,
"loss": 5.722,
"mean_token_accuracy": 0.15293850004673004,
"num_tokens": 7711203.0,
"step": 3980
},
{
"entropy": 6.051671552658081,
"epoch": 0.2315716070546532,
"grad_norm": 1.1171875,
"learning_rate": 5.003158199522442e-05,
"loss": 5.6291,
"mean_token_accuracy": 0.15574965327978135,
"num_tokens": 7720154.0,
"step": 3985
},
{
"entropy": 6.046511125564575,
"epoch": 0.23186216114129646,
"grad_norm": 0.94140625,
"learning_rate": 5.0014927611591806e-05,
"loss": 5.6926,
"mean_token_accuracy": 0.15550984293222428,
"num_tokens": 7730469.0,
"step": 3990
},
{
"entropy": 6.068615293502807,
"epoch": 0.23215271522793968,
"grad_norm": 1.046875,
"learning_rate": 5.000444130736916e-05,
"loss": 5.6147,
"mean_token_accuracy": 0.158310903608799,
"num_tokens": 7739190.0,
"step": 3995
},
{
"entropy": 6.060236883163452,
"epoch": 0.23244326931458292,
"grad_norm": 0.9765625,
"learning_rate": 5.0000123370043736e-05,
"loss": 5.7513,
"mean_token_accuracy": 0.15361028015613556,
"num_tokens": 7749106.0,
"step": 4000
}
],
"logging_steps": 5,
"max_steps": 4000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1686628301045760.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}