Files
P2-split3_prob_Qwen3-8B-Bas…/trainer_state.json

7694 lines
217 KiB
JSON
Raw Permalink Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 765,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.58074951171875,
"epoch": 0.00392156862745098,
"grad_norm": 5.946658172831002,
"learning_rate": 0.0,
"loss": 1.4047,
"mean_token_accuracy": 0.6541401473805308,
"num_tokens": 394099.0,
"step": 1
},
{
"entropy": 0.57122802734375,
"epoch": 0.00784313725490196,
"grad_norm": 5.621097040674523,
"learning_rate": 5.128205128205128e-07,
"loss": 1.3784,
"mean_token_accuracy": 0.6572609562426805,
"num_tokens": 836994.0,
"step": 2
},
{
"entropy": 0.568328857421875,
"epoch": 0.011764705882352941,
"grad_norm": 5.89101271491728,
"learning_rate": 1.0256410256410257e-06,
"loss": 1.3794,
"mean_token_accuracy": 0.6572064198553562,
"num_tokens": 1266293.0,
"step": 3
},
{
"entropy": 0.559295654296875,
"epoch": 0.01568627450980392,
"grad_norm": 5.7409378697761975,
"learning_rate": 1.5384615384615387e-06,
"loss": 1.4025,
"mean_token_accuracy": 0.6549294730648398,
"num_tokens": 1704596.0,
"step": 4
},
{
"entropy": 0.5743408203125,
"epoch": 0.0196078431372549,
"grad_norm": 5.80997969854973,
"learning_rate": 2.0512820512820513e-06,
"loss": 1.3921,
"mean_token_accuracy": 0.6552458582445979,
"num_tokens": 2124104.0,
"step": 5
},
{
"entropy": 0.5599365234375,
"epoch": 0.023529411764705882,
"grad_norm": 5.314201402415645,
"learning_rate": 2.564102564102564e-06,
"loss": 1.3713,
"mean_token_accuracy": 0.6574546648189425,
"num_tokens": 2587214.0,
"step": 6
},
{
"entropy": 0.565338134765625,
"epoch": 0.027450980392156862,
"grad_norm": 5.22023313812165,
"learning_rate": 3.0769230769230774e-06,
"loss": 1.3643,
"mean_token_accuracy": 0.6583243450149894,
"num_tokens": 3036984.0,
"step": 7
},
{
"entropy": 0.565338134765625,
"epoch": 0.03137254901960784,
"grad_norm": 4.414498696886829,
"learning_rate": 3.58974358974359e-06,
"loss": 1.2991,
"mean_token_accuracy": 0.671512926928699,
"num_tokens": 3486417.0,
"step": 8
},
{
"entropy": 0.557159423828125,
"epoch": 0.03529411764705882,
"grad_norm": 4.436540831930959,
"learning_rate": 4.102564102564103e-06,
"loss": 1.2795,
"mean_token_accuracy": 0.670446545816958,
"num_tokens": 3929660.0,
"step": 9
},
{
"entropy": 0.591888427734375,
"epoch": 0.0392156862745098,
"grad_norm": 3.712979430380243,
"learning_rate": 4.615384615384616e-06,
"loss": 1.1742,
"mean_token_accuracy": 0.6933586550876498,
"num_tokens": 4322786.0,
"step": 10
},
{
"entropy": 0.5743408203125,
"epoch": 0.043137254901960784,
"grad_norm": 3.424125055764591,
"learning_rate": 5.128205128205128e-06,
"loss": 1.1474,
"mean_token_accuracy": 0.6944275395944715,
"num_tokens": 4734477.0,
"step": 11
},
{
"entropy": 0.555755615234375,
"epoch": 0.047058823529411764,
"grad_norm": 3.226923962462494,
"learning_rate": 5.641025641025641e-06,
"loss": 1.1461,
"mean_token_accuracy": 0.6922548627480865,
"num_tokens": 5179678.0,
"step": 12
},
{
"entropy": 0.535308837890625,
"epoch": 0.050980392156862744,
"grad_norm": 4.36070110135058,
"learning_rate": 6.153846153846155e-06,
"loss": 1.0418,
"mean_token_accuracy": 0.7161724548786879,
"num_tokens": 5610766.0,
"step": 13
},
{
"entropy": 0.535797119140625,
"epoch": 0.054901960784313725,
"grad_norm": 4.551396130956159,
"learning_rate": 6.666666666666667e-06,
"loss": 1.0402,
"mean_token_accuracy": 0.7155537949874997,
"num_tokens": 6047549.0,
"step": 14
},
{
"entropy": 0.55712890625,
"epoch": 0.058823529411764705,
"grad_norm": 3.8573225106802878,
"learning_rate": 7.17948717948718e-06,
"loss": 0.9984,
"mean_token_accuracy": 0.7221314841881394,
"num_tokens": 6462160.0,
"step": 15
},
{
"entropy": 0.533477783203125,
"epoch": 0.06274509803921569,
"grad_norm": 3.288341958711061,
"learning_rate": 7.692307692307694e-06,
"loss": 0.9651,
"mean_token_accuracy": 0.7295441031455994,
"num_tokens": 6903680.0,
"step": 16
},
{
"entropy": 0.540863037109375,
"epoch": 0.06666666666666667,
"grad_norm": 2.7586452982074854,
"learning_rate": 8.205128205128205e-06,
"loss": 0.9166,
"mean_token_accuracy": 0.7403897074982524,
"num_tokens": 7336790.0,
"step": 17
},
{
"entropy": 0.542144775390625,
"epoch": 0.07058823529411765,
"grad_norm": 3.9060092823437444,
"learning_rate": 8.717948717948719e-06,
"loss": 0.9228,
"mean_token_accuracy": 0.7334894333034754,
"num_tokens": 7778939.0,
"step": 18
},
{
"entropy": 0.54852294921875,
"epoch": 0.07450980392156863,
"grad_norm": 3.6701854828060054,
"learning_rate": 9.230769230769232e-06,
"loss": 0.8874,
"mean_token_accuracy": 0.7458300339058042,
"num_tokens": 8178997.0,
"step": 19
},
{
"entropy": 0.542633056640625,
"epoch": 0.0784313725490196,
"grad_norm": 2.959831958866965,
"learning_rate": 9.743589743589744e-06,
"loss": 0.8613,
"mean_token_accuracy": 0.7516397852450609,
"num_tokens": 8590073.0,
"step": 20
},
{
"entropy": 0.53009033203125,
"epoch": 0.08235294117647059,
"grad_norm": 2.5206963996671266,
"learning_rate": 1.0256410256410256e-05,
"loss": 0.8448,
"mean_token_accuracy": 0.7533183237537742,
"num_tokens": 9023810.0,
"step": 21
},
{
"entropy": 0.545440673828125,
"epoch": 0.08627450980392157,
"grad_norm": 2.6387540158464966,
"learning_rate": 1.076923076923077e-05,
"loss": 0.8468,
"mean_token_accuracy": 0.7532298862934113,
"num_tokens": 9454262.0,
"step": 22
},
{
"entropy": 0.530426025390625,
"epoch": 0.09019607843137255,
"grad_norm": 2.2953463674608385,
"learning_rate": 1.1282051282051283e-05,
"loss": 0.8381,
"mean_token_accuracy": 0.7586212726309896,
"num_tokens": 9903204.0,
"step": 23
},
{
"entropy": 0.530609130859375,
"epoch": 0.09411764705882353,
"grad_norm": 2.233239119821126,
"learning_rate": 1.1794871794871796e-05,
"loss": 0.7955,
"mean_token_accuracy": 0.7648084424436092,
"num_tokens": 10332966.0,
"step": 24
},
{
"entropy": 0.549896240234375,
"epoch": 0.09803921568627451,
"grad_norm": 2.298568482502839,
"learning_rate": 1.230769230769231e-05,
"loss": 0.7833,
"mean_token_accuracy": 0.7674612868577242,
"num_tokens": 10758034.0,
"step": 25
},
{
"entropy": 0.532623291015625,
"epoch": 0.10196078431372549,
"grad_norm": 2.1793696589064484,
"learning_rate": 1.2820512820512823e-05,
"loss": 0.7821,
"mean_token_accuracy": 0.7678138092160225,
"num_tokens": 11188541.0,
"step": 26
},
{
"entropy": 0.5162353515625,
"epoch": 0.10588235294117647,
"grad_norm": 1.8946351853054721,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.7349,
"mean_token_accuracy": 0.7782322531566024,
"num_tokens": 11604712.0,
"step": 27
},
{
"entropy": 0.52069091796875,
"epoch": 0.10980392156862745,
"grad_norm": 1.773459568207231,
"learning_rate": 1.3846153846153847e-05,
"loss": 0.7433,
"mean_token_accuracy": 0.7758480096235871,
"num_tokens": 12040052.0,
"step": 28
},
{
"entropy": 0.514068603515625,
"epoch": 0.11372549019607843,
"grad_norm": 1.721966950909518,
"learning_rate": 1.435897435897436e-05,
"loss": 0.7389,
"mean_token_accuracy": 0.7765677766874433,
"num_tokens": 12466998.0,
"step": 29
},
{
"entropy": 0.512847900390625,
"epoch": 0.11764705882352941,
"grad_norm": 2.021391596891171,
"learning_rate": 1.4871794871794874e-05,
"loss": 0.7247,
"mean_token_accuracy": 0.7790881004184484,
"num_tokens": 12901157.0,
"step": 30
},
{
"entropy": 0.517608642578125,
"epoch": 0.12156862745098039,
"grad_norm": 1.9325766267617275,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.7287,
"mean_token_accuracy": 0.7784575093537569,
"num_tokens": 13338766.0,
"step": 31
},
{
"entropy": 0.5120849609375,
"epoch": 0.12549019607843137,
"grad_norm": 1.9477974950463575,
"learning_rate": 1.5897435897435897e-05,
"loss": 0.6998,
"mean_token_accuracy": 0.7857342725619674,
"num_tokens": 13766779.0,
"step": 32
},
{
"entropy": 0.519744873046875,
"epoch": 0.12941176470588237,
"grad_norm": 2.348720028802442,
"learning_rate": 1.641025641025641e-05,
"loss": 0.6994,
"mean_token_accuracy": 0.7838903805240989,
"num_tokens": 14181903.0,
"step": 33
},
{
"entropy": 0.501556396484375,
"epoch": 0.13333333333333333,
"grad_norm": 1.693617785569785,
"learning_rate": 1.6923076923076924e-05,
"loss": 0.7034,
"mean_token_accuracy": 0.7820147024467587,
"num_tokens": 14621503.0,
"step": 34
},
{
"entropy": 0.50091552734375,
"epoch": 0.13725490196078433,
"grad_norm": 1.8273930967809657,
"learning_rate": 1.7435897435897438e-05,
"loss": 0.693,
"mean_token_accuracy": 0.7863769382238388,
"num_tokens": 15047501.0,
"step": 35
},
{
"entropy": 0.506439208984375,
"epoch": 0.1411764705882353,
"grad_norm": 1.7334306818931762,
"learning_rate": 1.794871794871795e-05,
"loss": 0.6676,
"mean_token_accuracy": 0.789363824762404,
"num_tokens": 15469986.0,
"step": 36
},
{
"entropy": 0.50177001953125,
"epoch": 0.1450980392156863,
"grad_norm": 1.9378805916268533,
"learning_rate": 1.8461538461538465e-05,
"loss": 0.6858,
"mean_token_accuracy": 0.7883838685229421,
"num_tokens": 15911435.0,
"step": 37
},
{
"entropy": 0.502349853515625,
"epoch": 0.14901960784313725,
"grad_norm": 1.7939770238739534,
"learning_rate": 1.8974358974358975e-05,
"loss": 0.6574,
"mean_token_accuracy": 0.7959268698468804,
"num_tokens": 16329945.0,
"step": 38
},
{
"entropy": 0.50067138671875,
"epoch": 0.15294117647058825,
"grad_norm": 1.7417461015153004,
"learning_rate": 1.9487179487179488e-05,
"loss": 0.6562,
"mean_token_accuracy": 0.7929449649527669,
"num_tokens": 16759988.0,
"step": 39
},
{
"entropy": 0.504241943359375,
"epoch": 0.1568627450980392,
"grad_norm": 1.7893435723544215,
"learning_rate": 2e-05,
"loss": 0.6543,
"mean_token_accuracy": 0.7933545038104057,
"num_tokens": 17192249.0,
"step": 40
},
{
"entropy": 0.497528076171875,
"epoch": 0.1607843137254902,
"grad_norm": 1.6271906758881933,
"learning_rate": 1.9999906374137693e-05,
"loss": 0.667,
"mean_token_accuracy": 0.7928741071373224,
"num_tokens": 17635747.0,
"step": 41
},
{
"entropy": 0.499053955078125,
"epoch": 0.16470588235294117,
"grad_norm": 1.5398454598257563,
"learning_rate": 1.9999625498303936e-05,
"loss": 0.6406,
"mean_token_accuracy": 0.7975211972370744,
"num_tokens": 18069109.0,
"step": 42
},
{
"entropy": 0.494903564453125,
"epoch": 0.16862745098039217,
"grad_norm": 1.5523620609569904,
"learning_rate": 1.999915737775817e-05,
"loss": 0.661,
"mean_token_accuracy": 0.791739515028894,
"num_tokens": 18529022.0,
"step": 43
},
{
"entropy": 0.5147705078125,
"epoch": 0.17254901960784313,
"grad_norm": 1.7516517318464624,
"learning_rate": 1.999850202126604e-05,
"loss": 0.6412,
"mean_token_accuracy": 0.7992083700373769,
"num_tokens": 18950266.0,
"step": 44
},
{
"entropy": 0.493988037109375,
"epoch": 0.17647058823529413,
"grad_norm": 1.443212801622469,
"learning_rate": 1.9997659441099205e-05,
"loss": 0.6292,
"mean_token_accuracy": 0.8008020855486393,
"num_tokens": 19382389.0,
"step": 45
},
{
"entropy": 0.493621826171875,
"epoch": 0.1803921568627451,
"grad_norm": 1.8120426269144319,
"learning_rate": 1.9996629653035128e-05,
"loss": 0.6301,
"mean_token_accuracy": 0.7986322436481714,
"num_tokens": 19814774.0,
"step": 46
},
{
"entropy": 0.483428955078125,
"epoch": 0.1843137254901961,
"grad_norm": 1.504152484077655,
"learning_rate": 1.999541267635676e-05,
"loss": 0.6146,
"mean_token_accuracy": 0.8071837406605482,
"num_tokens": 20252726.0,
"step": 47
},
{
"entropy": 0.48736572265625,
"epoch": 0.18823529411764706,
"grad_norm": 1.5994176249090368,
"learning_rate": 1.999400853385221e-05,
"loss": 0.6079,
"mean_token_accuracy": 0.8068799478933215,
"num_tokens": 20693231.0,
"step": 48
},
{
"entropy": 0.503387451171875,
"epoch": 0.19215686274509805,
"grad_norm": 1.6595523586084884,
"learning_rate": 1.999241725181428e-05,
"loss": 0.622,
"mean_token_accuracy": 0.8015562687069178,
"num_tokens": 21098332.0,
"step": 49
},
{
"entropy": 0.4970703125,
"epoch": 0.19607843137254902,
"grad_norm": 1.7649764098243288,
"learning_rate": 1.9990638860040007e-05,
"loss": 0.6198,
"mean_token_accuracy": 0.8020276734605432,
"num_tokens": 21516995.0,
"step": 50
},
{
"entropy": 0.49176025390625,
"epoch": 0.2,
"grad_norm": 1.5530289762258092,
"learning_rate": 1.9988673391830082e-05,
"loss": 0.6094,
"mean_token_accuracy": 0.805065156891942,
"num_tokens": 21940900.0,
"step": 51
},
{
"entropy": 0.486602783203125,
"epoch": 0.20392156862745098,
"grad_norm": 1.5600492363872476,
"learning_rate": 1.9986520883988233e-05,
"loss": 0.6072,
"mean_token_accuracy": 0.8033135803416371,
"num_tokens": 22373270.0,
"step": 52
},
{
"entropy": 0.49688720703125,
"epoch": 0.20784313725490197,
"grad_norm": 1.3244940248118993,
"learning_rate": 1.9984181376820542e-05,
"loss": 0.6056,
"mean_token_accuracy": 0.804581237025559,
"num_tokens": 22795686.0,
"step": 53
},
{
"entropy": 0.477386474609375,
"epoch": 0.21176470588235294,
"grad_norm": 1.4188422513941414,
"learning_rate": 1.9981654914134684e-05,
"loss": 0.613,
"mean_token_accuracy": 0.8030998045578599,
"num_tokens": 23237348.0,
"step": 54
},
{
"entropy": 0.491363525390625,
"epoch": 0.21568627450980393,
"grad_norm": 1.3877977869185212,
"learning_rate": 1.997894154323911e-05,
"loss": 0.5934,
"mean_token_accuracy": 0.8087702514603734,
"num_tokens": 23658608.0,
"step": 55
},
{
"entropy": 0.485931396484375,
"epoch": 0.2196078431372549,
"grad_norm": 1.3154982477831079,
"learning_rate": 1.9976041314942156e-05,
"loss": 0.5871,
"mean_token_accuracy": 0.810174492187798,
"num_tokens": 24088562.0,
"step": 56
},
{
"entropy": 0.4814453125,
"epoch": 0.2235294117647059,
"grad_norm": 1.4183101521509853,
"learning_rate": 1.99729542835511e-05,
"loss": 0.61,
"mean_token_accuracy": 0.8048477824777365,
"num_tokens": 24525442.0,
"step": 57
},
{
"entropy": 0.48516845703125,
"epoch": 0.22745098039215686,
"grad_norm": 1.3527504787456999,
"learning_rate": 1.9969680506871138e-05,
"loss": 0.5784,
"mean_token_accuracy": 0.8116851877421141,
"num_tokens": 24950326.0,
"step": 58
},
{
"entropy": 0.482147216796875,
"epoch": 0.23137254901960785,
"grad_norm": 1.4718082614826202,
"learning_rate": 1.9966220046204295e-05,
"loss": 0.5971,
"mean_token_accuracy": 0.8080776166170835,
"num_tokens": 25389519.0,
"step": 59
},
{
"entropy": 0.48907470703125,
"epoch": 0.23529411764705882,
"grad_norm": 1.4367294679131557,
"learning_rate": 1.99625729663483e-05,
"loss": 0.599,
"mean_token_accuracy": 0.8051031418144703,
"num_tokens": 25815656.0,
"step": 60
},
{
"entropy": 0.485626220703125,
"epoch": 0.23921568627450981,
"grad_norm": 1.5409462289066693,
"learning_rate": 1.995873933559535e-05,
"loss": 0.5969,
"mean_token_accuracy": 0.8079076996073127,
"num_tokens": 26249330.0,
"step": 61
},
{
"entropy": 0.47406005859375,
"epoch": 0.24313725490196078,
"grad_norm": 1.2353178316165938,
"learning_rate": 1.9954719225730847e-05,
"loss": 0.5713,
"mean_token_accuracy": 0.8120078714564443,
"num_tokens": 26683828.0,
"step": 62
},
{
"entropy": 0.474151611328125,
"epoch": 0.24705882352941178,
"grad_norm": 1.2967444220306512,
"learning_rate": 1.9950512712032038e-05,
"loss": 0.5836,
"mean_token_accuracy": 0.810034915804863,
"num_tokens": 27116083.0,
"step": 63
},
{
"entropy": 0.474334716796875,
"epoch": 0.25098039215686274,
"grad_norm": 1.3949569129085198,
"learning_rate": 1.9946119873266615e-05,
"loss": 0.5853,
"mean_token_accuracy": 0.8108502132818103,
"num_tokens": 27564582.0,
"step": 64
},
{
"entropy": 0.477752685546875,
"epoch": 0.2549019607843137,
"grad_norm": 1.1933833335696828,
"learning_rate": 1.9941540791691245e-05,
"loss": 0.5797,
"mean_token_accuracy": 0.8111855685710907,
"num_tokens": 28010664.0,
"step": 65
},
{
"entropy": 0.485443115234375,
"epoch": 0.25882352941176473,
"grad_norm": 1.263992447554956,
"learning_rate": 1.9936775553050017e-05,
"loss": 0.5616,
"mean_token_accuracy": 0.8182382667437196,
"num_tokens": 28423217.0,
"step": 66
},
{
"entropy": 0.474945068359375,
"epoch": 0.2627450980392157,
"grad_norm": 1.327268023915043,
"learning_rate": 1.993182424657285e-05,
"loss": 0.5833,
"mean_token_accuracy": 0.8107379814609885,
"num_tokens": 28859600.0,
"step": 67
},
{
"entropy": 0.47674560546875,
"epoch": 0.26666666666666666,
"grad_norm": 1.424999013681543,
"learning_rate": 1.9926686964973813e-05,
"loss": 0.5725,
"mean_token_accuracy": 0.8136901557445526,
"num_tokens": 29274182.0,
"step": 68
},
{
"entropy": 0.480072021484375,
"epoch": 0.27058823529411763,
"grad_norm": 1.418343251105222,
"learning_rate": 1.9921363804449383e-05,
"loss": 0.5875,
"mean_token_accuracy": 0.8093108516186476,
"num_tokens": 29705582.0,
"step": 69
},
{
"entropy": 0.47283935546875,
"epoch": 0.27450980392156865,
"grad_norm": 1.4521724256504855,
"learning_rate": 1.9915854864676665e-05,
"loss": 0.5891,
"mean_token_accuracy": 0.8094787122681737,
"num_tokens": 30137961.0,
"step": 70
},
{
"entropy": 0.466949462890625,
"epoch": 0.2784313725490196,
"grad_norm": 1.2792010927676387,
"learning_rate": 1.9910160248811502e-05,
"loss": 0.5723,
"mean_token_accuracy": 0.8133273124694824,
"num_tokens": 30574142.0,
"step": 71
},
{
"entropy": 0.474884033203125,
"epoch": 0.2823529411764706,
"grad_norm": 1.6473953355135174,
"learning_rate": 1.9904280063486563e-05,
"loss": 0.5673,
"mean_token_accuracy": 0.8135095341131091,
"num_tokens": 31007161.0,
"step": 72
},
{
"entropy": 0.480682373046875,
"epoch": 0.28627450980392155,
"grad_norm": 1.4162141272460542,
"learning_rate": 1.989821441880933e-05,
"loss": 0.5749,
"mean_token_accuracy": 0.8136226320639253,
"num_tokens": 31434909.0,
"step": 73
},
{
"entropy": 0.4854736328125,
"epoch": 0.2901960784313726,
"grad_norm": 1.4648510763716496,
"learning_rate": 1.9891963428360043e-05,
"loss": 0.5757,
"mean_token_accuracy": 0.8133104396983981,
"num_tokens": 31858663.0,
"step": 74
},
{
"entropy": 0.486297607421875,
"epoch": 0.29411764705882354,
"grad_norm": 1.3901996305935094,
"learning_rate": 1.9885527209189577e-05,
"loss": 0.5591,
"mean_token_accuracy": 0.815345604903996,
"num_tokens": 32263979.0,
"step": 75
},
{
"entropy": 0.474151611328125,
"epoch": 0.2980392156862745,
"grad_norm": 1.416507682950556,
"learning_rate": 1.9878905881817254e-05,
"loss": 0.5565,
"mean_token_accuracy": 0.8163448050618172,
"num_tokens": 32681289.0,
"step": 76
},
{
"entropy": 0.46917724609375,
"epoch": 0.30196078431372547,
"grad_norm": 1.1900128401920054,
"learning_rate": 1.9872099570228556e-05,
"loss": 0.5545,
"mean_token_accuracy": 0.819454850628972,
"num_tokens": 33117944.0,
"step": 77
},
{
"entropy": 0.469085693359375,
"epoch": 0.3058823529411765,
"grad_norm": 1.273205865013161,
"learning_rate": 1.9865108401872856e-05,
"loss": 0.5757,
"mean_token_accuracy": 0.8083239402621984,
"num_tokens": 33569669.0,
"step": 78
},
{
"entropy": 0.469879150390625,
"epoch": 0.30980392156862746,
"grad_norm": 1.3425586659856636,
"learning_rate": 1.9857932507660983e-05,
"loss": 0.5601,
"mean_token_accuracy": 0.8155460730195045,
"num_tokens": 34000102.0,
"step": 79
},
{
"entropy": 0.469207763671875,
"epoch": 0.3137254901960784,
"grad_norm": 1.1089356147226235,
"learning_rate": 1.9850572021962788e-05,
"loss": 0.5536,
"mean_token_accuracy": 0.8189804637804627,
"num_tokens": 34433110.0,
"step": 80
},
{
"entropy": 0.474700927734375,
"epoch": 0.3176470588235294,
"grad_norm": 1.2801670983454294,
"learning_rate": 1.984302708260464e-05,
"loss": 0.5681,
"mean_token_accuracy": 0.8136863615363836,
"num_tokens": 34884713.0,
"step": 81
},
{
"entropy": 0.470916748046875,
"epoch": 0.3215686274509804,
"grad_norm": 1.168115984160054,
"learning_rate": 1.9835297830866827e-05,
"loss": 0.5551,
"mean_token_accuracy": 0.8168244622647762,
"num_tokens": 35326962.0,
"step": 82
},
{
"entropy": 0.480133056640625,
"epoch": 0.3254901960784314,
"grad_norm": 1.3647458163316752,
"learning_rate": 1.9827384411480924e-05,
"loss": 0.5445,
"mean_token_accuracy": 0.8184509659186006,
"num_tokens": 35734884.0,
"step": 83
},
{
"entropy": 0.479736328125,
"epoch": 0.32941176470588235,
"grad_norm": 1.319678355325207,
"learning_rate": 1.9819286972627066e-05,
"loss": 0.5498,
"mean_token_accuracy": 0.8164876466616988,
"num_tokens": 36156751.0,
"step": 84
},
{
"entropy": 0.475982666015625,
"epoch": 0.3333333333333333,
"grad_norm": 1.2319398960631225,
"learning_rate": 1.9811005665931205e-05,
"loss": 0.5606,
"mean_token_accuracy": 0.8145118253305554,
"num_tokens": 36585317.0,
"step": 85
},
{
"entropy": 0.4764404296875,
"epoch": 0.33725490196078434,
"grad_norm": 1.2931356300554002,
"learning_rate": 1.980254064646223e-05,
"loss": 0.5549,
"mean_token_accuracy": 0.8165107127279043,
"num_tokens": 37018661.0,
"step": 86
},
{
"entropy": 0.4676513671875,
"epoch": 0.3411764705882353,
"grad_norm": 1.2256666633113913,
"learning_rate": 1.9793892072729087e-05,
"loss": 0.5453,
"mean_token_accuracy": 0.8220384856685996,
"num_tokens": 37444396.0,
"step": 87
},
{
"entropy": 0.464141845703125,
"epoch": 0.34509803921568627,
"grad_norm": 1.37296362447334,
"learning_rate": 1.9785060106677818e-05,
"loss": 0.5549,
"mean_token_accuracy": 0.8157011214643717,
"num_tokens": 37867417.0,
"step": 88
},
{
"entropy": 0.465057373046875,
"epoch": 0.34901960784313724,
"grad_norm": 1.3470877230759366,
"learning_rate": 1.9776044913688503e-05,
"loss": 0.5546,
"mean_token_accuracy": 0.817422934807837,
"num_tokens": 38323018.0,
"step": 89
},
{
"entropy": 0.467376708984375,
"epoch": 0.35294117647058826,
"grad_norm": 1.2559034842680823,
"learning_rate": 1.976684666257219e-05,
"loss": 0.5691,
"mean_token_accuracy": 0.8121252795681357,
"num_tokens": 38780992.0,
"step": 90
},
{
"entropy": 0.47125244140625,
"epoch": 0.3568627450980392,
"grad_norm": 1.2755650974437212,
"learning_rate": 1.975746552556772e-05,
"loss": 0.544,
"mean_token_accuracy": 0.8212292147800326,
"num_tokens": 39200876.0,
"step": 91
},
{
"entropy": 0.467437744140625,
"epoch": 0.3607843137254902,
"grad_norm": 1.0816461876931776,
"learning_rate": 1.9747901678338496e-05,
"loss": 0.5494,
"mean_token_accuracy": 0.8188506988808513,
"num_tokens": 39637981.0,
"step": 92
},
{
"entropy": 0.4659423828125,
"epoch": 0.36470588235294116,
"grad_norm": 1.0361403686428494,
"learning_rate": 1.9738155299969207e-05,
"loss": 0.5368,
"mean_token_accuracy": 0.8218570798635483,
"num_tokens": 40075528.0,
"step": 93
},
{
"entropy": 0.469573974609375,
"epoch": 0.3686274509803922,
"grad_norm": 1.2736306848315126,
"learning_rate": 1.9728226572962474e-05,
"loss": 0.5592,
"mean_token_accuracy": 0.818149627186358,
"num_tokens": 40513207.0,
"step": 94
},
{
"entropy": 0.4595947265625,
"epoch": 0.37254901960784315,
"grad_norm": 1.1806358194501028,
"learning_rate": 1.9718115683235418e-05,
"loss": 0.5355,
"mean_token_accuracy": 0.8224614998325706,
"num_tokens": 40936903.0,
"step": 95
},
{
"entropy": 0.46954345703125,
"epoch": 0.3764705882352941,
"grad_norm": 1.2155743490435709,
"learning_rate": 1.9707822820116193e-05,
"loss": 0.5347,
"mean_token_accuracy": 0.8223023796454072,
"num_tokens": 41351554.0,
"step": 96
},
{
"entropy": 0.45892333984375,
"epoch": 0.3803921568627451,
"grad_norm": 1.2902102201493495,
"learning_rate": 1.9697348176340442e-05,
"loss": 0.5605,
"mean_token_accuracy": 0.8165110582485795,
"num_tokens": 41818184.0,
"step": 97
},
{
"entropy": 0.458892822265625,
"epoch": 0.3843137254901961,
"grad_norm": 1.1739046775651587,
"learning_rate": 1.9686691948047665e-05,
"loss": 0.5268,
"mean_token_accuracy": 0.8239621166139841,
"num_tokens": 42253116.0,
"step": 98
},
{
"entropy": 0.4661865234375,
"epoch": 0.38823529411764707,
"grad_norm": 1.1935459708374951,
"learning_rate": 1.9675854334777585e-05,
"loss": 0.5605,
"mean_token_accuracy": 0.8175614606589079,
"num_tokens": 42700051.0,
"step": 99
},
{
"entropy": 0.45660400390625,
"epoch": 0.39215686274509803,
"grad_norm": 1.2019966132878526,
"learning_rate": 1.966483553946637e-05,
"loss": 0.5513,
"mean_token_accuracy": 0.8168433653190732,
"num_tokens": 43137034.0,
"step": 100
},
{
"entropy": 0.47357177734375,
"epoch": 0.396078431372549,
"grad_norm": 1.2701212654163299,
"learning_rate": 1.9653635768442872e-05,
"loss": 0.5448,
"mean_token_accuracy": 0.8182221315801144,
"num_tokens": 43550710.0,
"step": 101
},
{
"entropy": 0.469451904296875,
"epoch": 0.4,
"grad_norm": 1.191074358150651,
"learning_rate": 1.964225523142473e-05,
"loss": 0.5408,
"mean_token_accuracy": 0.8197409231215715,
"num_tokens": 43976597.0,
"step": 102
},
{
"entropy": 0.4630126953125,
"epoch": 0.403921568627451,
"grad_norm": 1.1845159595637624,
"learning_rate": 1.9630694141514467e-05,
"loss": 0.5391,
"mean_token_accuracy": 0.8227794086560607,
"num_tokens": 44386677.0,
"step": 103
},
{
"entropy": 0.45684814453125,
"epoch": 0.40784313725490196,
"grad_norm": 0.9700158907451113,
"learning_rate": 1.9618952715195476e-05,
"loss": 0.5366,
"mean_token_accuracy": 0.8220989098772407,
"num_tokens": 44824398.0,
"step": 104
},
{
"entropy": 0.466583251953125,
"epoch": 0.4117647058823529,
"grad_norm": 1.4454386759696154,
"learning_rate": 1.9607031172327998e-05,
"loss": 0.5436,
"mean_token_accuracy": 0.8189141182228923,
"num_tokens": 45253415.0,
"step": 105
},
{
"entropy": 0.46417236328125,
"epoch": 0.41568627450980394,
"grad_norm": 1.020453739389049,
"learning_rate": 1.9594929736144978e-05,
"loss": 0.5221,
"mean_token_accuracy": 0.8261368265375495,
"num_tokens": 45662349.0,
"step": 106
},
{
"entropy": 0.456329345703125,
"epoch": 0.4196078431372549,
"grad_norm": 1.2120811793285806,
"learning_rate": 1.958264863324789e-05,
"loss": 0.5345,
"mean_token_accuracy": 0.8214567014947534,
"num_tokens": 46116282.0,
"step": 107
},
{
"entropy": 0.451568603515625,
"epoch": 0.4235294117647059,
"grad_norm": 1.1449867557458162,
"learning_rate": 1.9570188093602512e-05,
"loss": 0.5368,
"mean_token_accuracy": 0.8221517028287053,
"num_tokens": 46561211.0,
"step": 108
},
{
"entropy": 0.46954345703125,
"epoch": 0.42745098039215684,
"grad_norm": 1.1463970481434735,
"learning_rate": 1.955754835053459e-05,
"loss": 0.5444,
"mean_token_accuracy": 0.821798668242991,
"num_tokens": 46996031.0,
"step": 109
},
{
"entropy": 0.46063232421875,
"epoch": 0.43137254901960786,
"grad_norm": 1.328722695820361,
"learning_rate": 1.95447296407255e-05,
"loss": 0.5332,
"mean_token_accuracy": 0.8222680538892746,
"num_tokens": 47440121.0,
"step": 110
},
{
"entropy": 0.468475341796875,
"epoch": 0.43529411764705883,
"grad_norm": 1.109295062470751,
"learning_rate": 1.9531732204207787e-05,
"loss": 0.548,
"mean_token_accuracy": 0.8184197628870606,
"num_tokens": 47877065.0,
"step": 111
},
{
"entropy": 0.47601318359375,
"epoch": 0.4392156862745098,
"grad_norm": 1.1607526849937826,
"learning_rate": 1.9518556284360696e-05,
"loss": 0.5298,
"mean_token_accuracy": 0.822773078456521,
"num_tokens": 48290264.0,
"step": 112
},
{
"entropy": 0.471038818359375,
"epoch": 0.44313725490196076,
"grad_norm": 1.020865323457694,
"learning_rate": 1.95052021279056e-05,
"loss": 0.521,
"mean_token_accuracy": 0.8239856511354446,
"num_tokens": 48704339.0,
"step": 113
},
{
"entropy": 0.461517333984375,
"epoch": 0.4470588235294118,
"grad_norm": 1.1750318383641127,
"learning_rate": 1.9491669984901377e-05,
"loss": 0.5309,
"mean_token_accuracy": 0.8254398861899972,
"num_tokens": 49132806.0,
"step": 114
},
{
"entropy": 0.45526123046875,
"epoch": 0.45098039215686275,
"grad_norm": 0.9995281650726873,
"learning_rate": 1.947796010873974e-05,
"loss": 0.5249,
"mean_token_accuracy": 0.8260870166122913,
"num_tokens": 49578930.0,
"step": 115
},
{
"entropy": 0.469482421875,
"epoch": 0.4549019607843137,
"grad_norm": 1.0746114295572882,
"learning_rate": 1.9464072756140487e-05,
"loss": 0.5328,
"mean_token_accuracy": 0.8212811881676316,
"num_tokens": 49996034.0,
"step": 116
},
{
"entropy": 0.472015380859375,
"epoch": 0.4588235294117647,
"grad_norm": 0.962126687778494,
"learning_rate": 1.9450008187146685e-05,
"loss": 0.5341,
"mean_token_accuracy": 0.8235579496249557,
"num_tokens": 50416951.0,
"step": 117
},
{
"entropy": 0.47576904296875,
"epoch": 0.4627450980392157,
"grad_norm": 1.0768479231849244,
"learning_rate": 1.9435766665119823e-05,
"loss": 0.5223,
"mean_token_accuracy": 0.8253951445221901,
"num_tokens": 50831955.0,
"step": 118
},
{
"entropy": 0.46588134765625,
"epoch": 0.4666666666666667,
"grad_norm": 1.1169527281919984,
"learning_rate": 1.9421348456734844e-05,
"loss": 0.5344,
"mean_token_accuracy": 0.8200209140777588,
"num_tokens": 51269078.0,
"step": 119
},
{
"entropy": 0.466888427734375,
"epoch": 0.47058823529411764,
"grad_norm": 1.0439601462864598,
"learning_rate": 1.9406753831975202e-05,
"loss": 0.5452,
"mean_token_accuracy": 0.8209556825459003,
"num_tokens": 51716754.0,
"step": 120
},
{
"entropy": 0.46319580078125,
"epoch": 0.4745098039215686,
"grad_norm": 1.0226579656321964,
"learning_rate": 1.939198306412775e-05,
"loss": 0.5218,
"mean_token_accuracy": 0.8236983455717564,
"num_tokens": 52135573.0,
"step": 121
},
{
"entropy": 0.458648681640625,
"epoch": 0.47843137254901963,
"grad_norm": 0.9029783127472882,
"learning_rate": 1.9377036429777673e-05,
"loss": 0.5149,
"mean_token_accuracy": 0.8285743938758969,
"num_tokens": 52566553.0,
"step": 122
},
{
"entropy": 0.458740234375,
"epoch": 0.4823529411764706,
"grad_norm": 1.0908467464044844,
"learning_rate": 1.9361914208803264e-05,
"loss": 0.5437,
"mean_token_accuracy": 0.8188828593119979,
"num_tokens": 53005267.0,
"step": 123
},
{
"entropy": 0.470458984375,
"epoch": 0.48627450980392156,
"grad_norm": 1.0725220647845308,
"learning_rate": 1.934661668437073e-05,
"loss": 0.5107,
"mean_token_accuracy": 0.8296375386416912,
"num_tokens": 53421602.0,
"step": 124
},
{
"entropy": 0.470245361328125,
"epoch": 0.49019607843137253,
"grad_norm": 1.133065623203244,
"learning_rate": 1.9331144142928853e-05,
"loss": 0.5308,
"mean_token_accuracy": 0.8249672232195735,
"num_tokens": 53830156.0,
"step": 125
},
{
"entropy": 0.458160400390625,
"epoch": 0.49411764705882355,
"grad_norm": 1.043613743096017,
"learning_rate": 1.9315496874203637e-05,
"loss": 0.5269,
"mean_token_accuracy": 0.8225527862086892,
"num_tokens": 54260269.0,
"step": 126
},
{
"entropy": 0.454193115234375,
"epoch": 0.4980392156862745,
"grad_norm": 1.0411321135052143,
"learning_rate": 1.929967517119289e-05,
"loss": 0.526,
"mean_token_accuracy": 0.8245013160631061,
"num_tokens": 54692550.0,
"step": 127
},
{
"entropy": 0.45831298828125,
"epoch": 0.5019607843137255,
"grad_norm": 1.153644894412413,
"learning_rate": 1.9283679330160726e-05,
"loss": 0.5372,
"mean_token_accuracy": 0.8229446588084102,
"num_tokens": 55129663.0,
"step": 128
},
{
"entropy": 0.454833984375,
"epoch": 0.5058823529411764,
"grad_norm": 1.0680654714661098,
"learning_rate": 1.926750965063203e-05,
"loss": 0.5218,
"mean_token_accuracy": 0.8256676206365228,
"num_tokens": 55558261.0,
"step": 129
},
{
"entropy": 0.45941162109375,
"epoch": 0.5098039215686274,
"grad_norm": 0.9504005715051883,
"learning_rate": 1.9251166435386837e-05,
"loss": 0.5363,
"mean_token_accuracy": 0.8229048978537321,
"num_tokens": 55998017.0,
"step": 130
},
{
"entropy": 0.46295166015625,
"epoch": 0.5137254901960784,
"grad_norm": 1.095073383543885,
"learning_rate": 1.9234649990454678e-05,
"loss": 0.5188,
"mean_token_accuracy": 0.8264925237745047,
"num_tokens": 56425710.0,
"step": 131
},
{
"entropy": 0.466278076171875,
"epoch": 0.5176470588235295,
"grad_norm": 1.0136365133134169,
"learning_rate": 1.921796062510882e-05,
"loss": 0.521,
"mean_token_accuracy": 0.8278189906850457,
"num_tokens": 56852573.0,
"step": 132
},
{
"entropy": 0.465911865234375,
"epoch": 0.5215686274509804,
"grad_norm": 1.0774587353071405,
"learning_rate": 1.920109865186052e-05,
"loss": 0.5321,
"mean_token_accuracy": 0.8233054708689451,
"num_tokens": 57303340.0,
"step": 133
},
{
"entropy": 0.455169677734375,
"epoch": 0.5254901960784314,
"grad_norm": 1.1845295561863336,
"learning_rate": 1.9184064386453127e-05,
"loss": 0.5279,
"mean_token_accuracy": 0.8221144881099463,
"num_tokens": 57749444.0,
"step": 134
},
{
"entropy": 0.455780029296875,
"epoch": 0.5294117647058824,
"grad_norm": 1.0251726517418989,
"learning_rate": 1.9166858147856204e-05,
"loss": 0.5169,
"mean_token_accuracy": 0.8259268430992961,
"num_tokens": 58175991.0,
"step": 135
},
{
"entropy": 0.454925537109375,
"epoch": 0.5333333333333333,
"grad_norm": 1.1572076691548627,
"learning_rate": 1.9149480258259535e-05,
"loss": 0.5255,
"mean_token_accuracy": 0.8251020405441523,
"num_tokens": 58609070.0,
"step": 136
},
{
"entropy": 0.464202880859375,
"epoch": 0.5372549019607843,
"grad_norm": 1.1385262369773776,
"learning_rate": 1.9131931043067092e-05,
"loss": 0.5208,
"mean_token_accuracy": 0.824607603251934,
"num_tokens": 59034229.0,
"step": 137
},
{
"entropy": 0.462188720703125,
"epoch": 0.5411764705882353,
"grad_norm": 1.0421945208646466,
"learning_rate": 1.911421083089097e-05,
"loss": 0.5161,
"mean_token_accuracy": 0.8247706349939108,
"num_tokens": 59474279.0,
"step": 138
},
{
"entropy": 0.455352783203125,
"epoch": 0.5450980392156862,
"grad_norm": 0.9905728313025403,
"learning_rate": 1.9096319953545186e-05,
"loss": 0.5145,
"mean_token_accuracy": 0.826055621728301,
"num_tokens": 59909618.0,
"step": 139
},
{
"entropy": 0.4610595703125,
"epoch": 0.5490196078431373,
"grad_norm": 1.2224526998602292,
"learning_rate": 1.907825874603951e-05,
"loss": 0.516,
"mean_token_accuracy": 0.827407187782228,
"num_tokens": 60333313.0,
"step": 140
},
{
"entropy": 0.45953369140625,
"epoch": 0.5529411764705883,
"grad_norm": 1.031383956090397,
"learning_rate": 1.9060027546573164e-05,
"loss": 0.5084,
"mean_token_accuracy": 0.8289670627564192,
"num_tokens": 60769519.0,
"step": 141
},
{
"entropy": 0.4581298828125,
"epoch": 0.5568627450980392,
"grad_norm": 0.9286455749135041,
"learning_rate": 1.9041626696528503e-05,
"loss": 0.5159,
"mean_token_accuracy": 0.8258885480463505,
"num_tokens": 61197717.0,
"step": 142
},
{
"entropy": 0.46160888671875,
"epoch": 0.5607843137254902,
"grad_norm": 1.043604648324428,
"learning_rate": 1.9023056540464622e-05,
"loss": 0.5088,
"mean_token_accuracy": 0.8298035254701972,
"num_tokens": 61623020.0,
"step": 143
},
{
"entropy": 0.45831298828125,
"epoch": 0.5647058823529412,
"grad_norm": 0.9853779681682011,
"learning_rate": 1.9004317426110888e-05,
"loss": 0.5081,
"mean_token_accuracy": 0.8284985879436135,
"num_tokens": 62041651.0,
"step": 144
},
{
"entropy": 0.45892333984375,
"epoch": 0.5686274509803921,
"grad_norm": 1.0256172993248214,
"learning_rate": 1.8985409704360457e-05,
"loss": 0.5237,
"mean_token_accuracy": 0.8231825344264507,
"num_tokens": 62478366.0,
"step": 145
},
{
"entropy": 0.4647216796875,
"epoch": 0.5725490196078431,
"grad_norm": 1.0172104304899114,
"learning_rate": 1.8966333729263674e-05,
"loss": 0.5017,
"mean_token_accuracy": 0.8279380407184362,
"num_tokens": 62893575.0,
"step": 146
},
{
"entropy": 0.457427978515625,
"epoch": 0.5764705882352941,
"grad_norm": 0.9671420496140763,
"learning_rate": 1.8947089858021465e-05,
"loss": 0.5169,
"mean_token_accuracy": 0.8288818374276161,
"num_tokens": 63347404.0,
"step": 147
},
{
"entropy": 0.46197509765625,
"epoch": 0.5803921568627451,
"grad_norm": 0.9552903663153749,
"learning_rate": 1.892767845097864e-05,
"loss": 0.5218,
"mean_token_accuracy": 0.8258289452642202,
"num_tokens": 63781233.0,
"step": 148
},
{
"entropy": 0.458984375,
"epoch": 0.5843137254901961,
"grad_norm": 0.8972556890492476,
"learning_rate": 1.8908099871617137e-05,
"loss": 0.5101,
"mean_token_accuracy": 0.827708194963634,
"num_tokens": 64205982.0,
"step": 149
},
{
"entropy": 0.45684814453125,
"epoch": 0.5882352941176471,
"grad_norm": 0.9940426982366383,
"learning_rate": 1.8888354486549238e-05,
"loss": 0.5292,
"mean_token_accuracy": 0.8226973535493016,
"num_tokens": 64620518.0,
"step": 150
},
{
"entropy": 0.451263427734375,
"epoch": 0.592156862745098,
"grad_norm": 1.0209199020984132,
"learning_rate": 1.886844266551068e-05,
"loss": 0.5037,
"mean_token_accuracy": 0.8314398853108287,
"num_tokens": 65055500.0,
"step": 151
},
{
"entropy": 0.450714111328125,
"epoch": 0.596078431372549,
"grad_norm": 1.0567665577574286,
"learning_rate": 1.8848364781353744e-05,
"loss": 0.5009,
"mean_token_accuracy": 0.828608175739646,
"num_tokens": 65478309.0,
"step": 152
},
{
"entropy": 0.44635009765625,
"epoch": 0.6,
"grad_norm": 1.0017296774811835,
"learning_rate": 1.882812121004028e-05,
"loss": 0.4949,
"mean_token_accuracy": 0.8324599312618375,
"num_tokens": 65910742.0,
"step": 153
},
{
"entropy": 0.453643798828125,
"epoch": 0.6039215686274509,
"grad_norm": 1.0657800071579415,
"learning_rate": 1.8807712330634645e-05,
"loss": 0.4935,
"mean_token_accuracy": 0.8339864388108253,
"num_tokens": 66319397.0,
"step": 154
},
{
"entropy": 0.455413818359375,
"epoch": 0.6078431372549019,
"grad_norm": 1.0028724888261045,
"learning_rate": 1.878713852529663e-05,
"loss": 0.524,
"mean_token_accuracy": 0.8232726603746414,
"num_tokens": 66754574.0,
"step": 155
},
{
"entropy": 0.455322265625,
"epoch": 0.611764705882353,
"grad_norm": 0.8811386198134418,
"learning_rate": 1.8766400179274287e-05,
"loss": 0.4927,
"mean_token_accuracy": 0.8332832558080554,
"num_tokens": 67168621.0,
"step": 156
},
{
"entropy": 0.45269775390625,
"epoch": 0.615686274509804,
"grad_norm": 0.9617989903466428,
"learning_rate": 1.8745497680896722e-05,
"loss": 0.5261,
"mean_token_accuracy": 0.825565142557025,
"num_tokens": 67617448.0,
"step": 157
},
{
"entropy": 0.46160888671875,
"epoch": 0.6196078431372549,
"grad_norm": 0.9722737962073038,
"learning_rate": 1.8724431421566822e-05,
"loss": 0.5062,
"mean_token_accuracy": 0.8278914587572217,
"num_tokens": 68042404.0,
"step": 158
},
{
"entropy": 0.4639892578125,
"epoch": 0.6235294117647059,
"grad_norm": 0.9314083104367087,
"learning_rate": 1.870320179575393e-05,
"loss": 0.5115,
"mean_token_accuracy": 0.8274906072765589,
"num_tokens": 68459798.0,
"step": 159
},
{
"entropy": 0.44720458984375,
"epoch": 0.6274509803921569,
"grad_norm": 0.9602745230104547,
"learning_rate": 1.868180920098644e-05,
"loss": 0.5033,
"mean_token_accuracy": 0.8310739854350686,
"num_tokens": 68889446.0,
"step": 160
},
{
"entropy": 0.4462890625,
"epoch": 0.6313725490196078,
"grad_norm": 0.9314028085759819,
"learning_rate": 1.866025403784439e-05,
"loss": 0.5136,
"mean_token_accuracy": 0.8256695969030261,
"num_tokens": 69299743.0,
"step": 161
},
{
"entropy": 0.442535400390625,
"epoch": 0.6352941176470588,
"grad_norm": 0.9736165597315848,
"learning_rate": 1.8638536709951916e-05,
"loss": 0.499,
"mean_token_accuracy": 0.8318196469917893,
"num_tokens": 69732061.0,
"step": 162
},
{
"entropy": 0.445587158203125,
"epoch": 0.6392156862745098,
"grad_norm": 1.101418386080784,
"learning_rate": 1.861665762396974e-05,
"loss": 0.5002,
"mean_token_accuracy": 0.8316458566114306,
"num_tokens": 70173908.0,
"step": 163
},
{
"entropy": 0.455230712890625,
"epoch": 0.6431372549019608,
"grad_norm": 0.9161407634822796,
"learning_rate": 1.8594617189587515e-05,
"loss": 0.4924,
"mean_token_accuracy": 0.8328952239826322,
"num_tokens": 70582274.0,
"step": 164
},
{
"entropy": 0.44940185546875,
"epoch": 0.6470588235294118,
"grad_norm": 0.9578481443615958,
"learning_rate": 1.8572415819516174e-05,
"loss": 0.5026,
"mean_token_accuracy": 0.8308584401383996,
"num_tokens": 71020923.0,
"step": 165
},
{
"entropy": 0.447174072265625,
"epoch": 0.6509803921568628,
"grad_norm": 0.8859183038806147,
"learning_rate": 1.8550053929480202e-05,
"loss": 0.5044,
"mean_token_accuracy": 0.8281080648303032,
"num_tokens": 71456687.0,
"step": 166
},
{
"entropy": 0.4407958984375,
"epoch": 0.6549019607843137,
"grad_norm": 1.073002915476322,
"learning_rate": 1.8527531938209847e-05,
"loss": 0.5064,
"mean_token_accuracy": 0.8275053184479475,
"num_tokens": 71895633.0,
"step": 167
},
{
"entropy": 0.44793701171875,
"epoch": 0.6588235294117647,
"grad_norm": 0.8588167816577594,
"learning_rate": 1.8504850267433278e-05,
"loss": 0.502,
"mean_token_accuracy": 0.8324728878214955,
"num_tokens": 72315957.0,
"step": 168
},
{
"entropy": 0.440399169921875,
"epoch": 0.6627450980392157,
"grad_norm": 0.9162787056521714,
"learning_rate": 1.8482009341868696e-05,
"loss": 0.5111,
"mean_token_accuracy": 0.829299739561975,
"num_tokens": 72758849.0,
"step": 169
},
{
"entropy": 0.449249267578125,
"epoch": 0.6666666666666666,
"grad_norm": 0.8975412677782875,
"learning_rate": 1.8459009589216364e-05,
"loss": 0.4852,
"mean_token_accuracy": 0.8351985597983003,
"num_tokens": 73180765.0,
"step": 170
},
{
"entropy": 0.444366455078125,
"epoch": 0.6705882352941176,
"grad_norm": 1.0823871467715829,
"learning_rate": 1.843585144015063e-05,
"loss": 0.495,
"mean_token_accuracy": 0.8346499102190137,
"num_tokens": 73588923.0,
"step": 171
},
{
"entropy": 0.443878173828125,
"epoch": 0.6745098039215687,
"grad_norm": 0.9614941949925443,
"learning_rate": 1.8412535328311813e-05,
"loss": 0.5077,
"mean_token_accuracy": 0.8277076184749603,
"num_tokens": 74045150.0,
"step": 172
},
{
"entropy": 0.442962646484375,
"epoch": 0.6784313725490196,
"grad_norm": 0.8983449509382112,
"learning_rate": 1.838906169029814e-05,
"loss": 0.4842,
"mean_token_accuracy": 0.8344194013625383,
"num_tokens": 74455581.0,
"step": 173
},
{
"entropy": 0.440032958984375,
"epoch": 0.6823529411764706,
"grad_norm": 0.8921437173146779,
"learning_rate": 1.8365430965657527e-05,
"loss": 0.4891,
"mean_token_accuracy": 0.8333101095631719,
"num_tokens": 74901722.0,
"step": 174
},
{
"entropy": 0.44647216796875,
"epoch": 0.6862745098039216,
"grad_norm": 0.9018886281577161,
"learning_rate": 1.834164359687937e-05,
"loss": 0.5169,
"mean_token_accuracy": 0.8270498281344771,
"num_tokens": 75346539.0,
"step": 175
},
{
"entropy": 0.46038818359375,
"epoch": 0.6901960784313725,
"grad_norm": 0.9208123316136007,
"learning_rate": 1.8317700029386245e-05,
"loss": 0.5019,
"mean_token_accuracy": 0.8328892076388001,
"num_tokens": 75755098.0,
"step": 176
},
{
"entropy": 0.454498291015625,
"epoch": 0.6941176470588235,
"grad_norm": 0.9713253889644098,
"learning_rate": 1.829360071152559e-05,
"loss": 0.4813,
"mean_token_accuracy": 0.8358625834807754,
"num_tokens": 76155127.0,
"step": 177
},
{
"entropy": 0.45330810546875,
"epoch": 0.6980392156862745,
"grad_norm": 0.9928686889194862,
"learning_rate": 1.826934609456129e-05,
"loss": 0.4968,
"mean_token_accuracy": 0.8308405773714185,
"num_tokens": 76581278.0,
"step": 178
},
{
"entropy": 0.448089599609375,
"epoch": 0.7019607843137254,
"grad_norm": 0.8728503339525145,
"learning_rate": 1.8244936632665223e-05,
"loss": 0.4943,
"mean_token_accuracy": 0.8327812422066927,
"num_tokens": 76984426.0,
"step": 179
},
{
"entropy": 0.442962646484375,
"epoch": 0.7058823529411765,
"grad_norm": 1.0407682234756923,
"learning_rate": 1.8220372782908778e-05,
"loss": 0.4951,
"mean_token_accuracy": 0.8317067986354232,
"num_tokens": 77421977.0,
"step": 180
},
{
"entropy": 0.440216064453125,
"epoch": 0.7098039215686275,
"grad_norm": 0.8599102112771374,
"learning_rate": 1.8195655005254274e-05,
"loss": 0.513,
"mean_token_accuracy": 0.8280303832143545,
"num_tokens": 77866296.0,
"step": 181
},
{
"entropy": 0.450531005859375,
"epoch": 0.7137254901960784,
"grad_norm": 0.89596271187072,
"learning_rate": 1.8170783762546363e-05,
"loss": 0.5075,
"mean_token_accuracy": 0.8298424258828163,
"num_tokens": 78304256.0,
"step": 182
},
{
"entropy": 0.4423828125,
"epoch": 0.7176470588235294,
"grad_norm": 0.8933849619381508,
"learning_rate": 1.814575952050336e-05,
"loss": 0.4934,
"mean_token_accuracy": 0.8312811693176627,
"num_tokens": 78730340.0,
"step": 183
},
{
"entropy": 0.449249267578125,
"epoch": 0.7215686274509804,
"grad_norm": 0.9127697862713822,
"learning_rate": 1.8120582747708503e-05,
"loss": 0.494,
"mean_token_accuracy": 0.8302021278068423,
"num_tokens": 79156747.0,
"step": 184
},
{
"entropy": 0.4403076171875,
"epoch": 0.7254901960784313,
"grad_norm": 0.9995234569515845,
"learning_rate": 1.8095253915601207e-05,
"loss": 0.486,
"mean_token_accuracy": 0.8339323159307241,
"num_tokens": 79582648.0,
"step": 185
},
{
"entropy": 0.4400634765625,
"epoch": 0.7294117647058823,
"grad_norm": 0.9755695900508942,
"learning_rate": 1.8069773498468224e-05,
"loss": 0.4915,
"mean_token_accuracy": 0.8324770601466298,
"num_tokens": 80006926.0,
"step": 186
},
{
"entropy": 0.43646240234375,
"epoch": 0.7333333333333333,
"grad_norm": 1.0501557093529328,
"learning_rate": 1.804414197343476e-05,
"loss": 0.4938,
"mean_token_accuracy": 0.8323720088228583,
"num_tokens": 80445925.0,
"step": 187
},
{
"entropy": 0.43475341796875,
"epoch": 0.7372549019607844,
"grad_norm": 0.9747183754878954,
"learning_rate": 1.8018359820455535e-05,
"loss": 0.5005,
"mean_token_accuracy": 0.8321980787441134,
"num_tokens": 80884102.0,
"step": 188
},
{
"entropy": 0.43695068359375,
"epoch": 0.7411764705882353,
"grad_norm": 1.013802709267682,
"learning_rate": 1.799242752230582e-05,
"loss": 0.4867,
"mean_token_accuracy": 0.8352240175008774,
"num_tokens": 81315280.0,
"step": 189
},
{
"entropy": 0.437896728515625,
"epoch": 0.7450980392156863,
"grad_norm": 0.8354277367263349,
"learning_rate": 1.796634556457236e-05,
"loss": 0.4958,
"mean_token_accuracy": 0.8291263459250331,
"num_tokens": 81748605.0,
"step": 190
},
{
"entropy": 0.4354248046875,
"epoch": 0.7490196078431373,
"grad_norm": 0.9816747402124396,
"learning_rate": 1.794011443564432e-05,
"loss": 0.483,
"mean_token_accuracy": 0.8366444101557136,
"num_tokens": 82175348.0,
"step": 191
},
{
"entropy": 0.44696044921875,
"epoch": 0.7529411764705882,
"grad_norm": 0.9513646004972339,
"learning_rate": 1.791373462670411e-05,
"loss": 0.4937,
"mean_token_accuracy": 0.8334898129105568,
"num_tokens": 82613145.0,
"step": 192
},
{
"entropy": 0.444091796875,
"epoch": 0.7568627450980392,
"grad_norm": 0.8255350035205672,
"learning_rate": 1.7887206631718202e-05,
"loss": 0.4876,
"mean_token_accuracy": 0.8342526415362954,
"num_tokens": 83047052.0,
"step": 193
},
{
"entropy": 0.44970703125,
"epoch": 0.7607843137254902,
"grad_norm": 0.8736860391365483,
"learning_rate": 1.7860530947427878e-05,
"loss": 0.4976,
"mean_token_accuracy": 0.8308942569419742,
"num_tokens": 83466934.0,
"step": 194
},
{
"entropy": 0.447509765625,
"epoch": 0.7647058823529411,
"grad_norm": 0.8782495436576085,
"learning_rate": 1.7833708073339924e-05,
"loss": 0.4943,
"mean_token_accuracy": 0.832427485845983,
"num_tokens": 83900582.0,
"step": 195
},
{
"entropy": 0.443267822265625,
"epoch": 0.7686274509803922,
"grad_norm": 0.863039317348048,
"learning_rate": 1.780673851171728e-05,
"loss": 0.5083,
"mean_token_accuracy": 0.8297006255015731,
"num_tokens": 84337148.0,
"step": 196
},
{
"entropy": 0.4403076171875,
"epoch": 0.7725490196078432,
"grad_norm": 0.8456444047616816,
"learning_rate": 1.777962276756965e-05,
"loss": 0.4791,
"mean_token_accuracy": 0.8360334020107985,
"num_tokens": 84757172.0,
"step": 197
},
{
"entropy": 0.442047119140625,
"epoch": 0.7764705882352941,
"grad_norm": 0.8417951704771196,
"learning_rate": 1.7752361348644012e-05,
"loss": 0.4849,
"mean_token_accuracy": 0.8353043273091316,
"num_tokens": 85186171.0,
"step": 198
},
{
"entropy": 0.44305419921875,
"epoch": 0.7803921568627451,
"grad_norm": 0.9116590851768235,
"learning_rate": 1.7724954765415137e-05,
"loss": 0.4834,
"mean_token_accuracy": 0.8365112636238337,
"num_tokens": 85616924.0,
"step": 199
},
{
"entropy": 0.445159912109375,
"epoch": 0.7843137254901961,
"grad_norm": 0.8973441814276175,
"learning_rate": 1.769740353107602e-05,
"loss": 0.4932,
"mean_token_accuracy": 0.8347958726808429,
"num_tokens": 86029797.0,
"step": 200
},
{
"entropy": 0.452972412109375,
"epoch": 0.788235294117647,
"grad_norm": 0.9120006674689222,
"learning_rate": 1.766970816152828e-05,
"loss": 0.4858,
"mean_token_accuracy": 0.8320359215140343,
"num_tokens": 86441674.0,
"step": 201
},
{
"entropy": 0.45050048828125,
"epoch": 0.792156862745098,
"grad_norm": 0.9530574679205193,
"learning_rate": 1.7641869175372493e-05,
"loss": 0.4986,
"mean_token_accuracy": 0.8322794009000063,
"num_tokens": 86871318.0,
"step": 202
},
{
"entropy": 0.449432373046875,
"epoch": 0.796078431372549,
"grad_norm": 0.7899417130805441,
"learning_rate": 1.7613887093898466e-05,
"loss": 0.479,
"mean_token_accuracy": 0.8354767337441444,
"num_tokens": 87301013.0,
"step": 203
},
{
"entropy": 0.4456787109375,
"epoch": 0.8,
"grad_norm": 0.8983278002195033,
"learning_rate": 1.7585762441075504e-05,
"loss": 0.4989,
"mean_token_accuracy": 0.8304571500048041,
"num_tokens": 87749700.0,
"step": 204
},
{
"entropy": 0.449981689453125,
"epoch": 0.803921568627451,
"grad_norm": 0.8482340735997149,
"learning_rate": 1.7557495743542586e-05,
"loss": 0.4913,
"mean_token_accuracy": 0.8334829676896334,
"num_tokens": 88201140.0,
"step": 205
},
{
"entropy": 0.446044921875,
"epoch": 0.807843137254902,
"grad_norm": 0.964208745237549,
"learning_rate": 1.752908753059849e-05,
"loss": 0.4923,
"mean_token_accuracy": 0.8305018618702888,
"num_tokens": 88633660.0,
"step": 206
},
{
"entropy": 0.443023681640625,
"epoch": 0.8117647058823529,
"grad_norm": 0.8368396835127176,
"learning_rate": 1.7500538334191906e-05,
"loss": 0.4852,
"mean_token_accuracy": 0.8355229757726192,
"num_tokens": 89048406.0,
"step": 207
},
{
"entropy": 0.4366455078125,
"epoch": 0.8156862745098039,
"grad_norm": 1.542876567727518,
"learning_rate": 1.7471848688911465e-05,
"loss": 0.482,
"mean_token_accuracy": 0.8349215844646096,
"num_tokens": 89469026.0,
"step": 208
},
{
"entropy": 0.441131591796875,
"epoch": 0.8196078431372549,
"grad_norm": 0.9628327965048106,
"learning_rate": 1.7443019131975716e-05,
"loss": 0.484,
"mean_token_accuracy": 0.8347738357260823,
"num_tokens": 89890734.0,
"step": 209
},
{
"entropy": 0.43792724609375,
"epoch": 0.8235294117647058,
"grad_norm": 0.8674843259606271,
"learning_rate": 1.7414050203223092e-05,
"loss": 0.4797,
"mean_token_accuracy": 0.8360974583774805,
"num_tokens": 90331223.0,
"step": 210
},
{
"entropy": 0.455902099609375,
"epoch": 0.8274509803921568,
"grad_norm": 0.8644106725290759,
"learning_rate": 1.7384942445101772e-05,
"loss": 0.4888,
"mean_token_accuracy": 0.8351241312921047,
"num_tokens": 90744739.0,
"step": 211
},
{
"entropy": 0.44989013671875,
"epoch": 0.8313725490196079,
"grad_norm": 0.8302621076190317,
"learning_rate": 1.735569640265955e-05,
"loss": 0.4739,
"mean_token_accuracy": 0.8376046605408192,
"num_tokens": 91158198.0,
"step": 212
},
{
"entropy": 0.454345703125,
"epoch": 0.8352941176470589,
"grad_norm": 0.870764317553708,
"learning_rate": 1.7326312623533617e-05,
"loss": 0.4778,
"mean_token_accuracy": 0.8371938867494464,
"num_tokens": 91579459.0,
"step": 213
},
{
"entropy": 0.446258544921875,
"epoch": 0.8392156862745098,
"grad_norm": 0.9379049791651398,
"learning_rate": 1.72967916579403e-05,
"loss": 0.4784,
"mean_token_accuracy": 0.8384145144373178,
"num_tokens": 92005351.0,
"step": 214
},
{
"entropy": 0.438934326171875,
"epoch": 0.8431372549019608,
"grad_norm": 0.8537995734738039,
"learning_rate": 1.7267134058664776e-05,
"loss": 0.4652,
"mean_token_accuracy": 0.8381716851145029,
"num_tokens": 92443937.0,
"step": 215
},
{
"entropy": 0.434356689453125,
"epoch": 0.8470588235294118,
"grad_norm": 0.7898971606485905,
"learning_rate": 1.72373403810507e-05,
"loss": 0.4908,
"mean_token_accuracy": 0.8318126108497381,
"num_tokens": 92892784.0,
"step": 216
},
{
"entropy": 0.439666748046875,
"epoch": 0.8509803921568627,
"grad_norm": 0.7535497265357768,
"learning_rate": 1.7207411182989834e-05,
"loss": 0.4663,
"mean_token_accuracy": 0.8424786748364568,
"num_tokens": 93325929.0,
"step": 217
},
{
"entropy": 0.440216064453125,
"epoch": 0.8549019607843137,
"grad_norm": 0.8363493018404706,
"learning_rate": 1.7177347024911562e-05,
"loss": 0.4757,
"mean_token_accuracy": 0.8364362018182874,
"num_tokens": 93745474.0,
"step": 218
},
{
"entropy": 0.434539794921875,
"epoch": 0.8588235294117647,
"grad_norm": 0.7460393470424544,
"learning_rate": 1.7147148469772433e-05,
"loss": 0.4741,
"mean_token_accuracy": 0.8371732924133539,
"num_tokens": 94189419.0,
"step": 219
},
{
"entropy": 0.433502197265625,
"epoch": 0.8627450980392157,
"grad_norm": 0.7927531784250542,
"learning_rate": 1.7116816083045603e-05,
"loss": 0.4687,
"mean_token_accuracy": 0.841255315579474,
"num_tokens": 94625785.0,
"step": 220
},
{
"entropy": 0.439544677734375,
"epoch": 0.8666666666666667,
"grad_norm": 0.847951723708805,
"learning_rate": 1.7086350432710243e-05,
"loss": 0.4729,
"mean_token_accuracy": 0.8400851683691144,
"num_tokens": 95045825.0,
"step": 221
},
{
"entropy": 0.439727783203125,
"epoch": 0.8705882352941177,
"grad_norm": 0.8653958965315521,
"learning_rate": 1.7055752089240907e-05,
"loss": 0.4872,
"mean_token_accuracy": 0.8322295276448131,
"num_tokens": 95485894.0,
"step": 222
},
{
"entropy": 0.436309814453125,
"epoch": 0.8745098039215686,
"grad_norm": 0.8139892412838203,
"learning_rate": 1.7025021625596852e-05,
"loss": 0.4877,
"mean_token_accuracy": 0.8352080434560776,
"num_tokens": 95903775.0,
"step": 223
},
{
"entropy": 0.43829345703125,
"epoch": 0.8784313725490196,
"grad_norm": 0.9137046083892381,
"learning_rate": 1.6994159617211318e-05,
"loss": 0.4757,
"mean_token_accuracy": 0.8375628003850579,
"num_tokens": 96324996.0,
"step": 224
},
{
"entropy": 0.44122314453125,
"epoch": 0.8823529411764706,
"grad_norm": 0.8019525849533575,
"learning_rate": 1.6963166641980732e-05,
"loss": 0.4815,
"mean_token_accuracy": 0.8376647494733334,
"num_tokens": 96751142.0,
"step": 225
},
{
"entropy": 0.438232421875,
"epoch": 0.8862745098039215,
"grad_norm": 0.7661466594667296,
"learning_rate": 1.6932043280253892e-05,
"loss": 0.4741,
"mean_token_accuracy": 0.8382897274568677,
"num_tokens": 97183056.0,
"step": 226
},
{
"entropy": 0.4429931640625,
"epoch": 0.8901960784313725,
"grad_norm": 0.9246954784051347,
"learning_rate": 1.6900790114821122e-05,
"loss": 0.4733,
"mean_token_accuracy": 0.8375686015933752,
"num_tokens": 97611237.0,
"step": 227
},
{
"entropy": 0.44525146484375,
"epoch": 0.8941176470588236,
"grad_norm": 0.9222883646984189,
"learning_rate": 1.686940773090333e-05,
"loss": 0.466,
"mean_token_accuracy": 0.8396632606163621,
"num_tokens": 98013920.0,
"step": 228
},
{
"entropy": 0.436859130859375,
"epoch": 0.8980392156862745,
"grad_norm": 0.7494759382545816,
"learning_rate": 1.683789671614107e-05,
"loss": 0.4784,
"mean_token_accuracy": 0.8365710796788335,
"num_tokens": 98445616.0,
"step": 229
},
{
"entropy": 0.44287109375,
"epoch": 0.9019607843137255,
"grad_norm": 0.8481872394661378,
"learning_rate": 1.6806257660583534e-05,
"loss": 0.4752,
"mean_token_accuracy": 0.8378980001434684,
"num_tokens": 98879880.0,
"step": 230
},
{
"entropy": 0.443023681640625,
"epoch": 0.9058823529411765,
"grad_norm": 0.7342093494551309,
"learning_rate": 1.6774491156677482e-05,
"loss": 0.4678,
"mean_token_accuracy": 0.8390908362343907,
"num_tokens": 99304994.0,
"step": 231
},
{
"entropy": 0.439727783203125,
"epoch": 0.9098039215686274,
"grad_norm": 0.7975903074062608,
"learning_rate": 1.6742597799256182e-05,
"loss": 0.4694,
"mean_token_accuracy": 0.8397601125761867,
"num_tokens": 99740204.0,
"step": 232
},
{
"entropy": 0.440093994140625,
"epoch": 0.9137254901960784,
"grad_norm": 0.8294915259809778,
"learning_rate": 1.6710578185528254e-05,
"loss": 0.4875,
"mean_token_accuracy": 0.8345666192471981,
"num_tokens": 100200826.0,
"step": 233
},
{
"entropy": 0.4527587890625,
"epoch": 0.9176470588235294,
"grad_norm": 0.8101787502262441,
"learning_rate": 1.6678432915066488e-05,
"loss": 0.4735,
"mean_token_accuracy": 0.8401564313098788,
"num_tokens": 100610122.0,
"step": 234
},
{
"entropy": 0.451934814453125,
"epoch": 0.9215686274509803,
"grad_norm": 0.7660857033302512,
"learning_rate": 1.6646162589796616e-05,
"loss": 0.4823,
"mean_token_accuracy": 0.835520800203085,
"num_tokens": 101053523.0,
"step": 235
},
{
"entropy": 0.440185546875,
"epoch": 0.9254901960784314,
"grad_norm": 0.8122888604746737,
"learning_rate": 1.6613767813986045e-05,
"loss": 0.4848,
"mean_token_accuracy": 0.8362872619181871,
"num_tokens": 101485940.0,
"step": 236
},
{
"entropy": 0.43414306640625,
"epoch": 0.9294117647058824,
"grad_norm": 0.7831540627887795,
"learning_rate": 1.6581249194232533e-05,
"loss": 0.4634,
"mean_token_accuracy": 0.8410006538033485,
"num_tokens": 101920333.0,
"step": 237
},
{
"entropy": 0.4376220703125,
"epoch": 0.9333333333333333,
"grad_norm": 0.8091781971460889,
"learning_rate": 1.6548607339452853e-05,
"loss": 0.4746,
"mean_token_accuracy": 0.8403336834162474,
"num_tokens": 102362931.0,
"step": 238
},
{
"entropy": 0.439788818359375,
"epoch": 0.9372549019607843,
"grad_norm": 0.7677541604132057,
"learning_rate": 1.6515842860871355e-05,
"loss": 0.4711,
"mean_token_accuracy": 0.8392134476453066,
"num_tokens": 102791458.0,
"step": 239
},
{
"entropy": 0.4378662109375,
"epoch": 0.9411764705882353,
"grad_norm": 0.7630280457517558,
"learning_rate": 1.648295637200856e-05,
"loss": 0.4694,
"mean_token_accuracy": 0.8395261112600565,
"num_tokens": 103217546.0,
"step": 240
},
{
"entropy": 0.44439697265625,
"epoch": 0.9450980392156862,
"grad_norm": 0.7151353860934048,
"learning_rate": 1.644994848866964e-05,
"loss": 0.4704,
"mean_token_accuracy": 0.8399761924520135,
"num_tokens": 103654329.0,
"step": 241
},
{
"entropy": 0.445831298828125,
"epoch": 0.9490196078431372,
"grad_norm": 0.7886114163810609,
"learning_rate": 1.64168198289329e-05,
"loss": 0.4628,
"mean_token_accuracy": 0.8427135553210974,
"num_tokens": 104065772.0,
"step": 242
},
{
"entropy": 0.437042236328125,
"epoch": 0.9529411764705882,
"grad_norm": 0.7506289883748392,
"learning_rate": 1.6383571013138214e-05,
"loss": 0.464,
"mean_token_accuracy": 0.8399297744035721,
"num_tokens": 104485121.0,
"step": 243
},
{
"entropy": 0.43280029296875,
"epoch": 0.9568627450980393,
"grad_norm": 0.738503159574171,
"learning_rate": 1.6350202663875385e-05,
"loss": 0.466,
"mean_token_accuracy": 0.840507653541863,
"num_tokens": 104911315.0,
"step": 244
},
{
"entropy": 0.4429931640625,
"epoch": 0.9607843137254902,
"grad_norm": 0.7698793504804927,
"learning_rate": 1.631671540597251e-05,
"loss": 0.4691,
"mean_token_accuracy": 0.8367423294112086,
"num_tokens": 105322868.0,
"step": 245
},
{
"entropy": 0.44146728515625,
"epoch": 0.9647058823529412,
"grad_norm": 0.7708943980471168,
"learning_rate": 1.628310986648427e-05,
"loss": 0.4815,
"mean_token_accuracy": 0.8389245234429836,
"num_tokens": 105754753.0,
"step": 246
},
{
"entropy": 0.45343017578125,
"epoch": 0.9686274509803922,
"grad_norm": 0.7584700182022692,
"learning_rate": 1.6249386674680186e-05,
"loss": 0.4738,
"mean_token_accuracy": 0.8398635992780328,
"num_tokens": 106148914.0,
"step": 247
},
{
"entropy": 0.438385009765625,
"epoch": 0.9725490196078431,
"grad_norm": 0.8570809330325667,
"learning_rate": 1.621554646203284e-05,
"loss": 0.4727,
"mean_token_accuracy": 0.8388482462614775,
"num_tokens": 106592260.0,
"step": 248
},
{
"entropy": 0.429840087890625,
"epoch": 0.9764705882352941,
"grad_norm": 0.7930657950269568,
"learning_rate": 1.6181589862206053e-05,
"loss": 0.477,
"mean_token_accuracy": 0.8356762723997235,
"num_tokens": 107045387.0,
"step": 249
},
{
"entropy": 0.447479248046875,
"epoch": 0.9803921568627451,
"grad_norm": 0.7855461075816964,
"learning_rate": 1.614751751104301e-05,
"loss": 0.4626,
"mean_token_accuracy": 0.8429703069850802,
"num_tokens": 107450555.0,
"step": 250
},
{
"entropy": 0.44061279296875,
"epoch": 0.984313725490196,
"grad_norm": 0.8480693241060412,
"learning_rate": 1.6113330046554363e-05,
"loss": 0.4725,
"mean_token_accuracy": 0.8400067826732993,
"num_tokens": 107869202.0,
"step": 251
},
{
"entropy": 0.437591552734375,
"epoch": 0.9882352941176471,
"grad_norm": 0.7799681659592422,
"learning_rate": 1.607902810890628e-05,
"loss": 0.4756,
"mean_token_accuracy": 0.838361494243145,
"num_tokens": 108294767.0,
"step": 252
},
{
"entropy": 0.440704345703125,
"epoch": 0.9921568627450981,
"grad_norm": 0.7683369799036914,
"learning_rate": 1.6044612340408466e-05,
"loss": 0.4727,
"mean_token_accuracy": 0.8381105521693826,
"num_tokens": 108700118.0,
"step": 253
},
{
"entropy": 0.43280029296875,
"epoch": 0.996078431372549,
"grad_norm": 0.8122897127695787,
"learning_rate": 1.601008338550211e-05,
"loss": 0.4692,
"mean_token_accuracy": 0.8396132970228791,
"num_tokens": 109111197.0,
"step": 254
},
{
"entropy": 0.431365966796875,
"epoch": 1.0,
"grad_norm": 0.6954505238166176,
"learning_rate": 1.5975441890747855e-05,
"loss": 0.4774,
"mean_token_accuracy": 0.8396214628592134,
"num_tokens": 109546636.0,
"step": 255
},
{
"entropy": 0.442596435546875,
"epoch": 1.003921568627451,
"grad_norm": 0.6963377343270832,
"learning_rate": 1.5940688504813664e-05,
"loss": 0.4379,
"mean_token_accuracy": 0.8488357551395893,
"num_tokens": 109971679.0,
"step": 256
},
{
"entropy": 0.442474365234375,
"epoch": 1.007843137254902,
"grad_norm": 0.8143419375331571,
"learning_rate": 1.590582387846268e-05,
"loss": 0.4392,
"mean_token_accuracy": 0.8477622792124748,
"num_tokens": 110393426.0,
"step": 257
},
{
"entropy": 0.429779052734375,
"epoch": 1.011764705882353,
"grad_norm": 0.7190829117341659,
"learning_rate": 1.5870848664541046e-05,
"loss": 0.4288,
"mean_token_accuracy": 0.84952078666538,
"num_tokens": 110824443.0,
"step": 258
},
{
"entropy": 0.41986083984375,
"epoch": 1.0156862745098039,
"grad_norm": 0.7240913276129756,
"learning_rate": 1.5835763517965676e-05,
"loss": 0.4313,
"mean_token_accuracy": 0.8483589226379991,
"num_tokens": 111260666.0,
"step": 259
},
{
"entropy": 0.425994873046875,
"epoch": 1.0196078431372548,
"grad_norm": 0.7709654036315373,
"learning_rate": 1.5800569095711983e-05,
"loss": 0.4379,
"mean_token_accuracy": 0.8487503128126264,
"num_tokens": 111695842.0,
"step": 260
},
{
"entropy": 0.417755126953125,
"epoch": 1.0235294117647058,
"grad_norm": 0.6752329019748353,
"learning_rate": 1.5765266056801603e-05,
"loss": 0.4154,
"mean_token_accuracy": 0.8561578085646033,
"num_tokens": 112126448.0,
"step": 261
},
{
"entropy": 0.4171142578125,
"epoch": 1.0274509803921568,
"grad_norm": 0.7858284344679825,
"learning_rate": 1.5729855062290024e-05,
"loss": 0.4284,
"mean_token_accuracy": 0.8497958136722445,
"num_tokens": 112561721.0,
"step": 262
},
{
"entropy": 0.429656982421875,
"epoch": 1.0313725490196077,
"grad_norm": 0.7086470475884443,
"learning_rate": 1.569433677525422e-05,
"loss": 0.4252,
"mean_token_accuracy": 0.8517554877325892,
"num_tokens": 112979380.0,
"step": 263
},
{
"entropy": 0.430023193359375,
"epoch": 1.035294117647059,
"grad_norm": 0.7246552088677546,
"learning_rate": 1.565871186078025e-05,
"loss": 0.4307,
"mean_token_accuracy": 0.849640991538763,
"num_tokens": 113406744.0,
"step": 264
},
{
"entropy": 0.4281005859375,
"epoch": 1.0392156862745099,
"grad_norm": 0.7560945351744308,
"learning_rate": 1.562298098595078e-05,
"loss": 0.4333,
"mean_token_accuracy": 0.8482805853709579,
"num_tokens": 113841273.0,
"step": 265
},
{
"entropy": 0.427642822265625,
"epoch": 1.0431372549019609,
"grad_norm": 0.7398785092951623,
"learning_rate": 1.55871448198326e-05,
"loss": 0.4262,
"mean_token_accuracy": 0.8510823482647538,
"num_tokens": 114273278.0,
"step": 266
},
{
"entropy": 0.43505859375,
"epoch": 1.0470588235294118,
"grad_norm": 0.7577350777991329,
"learning_rate": 1.5551204033464102e-05,
"loss": 0.4236,
"mean_token_accuracy": 0.8518886985257268,
"num_tokens": 114690533.0,
"step": 267
},
{
"entropy": 0.431854248046875,
"epoch": 1.0509803921568628,
"grad_norm": 0.7383675662288005,
"learning_rate": 1.551515929984271e-05,
"loss": 0.4289,
"mean_token_accuracy": 0.850223665125668,
"num_tokens": 115119453.0,
"step": 268
},
{
"entropy": 0.42645263671875,
"epoch": 1.0549019607843138,
"grad_norm": 0.7881284075354964,
"learning_rate": 1.5479011293912273e-05,
"loss": 0.4228,
"mean_token_accuracy": 0.8536723731085658,
"num_tokens": 115560457.0,
"step": 269
},
{
"entropy": 0.4232177734375,
"epoch": 1.0588235294117647,
"grad_norm": 0.7954689841424749,
"learning_rate": 1.5442760692550443e-05,
"loss": 0.4343,
"mean_token_accuracy": 0.8489060252904892,
"num_tokens": 115989927.0,
"step": 270
},
{
"entropy": 0.418365478515625,
"epoch": 1.0627450980392157,
"grad_norm": 0.7491371820505106,
"learning_rate": 1.5406408174555978e-05,
"loss": 0.4385,
"mean_token_accuracy": 0.8481973754242063,
"num_tokens": 116426232.0,
"step": 271
},
{
"entropy": 0.42626953125,
"epoch": 1.0666666666666667,
"grad_norm": 0.783768787078027,
"learning_rate": 1.5369954420636048e-05,
"loss": 0.4397,
"mean_token_accuracy": 0.8476524073630571,
"num_tokens": 116838678.0,
"step": 272
},
{
"entropy": 0.428741455078125,
"epoch": 1.0705882352941176,
"grad_norm": 0.7549673317013068,
"learning_rate": 1.533340011339348e-05,
"loss": 0.4245,
"mean_token_accuracy": 0.8504898408427835,
"num_tokens": 117262142.0,
"step": 273
},
{
"entropy": 0.42266845703125,
"epoch": 1.0745098039215686,
"grad_norm": 0.7760483061593083,
"learning_rate": 1.529674593731399e-05,
"loss": 0.4238,
"mean_token_accuracy": 0.8512313133105636,
"num_tokens": 117688261.0,
"step": 274
},
{
"entropy": 0.429931640625,
"epoch": 1.0784313725490196,
"grad_norm": 0.7346729703955766,
"learning_rate": 1.5259992578753335e-05,
"loss": 0.4303,
"mean_token_accuracy": 0.8502482092007995,
"num_tokens": 118118588.0,
"step": 275
},
{
"entropy": 0.43316650390625,
"epoch": 1.0823529411764705,
"grad_norm": 0.7610949914654637,
"learning_rate": 1.5223140725924494e-05,
"loss": 0.428,
"mean_token_accuracy": 0.8525876356288791,
"num_tokens": 118533618.0,
"step": 276
},
{
"entropy": 0.416259765625,
"epoch": 1.0862745098039215,
"grad_norm": 0.7586040410611942,
"learning_rate": 1.5186191068884774e-05,
"loss": 0.4172,
"mean_token_accuracy": 0.8536369241774082,
"num_tokens": 118986771.0,
"step": 277
},
{
"entropy": 0.419158935546875,
"epoch": 1.0901960784313725,
"grad_norm": 0.8122333245465506,
"learning_rate": 1.5149144299522874e-05,
"loss": 0.4353,
"mean_token_accuracy": 0.8489217078313231,
"num_tokens": 119418848.0,
"step": 278
},
{
"entropy": 0.4161376953125,
"epoch": 1.0941176470588236,
"grad_norm": 0.7971027405941981,
"learning_rate": 1.5112001111545933e-05,
"loss": 0.4328,
"mean_token_accuracy": 0.8489033579826355,
"num_tokens": 119840841.0,
"step": 279
},
{
"entropy": 0.4166259765625,
"epoch": 1.0980392156862746,
"grad_norm": 0.7894844623974637,
"learning_rate": 1.5074762200466557e-05,
"loss": 0.4314,
"mean_token_accuracy": 0.8496137168258429,
"num_tokens": 120279680.0,
"step": 280
},
{
"entropy": 0.420928955078125,
"epoch": 1.1019607843137256,
"grad_norm": 0.7956959838533788,
"learning_rate": 1.5037428263589778e-05,
"loss": 0.4318,
"mean_token_accuracy": 0.8514725975692272,
"num_tokens": 120700317.0,
"step": 281
},
{
"entropy": 0.423797607421875,
"epoch": 1.1058823529411765,
"grad_norm": 0.825458569699761,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.4277,
"mean_token_accuracy": 0.8525010421872139,
"num_tokens": 121126491.0,
"step": 282
},
{
"entropy": 0.43621826171875,
"epoch": 1.1098039215686275,
"grad_norm": 0.7166411546479006,
"learning_rate": 1.4962478110547918e-05,
"loss": 0.429,
"mean_token_accuracy": 0.8502249773591757,
"num_tokens": 121555610.0,
"step": 283
},
{
"entropy": 0.434906005859375,
"epoch": 1.1137254901960785,
"grad_norm": 0.7706765433868406,
"learning_rate": 1.4924863297837378e-05,
"loss": 0.4267,
"mean_token_accuracy": 0.8501248694956303,
"num_tokens": 121983963.0,
"step": 284
},
{
"entropy": 0.438873291015625,
"epoch": 1.1176470588235294,
"grad_norm": 0.7683977578682482,
"learning_rate": 1.4887156266212237e-05,
"loss": 0.4341,
"mean_token_accuracy": 0.8469060966745019,
"num_tokens": 122390376.0,
"step": 285
},
{
"entropy": 0.430755615234375,
"epoch": 1.1215686274509804,
"grad_norm": 0.7288640506851769,
"learning_rate": 1.4849357721743169e-05,
"loss": 0.4312,
"mean_token_accuracy": 0.8482475150376558,
"num_tokens": 122832305.0,
"step": 286
},
{
"entropy": 0.423187255859375,
"epoch": 1.1254901960784314,
"grad_norm": 0.7112151204180048,
"learning_rate": 1.4811468372214432e-05,
"loss": 0.4407,
"mean_token_accuracy": 0.84721062425524,
"num_tokens": 123272477.0,
"step": 287
},
{
"entropy": 0.425872802734375,
"epoch": 1.1294117647058823,
"grad_norm": 0.7543951967031106,
"learning_rate": 1.4773488927110633e-05,
"loss": 0.427,
"mean_token_accuracy": 0.8514616079628468,
"num_tokens": 123716296.0,
"step": 288
},
{
"entropy": 0.4237060546875,
"epoch": 1.1333333333333333,
"grad_norm": 0.7220973122261882,
"learning_rate": 1.473542009760343e-05,
"loss": 0.4256,
"mean_token_accuracy": 0.8516914714127779,
"num_tokens": 124149993.0,
"step": 289
},
{
"entropy": 0.4173583984375,
"epoch": 1.1372549019607843,
"grad_norm": 0.7187248533832408,
"learning_rate": 1.4697262596538227e-05,
"loss": 0.424,
"mean_token_accuracy": 0.8525945777073503,
"num_tokens": 124575738.0,
"step": 290
},
{
"entropy": 0.427032470703125,
"epoch": 1.1411764705882352,
"grad_norm": 0.6579326862882529,
"learning_rate": 1.4659017138420804e-05,
"loss": 0.4294,
"mean_token_accuracy": 0.852800234220922,
"num_tokens": 125012512.0,
"step": 291
},
{
"entropy": 0.434814453125,
"epoch": 1.1450980392156862,
"grad_norm": 0.724724576082981,
"learning_rate": 1.4620684439403962e-05,
"loss": 0.4401,
"mean_token_accuracy": 0.8472226839512587,
"num_tokens": 125437893.0,
"step": 292
},
{
"entropy": 0.428192138671875,
"epoch": 1.1490196078431372,
"grad_norm": 0.7183600555478822,
"learning_rate": 1.4582265217274105e-05,
"loss": 0.4229,
"mean_token_accuracy": 0.8529646163806319,
"num_tokens": 125877544.0,
"step": 293
},
{
"entropy": 0.429595947265625,
"epoch": 1.1529411764705881,
"grad_norm": 0.6896120937907955,
"learning_rate": 1.454376019143779e-05,
"loss": 0.4441,
"mean_token_accuracy": 0.8463943209499121,
"num_tokens": 126315970.0,
"step": 294
},
{
"entropy": 0.43206787109375,
"epoch": 1.156862745098039,
"grad_norm": 0.7105359525136665,
"learning_rate": 1.4505170082908269e-05,
"loss": 0.4363,
"mean_token_accuracy": 0.8492165962234139,
"num_tokens": 126749309.0,
"step": 295
},
{
"entropy": 0.424591064453125,
"epoch": 1.1607843137254903,
"grad_norm": 0.6660202625171524,
"learning_rate": 1.4466495614291977e-05,
"loss": 0.4246,
"mean_token_accuracy": 0.8515631575137377,
"num_tokens": 127191013.0,
"step": 296
},
{
"entropy": 0.42242431640625,
"epoch": 1.1647058823529413,
"grad_norm": 0.7632586900497342,
"learning_rate": 1.4427737509775008e-05,
"loss": 0.4233,
"mean_token_accuracy": 0.8523411825299263,
"num_tokens": 127628779.0,
"step": 297
},
{
"entropy": 0.420318603515625,
"epoch": 1.1686274509803922,
"grad_norm": 0.7099430978314399,
"learning_rate": 1.438889649510956e-05,
"loss": 0.4249,
"mean_token_accuracy": 0.8517980761826038,
"num_tokens": 128055765.0,
"step": 298
},
{
"entropy": 0.415679931640625,
"epoch": 1.1725490196078432,
"grad_norm": 0.7169933749455566,
"learning_rate": 1.4349973297600321e-05,
"loss": 0.4274,
"mean_token_accuracy": 0.8488310389220715,
"num_tokens": 128485889.0,
"step": 299
},
{
"entropy": 0.416351318359375,
"epoch": 1.1764705882352942,
"grad_norm": 0.722515066306239,
"learning_rate": 1.4310968646090884e-05,
"loss": 0.4279,
"mean_token_accuracy": 0.8511504996567965,
"num_tokens": 128915737.0,
"step": 300
},
{
"entropy": 0.421173095703125,
"epoch": 1.1803921568627451,
"grad_norm": 0.6798210825809636,
"learning_rate": 1.4271883270950073e-05,
"loss": 0.4275,
"mean_token_accuracy": 0.8499457621946931,
"num_tokens": 129337557.0,
"step": 301
},
{
"entropy": 0.418731689453125,
"epoch": 1.184313725490196,
"grad_norm": 0.706619653888489,
"learning_rate": 1.423271790405828e-05,
"loss": 0.4117,
"mean_token_accuracy": 0.8567122034728527,
"num_tokens": 129771578.0,
"step": 302
},
{
"entropy": 0.418914794921875,
"epoch": 1.188235294117647,
"grad_norm": 0.7361824434334385,
"learning_rate": 1.419347327879375e-05,
"loss": 0.4291,
"mean_token_accuracy": 0.850643745623529,
"num_tokens": 130196995.0,
"step": 303
},
{
"entropy": 0.412841796875,
"epoch": 1.192156862745098,
"grad_norm": 0.6661584399171278,
"learning_rate": 1.4154150130018867e-05,
"loss": 0.4345,
"mean_token_accuracy": 0.8520185491070151,
"num_tokens": 130628897.0,
"step": 304
},
{
"entropy": 0.417449951171875,
"epoch": 1.196078431372549,
"grad_norm": 0.7846231723266166,
"learning_rate": 1.4114749194066364e-05,
"loss": 0.428,
"mean_token_accuracy": 0.8515145275741816,
"num_tokens": 131045663.0,
"step": 305
},
{
"entropy": 0.414306640625,
"epoch": 1.2,
"grad_norm": 0.6574696580235325,
"learning_rate": 1.4075271208725572e-05,
"loss": 0.411,
"mean_token_accuracy": 0.8561296090483665,
"num_tokens": 131464562.0,
"step": 306
},
{
"entropy": 0.410675048828125,
"epoch": 1.203921568627451,
"grad_norm": 0.6673289170228338,
"learning_rate": 1.4035716913228568e-05,
"loss": 0.431,
"mean_token_accuracy": 0.8509927401319146,
"num_tokens": 131908062.0,
"step": 307
},
{
"entropy": 0.41754150390625,
"epoch": 1.2078431372549019,
"grad_norm": 0.7427627721794107,
"learning_rate": 1.3996087048236357e-05,
"loss": 0.425,
"mean_token_accuracy": 0.8522914592176676,
"num_tokens": 132321024.0,
"step": 308
},
{
"entropy": 0.405853271484375,
"epoch": 1.2117647058823529,
"grad_norm": 0.7168551440712705,
"learning_rate": 1.3956382355824999e-05,
"loss": 0.4325,
"mean_token_accuracy": 0.850104920566082,
"num_tokens": 132758451.0,
"step": 309
},
{
"entropy": 0.41082763671875,
"epoch": 1.215686274509804,
"grad_norm": 0.679207878607794,
"learning_rate": 1.3916603579471705e-05,
"loss": 0.4181,
"mean_token_accuracy": 0.8522440018132329,
"num_tokens": 133200509.0,
"step": 310
},
{
"entropy": 0.41253662109375,
"epoch": 1.219607843137255,
"grad_norm": 0.8035559374142178,
"learning_rate": 1.3876751464040924e-05,
"loss": 0.4213,
"mean_token_accuracy": 0.8530607046559453,
"num_tokens": 133630952.0,
"step": 311
},
{
"entropy": 0.40924072265625,
"epoch": 1.223529411764706,
"grad_norm": 0.6881590152098038,
"learning_rate": 1.3836826755770386e-05,
"loss": 0.4146,
"mean_token_accuracy": 0.8529272833839059,
"num_tokens": 134044313.0,
"step": 312
},
{
"entropy": 0.4156494140625,
"epoch": 1.227450980392157,
"grad_norm": 0.720307099360712,
"learning_rate": 1.3796830202257141e-05,
"loss": 0.4253,
"mean_token_accuracy": 0.8524025613442063,
"num_tokens": 134476638.0,
"step": 313
},
{
"entropy": 0.4110107421875,
"epoch": 1.231372549019608,
"grad_norm": 0.7218239013473345,
"learning_rate": 1.3756762552443555e-05,
"loss": 0.4307,
"mean_token_accuracy": 0.8505604760721326,
"num_tokens": 134910566.0,
"step": 314
},
{
"entropy": 0.414794921875,
"epoch": 1.2352941176470589,
"grad_norm": 0.6637567115140405,
"learning_rate": 1.3716624556603275e-05,
"loss": 0.4198,
"mean_token_accuracy": 0.8546493574976921,
"num_tokens": 135349522.0,
"step": 315
},
{
"entropy": 0.41571044921875,
"epoch": 1.2392156862745098,
"grad_norm": 0.68587837287742,
"learning_rate": 1.3676416966327201e-05,
"loss": 0.4142,
"mean_token_accuracy": 0.8545364672318101,
"num_tokens": 135759530.0,
"step": 316
},
{
"entropy": 0.423095703125,
"epoch": 1.2431372549019608,
"grad_norm": 0.7041633347426386,
"learning_rate": 1.3636140534509392e-05,
"loss": 0.4301,
"mean_token_accuracy": 0.8494144305586815,
"num_tokens": 136191290.0,
"step": 317
},
{
"entropy": 0.41656494140625,
"epoch": 1.2470588235294118,
"grad_norm": 0.6645492409494638,
"learning_rate": 1.3595796015332986e-05,
"loss": 0.4127,
"mean_token_accuracy": 0.8538919584825635,
"num_tokens": 136621679.0,
"step": 318
},
{
"entropy": 0.41436767578125,
"epoch": 1.2509803921568627,
"grad_norm": 0.6809848141129548,
"learning_rate": 1.3555384164256048e-05,
"loss": 0.4113,
"mean_token_accuracy": 0.8555595567449927,
"num_tokens": 137072932.0,
"step": 319
},
{
"entropy": 0.420654296875,
"epoch": 1.2549019607843137,
"grad_norm": 0.6860893890330958,
"learning_rate": 1.3514905737997474e-05,
"loss": 0.4257,
"mean_token_accuracy": 0.8516252571716905,
"num_tokens": 137498595.0,
"step": 320
},
{
"entropy": 0.41839599609375,
"epoch": 1.2588235294117647,
"grad_norm": 0.7209641987545784,
"learning_rate": 1.3474361494522769e-05,
"loss": 0.4135,
"mean_token_accuracy": 0.8547264030203223,
"num_tokens": 137929823.0,
"step": 321
},
{
"entropy": 0.41461181640625,
"epoch": 1.2627450980392156,
"grad_norm": 0.6642408443800759,
"learning_rate": 1.3433752193029888e-05,
"loss": 0.4227,
"mean_token_accuracy": 0.8537326790392399,
"num_tokens": 138366548.0,
"step": 322
},
{
"entropy": 0.414642333984375,
"epoch": 1.2666666666666666,
"grad_norm": 0.7129550559327684,
"learning_rate": 1.3393078593934998e-05,
"loss": 0.4252,
"mean_token_accuracy": 0.8525771573185921,
"num_tokens": 138805069.0,
"step": 323
},
{
"entropy": 0.422607421875,
"epoch": 1.2705882352941176,
"grad_norm": 0.7470458148795028,
"learning_rate": 1.3352341458858264e-05,
"loss": 0.4308,
"mean_token_accuracy": 0.8501967024058104,
"num_tokens": 139222262.0,
"step": 324
},
{
"entropy": 0.41412353515625,
"epoch": 1.2745098039215685,
"grad_norm": 0.7073090732730448,
"learning_rate": 1.3311541550609566e-05,
"loss": 0.4326,
"mean_token_accuracy": 0.8521415013819933,
"num_tokens": 139655569.0,
"step": 325
},
{
"entropy": 0.42816162109375,
"epoch": 1.2784313725490195,
"grad_norm": 0.6632060012055889,
"learning_rate": 1.3270679633174219e-05,
"loss": 0.4196,
"mean_token_accuracy": 0.8533567879348993,
"num_tokens": 140063044.0,
"step": 326
},
{
"entropy": 0.42047119140625,
"epoch": 1.2823529411764705,
"grad_norm": 0.7145450450096429,
"learning_rate": 1.3229756471698674e-05,
"loss": 0.4264,
"mean_token_accuracy": 0.8511630315333605,
"num_tokens": 140491506.0,
"step": 327
},
{
"entropy": 0.427001953125,
"epoch": 1.2862745098039214,
"grad_norm": 0.7417491681577237,
"learning_rate": 1.318877283247619e-05,
"loss": 0.4199,
"mean_token_accuracy": 0.8536494439467788,
"num_tokens": 140915505.0,
"step": 328
},
{
"entropy": 0.42401123046875,
"epoch": 1.2901960784313726,
"grad_norm": 0.7026578815553888,
"learning_rate": 1.3147729482932473e-05,
"loss": 0.4182,
"mean_token_accuracy": 0.8537089116871357,
"num_tokens": 141348827.0,
"step": 329
},
{
"entropy": 0.420562744140625,
"epoch": 1.2941176470588236,
"grad_norm": 0.7920529959240524,
"learning_rate": 1.3106627191611333e-05,
"loss": 0.4175,
"mean_token_accuracy": 0.8526282785460353,
"num_tokens": 141763227.0,
"step": 330
},
{
"entropy": 0.404693603515625,
"epoch": 1.2980392156862746,
"grad_norm": 0.7187972107180246,
"learning_rate": 1.3065466728160253e-05,
"loss": 0.4332,
"mean_token_accuracy": 0.8482584049925208,
"num_tokens": 142216480.0,
"step": 331
},
{
"entropy": 0.4119873046875,
"epoch": 1.3019607843137255,
"grad_norm": 0.7174309524641856,
"learning_rate": 1.3024248863316012e-05,
"loss": 0.423,
"mean_token_accuracy": 0.8517096359282732,
"num_tokens": 142657049.0,
"step": 332
},
{
"entropy": 0.409637451171875,
"epoch": 1.3058823529411765,
"grad_norm": 0.7028509712861,
"learning_rate": 1.2982974368890243e-05,
"loss": 0.4051,
"mean_token_accuracy": 0.8556926595047116,
"num_tokens": 143105947.0,
"step": 333
},
{
"entropy": 0.413299560546875,
"epoch": 1.3098039215686275,
"grad_norm": 0.7056529897906295,
"learning_rate": 1.2941644017754964e-05,
"loss": 0.4207,
"mean_token_accuracy": 0.851080933585763,
"num_tokens": 143536222.0,
"step": 334
},
{
"entropy": 0.4180908203125,
"epoch": 1.3137254901960784,
"grad_norm": 0.7476049940926148,
"learning_rate": 1.2900258583828138e-05,
"loss": 0.4287,
"mean_token_accuracy": 0.8512825155630708,
"num_tokens": 143975464.0,
"step": 335
},
{
"entropy": 0.419647216796875,
"epoch": 1.3176470588235294,
"grad_norm": 0.7336378531937496,
"learning_rate": 1.2858818842059145e-05,
"loss": 0.4152,
"mean_token_accuracy": 0.8559805741533637,
"num_tokens": 144399794.0,
"step": 336
},
{
"entropy": 0.416168212890625,
"epoch": 1.3215686274509804,
"grad_norm": 0.7405211465935665,
"learning_rate": 1.2817325568414299e-05,
"loss": 0.4246,
"mean_token_accuracy": 0.8506488613784313,
"num_tokens": 144828576.0,
"step": 337
},
{
"entropy": 0.42138671875,
"epoch": 1.3254901960784313,
"grad_norm": 0.7122042109511074,
"learning_rate": 1.2775779539862305e-05,
"loss": 0.4152,
"mean_token_accuracy": 0.8549979459494352,
"num_tokens": 145258310.0,
"step": 338
},
{
"entropy": 0.41534423828125,
"epoch": 1.3294117647058823,
"grad_norm": 0.7320013767426466,
"learning_rate": 1.273418153435971e-05,
"loss": 0.4269,
"mean_token_accuracy": 0.8519812626764178,
"num_tokens": 145683977.0,
"step": 339
},
{
"entropy": 0.417236328125,
"epoch": 1.3333333333333333,
"grad_norm": 0.6927031984828272,
"learning_rate": 1.2692532330836346e-05,
"loss": 0.428,
"mean_token_accuracy": 0.8527764491736889,
"num_tokens": 146122446.0,
"step": 340
},
{
"entropy": 0.416900634765625,
"epoch": 1.3372549019607844,
"grad_norm": 0.7155795792658018,
"learning_rate": 1.2650832709180727e-05,
"loss": 0.4058,
"mean_token_accuracy": 0.856788550503552,
"num_tokens": 146522149.0,
"step": 341
},
{
"entropy": 0.410552978515625,
"epoch": 1.3411764705882354,
"grad_norm": 0.6848398168854253,
"learning_rate": 1.2609083450225468e-05,
"loss": 0.4248,
"mean_token_accuracy": 0.8541076770052314,
"num_tokens": 146983682.0,
"step": 342
},
{
"entropy": 0.417816162109375,
"epoch": 1.3450980392156864,
"grad_norm": 0.6766196619423954,
"learning_rate": 1.2567285335732633e-05,
"loss": 0.4117,
"mean_token_accuracy": 0.8540923977270722,
"num_tokens": 147407175.0,
"step": 343
},
{
"entropy": 0.42071533203125,
"epoch": 1.3490196078431373,
"grad_norm": 0.7039047785450393,
"learning_rate": 1.2525439148379127e-05,
"loss": 0.4046,
"mean_token_accuracy": 0.8577220821753144,
"num_tokens": 147803720.0,
"step": 344
},
{
"entropy": 0.417327880859375,
"epoch": 1.3529411764705883,
"grad_norm": 0.699655445151693,
"learning_rate": 1.248354567174203e-05,
"loss": 0.4151,
"mean_token_accuracy": 0.8557805633172393,
"num_tokens": 148249099.0,
"step": 345
},
{
"entropy": 0.416839599609375,
"epoch": 1.3568627450980393,
"grad_norm": 0.6808551312339953,
"learning_rate": 1.2441605690283915e-05,
"loss": 0.4178,
"mean_token_accuracy": 0.8543298495933414,
"num_tokens": 148679725.0,
"step": 346
},
{
"entropy": 0.4178466796875,
"epoch": 1.3607843137254902,
"grad_norm": 0.6714585744701463,
"learning_rate": 1.2399619989338165e-05,
"loss": 0.4176,
"mean_token_accuracy": 0.853199539706111,
"num_tokens": 149105797.0,
"step": 347
},
{
"entropy": 0.419891357421875,
"epoch": 1.3647058823529412,
"grad_norm": 0.6847257289307138,
"learning_rate": 1.2357589355094275e-05,
"loss": 0.4176,
"mean_token_accuracy": 0.8529877169057727,
"num_tokens": 149521643.0,
"step": 348
},
{
"entropy": 0.42413330078125,
"epoch": 1.3686274509803922,
"grad_norm": 0.7120004884140041,
"learning_rate": 1.2315514574583113e-05,
"loss": 0.4181,
"mean_token_accuracy": 0.8546496015042067,
"num_tokens": 149936921.0,
"step": 349
},
{
"entropy": 0.413299560546875,
"epoch": 1.3725490196078431,
"grad_norm": 0.6593385779430017,
"learning_rate": 1.2273396435662212e-05,
"loss": 0.4077,
"mean_token_accuracy": 0.8568513067439198,
"num_tokens": 150358247.0,
"step": 350
},
{
"entropy": 0.416351318359375,
"epoch": 1.3764705882352941,
"grad_norm": 0.6748945354600052,
"learning_rate": 1.2231235727000977e-05,
"loss": 0.412,
"mean_token_accuracy": 0.8564546350389719,
"num_tokens": 150781639.0,
"step": 351
},
{
"entropy": 0.412689208984375,
"epoch": 1.380392156862745,
"grad_norm": 0.6627273883982908,
"learning_rate": 1.218903323806595e-05,
"loss": 0.4101,
"mean_token_accuracy": 0.8563144765794277,
"num_tokens": 151210244.0,
"step": 352
},
{
"entropy": 0.41314697265625,
"epoch": 1.384313725490196,
"grad_norm": 0.6804939839140359,
"learning_rate": 1.2146789759106016e-05,
"loss": 0.4297,
"mean_token_accuracy": 0.8509758925065398,
"num_tokens": 151652694.0,
"step": 353
},
{
"entropy": 0.409027099609375,
"epoch": 1.388235294117647,
"grad_norm": 0.7551814729523321,
"learning_rate": 1.2104506081137608e-05,
"loss": 0.4171,
"mean_token_accuracy": 0.8555029472336173,
"num_tokens": 152069524.0,
"step": 354
},
{
"entropy": 0.411651611328125,
"epoch": 1.392156862745098,
"grad_norm": 0.6638398446727486,
"learning_rate": 1.2062182995929883e-05,
"loss": 0.4133,
"mean_token_accuracy": 0.8564111962914467,
"num_tokens": 152496092.0,
"step": 355
},
{
"entropy": 0.4066162109375,
"epoch": 1.396078431372549,
"grad_norm": 0.6956702354585098,
"learning_rate": 1.2019821295989913e-05,
"loss": 0.4141,
"mean_token_accuracy": 0.8558093551546335,
"num_tokens": 152936586.0,
"step": 356
},
{
"entropy": 0.412200927734375,
"epoch": 1.4,
"grad_norm": 0.7356334108107045,
"learning_rate": 1.1977421774547832e-05,
"loss": 0.4142,
"mean_token_accuracy": 0.8541790386661887,
"num_tokens": 153371852.0,
"step": 357
},
{
"entropy": 0.4122314453125,
"epoch": 1.4039215686274509,
"grad_norm": 0.7014777687549074,
"learning_rate": 1.1934985225541998e-05,
"loss": 0.4125,
"mean_token_accuracy": 0.8557530920952559,
"num_tokens": 153789570.0,
"step": 358
},
{
"entropy": 0.40606689453125,
"epoch": 1.4078431372549018,
"grad_norm": 0.6865162084151079,
"learning_rate": 1.1892512443604103e-05,
"loss": 0.4116,
"mean_token_accuracy": 0.854659709148109,
"num_tokens": 154208764.0,
"step": 359
},
{
"entropy": 0.408111572265625,
"epoch": 1.4117647058823528,
"grad_norm": 0.712196123175708,
"learning_rate": 1.1850004224044315e-05,
"loss": 0.3975,
"mean_token_accuracy": 0.8598070461302996,
"num_tokens": 154630249.0,
"step": 360
},
{
"entropy": 0.405975341796875,
"epoch": 1.415686274509804,
"grad_norm": 0.6986703398021826,
"learning_rate": 1.1807461362836382e-05,
"loss": 0.4112,
"mean_token_accuracy": 0.8554151114076376,
"num_tokens": 155063669.0,
"step": 361
},
{
"entropy": 0.40484619140625,
"epoch": 1.419607843137255,
"grad_norm": 0.7011227779262905,
"learning_rate": 1.1764884656602711e-05,
"loss": 0.4155,
"mean_token_accuracy": 0.8543020207434893,
"num_tokens": 155502743.0,
"step": 362
},
{
"entropy": 0.41925048828125,
"epoch": 1.423529411764706,
"grad_norm": 0.7388781102918118,
"learning_rate": 1.1722274902599469e-05,
"loss": 0.4188,
"mean_token_accuracy": 0.8535276213660836,
"num_tokens": 155911398.0,
"step": 363
},
{
"entropy": 0.409423828125,
"epoch": 1.427450980392157,
"grad_norm": 0.6903468704485393,
"learning_rate": 1.1679632898701649e-05,
"loss": 0.428,
"mean_token_accuracy": 0.849765595048666,
"num_tokens": 156361652.0,
"step": 364
},
{
"entropy": 0.41168212890625,
"epoch": 1.4313725490196079,
"grad_norm": 0.6778766361688789,
"learning_rate": 1.1636959443388131e-05,
"loss": 0.4015,
"mean_token_accuracy": 0.8571812696754932,
"num_tokens": 156807343.0,
"step": 365
},
{
"entropy": 0.412109375,
"epoch": 1.4352941176470588,
"grad_norm": 0.642733238897495,
"learning_rate": 1.1594255335726725e-05,
"loss": 0.4145,
"mean_token_accuracy": 0.853903261013329,
"num_tokens": 157249464.0,
"step": 366
},
{
"entropy": 0.424652099609375,
"epoch": 1.4392156862745098,
"grad_norm": 0.6483544400990201,
"learning_rate": 1.1551521375359207e-05,
"loss": 0.4046,
"mean_token_accuracy": 0.857556514441967,
"num_tokens": 157640719.0,
"step": 367
},
{
"entropy": 0.409210205078125,
"epoch": 1.4431372549019608,
"grad_norm": 0.7222910236522575,
"learning_rate": 1.1508758362486358e-05,
"loss": 0.4266,
"mean_token_accuracy": 0.8515659496188164,
"num_tokens": 158086707.0,
"step": 368
},
{
"entropy": 0.410858154296875,
"epoch": 1.4470588235294117,
"grad_norm": 0.6823438746266127,
"learning_rate": 1.1465967097852971e-05,
"loss": 0.4092,
"mean_token_accuracy": 0.8546876255422831,
"num_tokens": 158534215.0,
"step": 369
},
{
"entropy": 0.4141845703125,
"epoch": 1.4509803921568627,
"grad_norm": 0.7292179149719259,
"learning_rate": 1.1423148382732854e-05,
"loss": 0.4237,
"mean_token_accuracy": 0.8510736022144556,
"num_tokens": 158964888.0,
"step": 370
},
{
"entropy": 0.410247802734375,
"epoch": 1.4549019607843137,
"grad_norm": 0.6971738873757594,
"learning_rate": 1.1380303018913832e-05,
"loss": 0.4172,
"mean_token_accuracy": 0.8541428428143263,
"num_tokens": 159399267.0,
"step": 371
},
{
"entropy": 0.415679931640625,
"epoch": 1.4588235294117646,
"grad_norm": 0.7383800792346643,
"learning_rate": 1.133743180868273e-05,
"loss": 0.4221,
"mean_token_accuracy": 0.8528790548443794,
"num_tokens": 159819677.0,
"step": 372
},
{
"entropy": 0.412841796875,
"epoch": 1.4627450980392158,
"grad_norm": 0.6805212338474808,
"learning_rate": 1.1294535554810356e-05,
"loss": 0.4161,
"mean_token_accuracy": 0.8552266210317612,
"num_tokens": 160254854.0,
"step": 373
},
{
"entropy": 0.412506103515625,
"epoch": 1.4666666666666668,
"grad_norm": 0.7480351284538674,
"learning_rate": 1.125161506053646e-05,
"loss": 0.4071,
"mean_token_accuracy": 0.8572438461706042,
"num_tokens": 160685537.0,
"step": 374
},
{
"entropy": 0.41534423828125,
"epoch": 1.4705882352941178,
"grad_norm": 0.6771163486521969,
"learning_rate": 1.1208671129554703e-05,
"loss": 0.4162,
"mean_token_accuracy": 0.8534526033326983,
"num_tokens": 161126542.0,
"step": 375
},
{
"entropy": 0.412841796875,
"epoch": 1.4745098039215687,
"grad_norm": 0.6397900219361803,
"learning_rate": 1.1165704565997593e-05,
"loss": 0.4009,
"mean_token_accuracy": 0.8574716188013554,
"num_tokens": 161551358.0,
"step": 376
},
{
"entropy": 0.41497802734375,
"epoch": 1.4784313725490197,
"grad_norm": 0.7413777447374873,
"learning_rate": 1.1122716174421446e-05,
"loss": 0.4097,
"mean_token_accuracy": 0.8574340445920825,
"num_tokens": 161967605.0,
"step": 377
},
{
"entropy": 0.41448974609375,
"epoch": 1.4823529411764707,
"grad_norm": 0.6851273431643456,
"learning_rate": 1.1079706759791311e-05,
"loss": 0.4105,
"mean_token_accuracy": 0.8551990939304233,
"num_tokens": 162376155.0,
"step": 378
},
{
"entropy": 0.40765380859375,
"epoch": 1.4862745098039216,
"grad_norm": 0.7262571621062189,
"learning_rate": 1.103667712746589e-05,
"loss": 0.4063,
"mean_token_accuracy": 0.8556642541661859,
"num_tokens": 162795464.0,
"step": 379
},
{
"entropy": 0.418060302734375,
"epoch": 1.4901960784313726,
"grad_norm": 0.709837787003394,
"learning_rate": 1.0993628083182468e-05,
"loss": 0.4101,
"mean_token_accuracy": 0.8567473096773028,
"num_tokens": 163211049.0,
"step": 380
},
{
"entropy": 0.412078857421875,
"epoch": 1.4941176470588236,
"grad_norm": 0.7160640024703722,
"learning_rate": 1.0950560433041825e-05,
"loss": 0.4204,
"mean_token_accuracy": 0.853213481605053,
"num_tokens": 163651169.0,
"step": 381
},
{
"entropy": 0.4249267578125,
"epoch": 1.4980392156862745,
"grad_norm": 0.6635137849439725,
"learning_rate": 1.0907474983493144e-05,
"loss": 0.408,
"mean_token_accuracy": 0.8586263991892338,
"num_tokens": 164052131.0,
"step": 382
},
{
"entropy": 0.41943359375,
"epoch": 1.5019607843137255,
"grad_norm": 0.7005678883006693,
"learning_rate": 1.0864372541318891e-05,
"loss": 0.3892,
"mean_token_accuracy": 0.8622283479198813,
"num_tokens": 164472628.0,
"step": 383
},
{
"entropy": 0.419921875,
"epoch": 1.5058823529411764,
"grad_norm": 0.7354010110687161,
"learning_rate": 1.0821253913619727e-05,
"loss": 0.4204,
"mean_token_accuracy": 0.8538774671033025,
"num_tokens": 164917451.0,
"step": 384
},
{
"entropy": 0.411956787109375,
"epoch": 1.5098039215686274,
"grad_norm": 0.7080530760627707,
"learning_rate": 1.0778119907799399e-05,
"loss": 0.4022,
"mean_token_accuracy": 0.8572713797912002,
"num_tokens": 165348112.0,
"step": 385
},
{
"entropy": 0.411041259765625,
"epoch": 1.5137254901960784,
"grad_norm": 0.6752314903003961,
"learning_rate": 1.0734971331549604e-05,
"loss": 0.4014,
"mean_token_accuracy": 0.8585884692147374,
"num_tokens": 165757610.0,
"step": 386
},
{
"entropy": 0.409271240234375,
"epoch": 1.5176470588235293,
"grad_norm": 0.859464788101391,
"learning_rate": 1.0691808992834866e-05,
"loss": 0.4138,
"mean_token_accuracy": 0.8532675765454769,
"num_tokens": 166191021.0,
"step": 387
},
{
"entropy": 0.40966796875,
"epoch": 1.5215686274509803,
"grad_norm": 0.6723968211994824,
"learning_rate": 1.064863369987743e-05,
"loss": 0.411,
"mean_token_accuracy": 0.8572338540107012,
"num_tokens": 166647810.0,
"step": 388
},
{
"entropy": 0.4100341796875,
"epoch": 1.5254901960784313,
"grad_norm": 0.6654450287296578,
"learning_rate": 1.06054462611421e-05,
"loss": 0.4053,
"mean_token_accuracy": 0.8566803587600589,
"num_tokens": 167091919.0,
"step": 389
},
{
"entropy": 0.411102294921875,
"epoch": 1.5294117647058822,
"grad_norm": 0.6743637396313977,
"learning_rate": 1.0562247485321116e-05,
"loss": 0.4102,
"mean_token_accuracy": 0.8562160143628716,
"num_tokens": 167524975.0,
"step": 390
},
{
"entropy": 0.412933349609375,
"epoch": 1.5333333333333332,
"grad_norm": 0.6979119955346013,
"learning_rate": 1.0519038181319e-05,
"loss": 0.3989,
"mean_token_accuracy": 0.8592202458530664,
"num_tokens": 167957081.0,
"step": 391
},
{
"entropy": 0.414581298828125,
"epoch": 1.5372549019607842,
"grad_norm": 0.7214480435006635,
"learning_rate": 1.0475819158237426e-05,
"loss": 0.4164,
"mean_token_accuracy": 0.854176253080368,
"num_tokens": 168382901.0,
"step": 392
},
{
"entropy": 0.410247802734375,
"epoch": 1.5411764705882351,
"grad_norm": 0.7240884794501748,
"learning_rate": 1.0432591225360052e-05,
"loss": 0.4269,
"mean_token_accuracy": 0.851943246088922,
"num_tokens": 168830532.0,
"step": 393
},
{
"entropy": 0.41070556640625,
"epoch": 1.5450980392156861,
"grad_norm": 0.6615798629050048,
"learning_rate": 1.0389355192137379e-05,
"loss": 0.396,
"mean_token_accuracy": 0.8609147928655148,
"num_tokens": 169260615.0,
"step": 394
},
{
"entropy": 0.403564453125,
"epoch": 1.5490196078431373,
"grad_norm": 0.6950370598617063,
"learning_rate": 1.0346111868171584e-05,
"loss": 0.3964,
"mean_token_accuracy": 0.8605449888855219,
"num_tokens": 169688457.0,
"step": 395
},
{
"entropy": 0.406524658203125,
"epoch": 1.5529411764705883,
"grad_norm": 0.6784651046053325,
"learning_rate": 1.0302862063201367e-05,
"loss": 0.4099,
"mean_token_accuracy": 0.8544306671246886,
"num_tokens": 170139299.0,
"step": 396
},
{
"entropy": 0.4044189453125,
"epoch": 1.5568627450980392,
"grad_norm": 0.7847918254095473,
"learning_rate": 1.0259606587086783e-05,
"loss": 0.4179,
"mean_token_accuracy": 0.8526219138875604,
"num_tokens": 170587531.0,
"step": 397
},
{
"entropy": 0.40472412109375,
"epoch": 1.5607843137254902,
"grad_norm": 0.6894614523455117,
"learning_rate": 1.0216346249794087e-05,
"loss": 0.4079,
"mean_token_accuracy": 0.8570181773975492,
"num_tokens": 171025712.0,
"step": 398
},
{
"entropy": 0.40240478515625,
"epoch": 1.5647058823529412,
"grad_norm": 0.6841419524220748,
"learning_rate": 1.0173081861380551e-05,
"loss": 0.391,
"mean_token_accuracy": 0.8597865039482713,
"num_tokens": 171479929.0,
"step": 399
},
{
"entropy": 0.412628173828125,
"epoch": 1.5686274509803921,
"grad_norm": 0.6912607545067652,
"learning_rate": 1.012981423197931e-05,
"loss": 0.3995,
"mean_token_accuracy": 0.8574095563963056,
"num_tokens": 171913547.0,
"step": 400
},
{
"entropy": 0.41455078125,
"epoch": 1.572549019607843,
"grad_norm": 0.6752207564966213,
"learning_rate": 1.0086544171784187e-05,
"loss": 0.4011,
"mean_token_accuracy": 0.8586401976644993,
"num_tokens": 172350197.0,
"step": 401
},
{
"entropy": 0.40863037109375,
"epoch": 1.576470588235294,
"grad_norm": 0.71746520315725,
"learning_rate": 1.0043272491034523e-05,
"loss": 0.3977,
"mean_token_accuracy": 0.8575174137949944,
"num_tokens": 172777991.0,
"step": 402
},
{
"entropy": 0.41156005859375,
"epoch": 1.5803921568627453,
"grad_norm": 0.675986123292717,
"learning_rate": 1e-05,
"loss": 0.3949,
"mean_token_accuracy": 0.8585043726488948,
"num_tokens": 173200136.0,
"step": 403
},
{
"entropy": 0.407012939453125,
"epoch": 1.5843137254901962,
"grad_norm": 0.6925221354784149,
"learning_rate": 9.956727508965482e-06,
"loss": 0.414,
"mean_token_accuracy": 0.8540292549878359,
"num_tokens": 173629631.0,
"step": 404
},
{
"entropy": 0.410736083984375,
"epoch": 1.5882352941176472,
"grad_norm": 0.6847506074125855,
"learning_rate": 9.913455828215815e-06,
"loss": 0.3959,
"mean_token_accuracy": 0.8615149781107903,
"num_tokens": 174054145.0,
"step": 405
},
{
"entropy": 0.410858154296875,
"epoch": 1.5921568627450982,
"grad_norm": 0.722859402637327,
"learning_rate": 9.870185768020694e-06,
"loss": 0.3992,
"mean_token_accuracy": 0.8596215089783072,
"num_tokens": 174473463.0,
"step": 406
},
{
"entropy": 0.406646728515625,
"epoch": 1.5960784313725491,
"grad_norm": 0.6946870646270319,
"learning_rate": 9.826918138619454e-06,
"loss": 0.4038,
"mean_token_accuracy": 0.8573076035827398,
"num_tokens": 174892440.0,
"step": 407
},
{
"entropy": 0.41070556640625,
"epoch": 1.6,
"grad_norm": 0.6890905076143856,
"learning_rate": 9.783653750205916e-06,
"loss": 0.3854,
"mean_token_accuracy": 0.8630838803946972,
"num_tokens": 175299479.0,
"step": 408
},
{
"entropy": 0.40509033203125,
"epoch": 1.603921568627451,
"grad_norm": 0.7026567580948715,
"learning_rate": 9.740393412913219e-06,
"loss": 0.3993,
"mean_token_accuracy": 0.8585938615724444,
"num_tokens": 175727422.0,
"step": 409
},
{
"entropy": 0.407501220703125,
"epoch": 1.607843137254902,
"grad_norm": 0.6499788718397761,
"learning_rate": 9.697137936798635e-06,
"loss": 0.4118,
"mean_token_accuracy": 0.8579577170312405,
"num_tokens": 176171135.0,
"step": 410
},
{
"entropy": 0.41387939453125,
"epoch": 1.611764705882353,
"grad_norm": 0.7288347926112773,
"learning_rate": 9.65388813182842e-06,
"loss": 0.4048,
"mean_token_accuracy": 0.857942121103406,
"num_tokens": 176605815.0,
"step": 411
},
{
"entropy": 0.406585693359375,
"epoch": 1.615686274509804,
"grad_norm": 0.6873130367941563,
"learning_rate": 9.610644807862625e-06,
"loss": 0.385,
"mean_token_accuracy": 0.8628216292709112,
"num_tokens": 177046849.0,
"step": 412
},
{
"entropy": 0.409515380859375,
"epoch": 1.619607843137255,
"grad_norm": 0.8159477025546342,
"learning_rate": 9.567408774639951e-06,
"loss": 0.4114,
"mean_token_accuracy": 0.8583543403074145,
"num_tokens": 177479814.0,
"step": 413
},
{
"entropy": 0.4022216796875,
"epoch": 1.6235294117647059,
"grad_norm": 0.7046825130274702,
"learning_rate": 9.524180841762577e-06,
"loss": 0.4079,
"mean_token_accuracy": 0.8577337488532066,
"num_tokens": 177942489.0,
"step": 414
},
{
"entropy": 0.406951904296875,
"epoch": 1.6274509803921569,
"grad_norm": 0.67812309257672,
"learning_rate": 9.480961818681004e-06,
"loss": 0.3981,
"mean_token_accuracy": 0.8608548073098063,
"num_tokens": 178372403.0,
"step": 415
},
{
"entropy": 0.408172607421875,
"epoch": 1.6313725490196078,
"grad_norm": 0.6501390152436636,
"learning_rate": 9.437752514678888e-06,
"loss": 0.3822,
"mean_token_accuracy": 0.864359175786376,
"num_tokens": 178802249.0,
"step": 416
},
{
"entropy": 0.411834716796875,
"epoch": 1.6352941176470588,
"grad_norm": 0.6919125667004101,
"learning_rate": 9.394553738857902e-06,
"loss": 0.3804,
"mean_token_accuracy": 0.8647814923897386,
"num_tokens": 179211558.0,
"step": 417
},
{
"entropy": 0.418670654296875,
"epoch": 1.6392156862745098,
"grad_norm": 0.745063777167442,
"learning_rate": 9.351366300122569e-06,
"loss": 0.4066,
"mean_token_accuracy": 0.8586796801537275,
"num_tokens": 179626922.0,
"step": 418
},
{
"entropy": 0.40985107421875,
"epoch": 1.6431372549019607,
"grad_norm": 0.7460150646397898,
"learning_rate": 9.308191007165135e-06,
"loss": 0.4107,
"mean_token_accuracy": 0.8577667633071542,
"num_tokens": 180060705.0,
"step": 419
},
{
"entropy": 0.4140625,
"epoch": 1.6470588235294117,
"grad_norm": 0.6550263847678918,
"learning_rate": 9.265028668450403e-06,
"loss": 0.3917,
"mean_token_accuracy": 0.8609657865017653,
"num_tokens": 180478026.0,
"step": 420
},
{
"entropy": 0.413665771484375,
"epoch": 1.6509803921568627,
"grad_norm": 0.7020773548739169,
"learning_rate": 9.221880092200601e-06,
"loss": 0.4072,
"mean_token_accuracy": 0.8573073288425803,
"num_tokens": 180923505.0,
"step": 421
},
{
"entropy": 0.40521240234375,
"epoch": 1.6549019607843136,
"grad_norm": 0.6966400451319761,
"learning_rate": 9.178746086380274e-06,
"loss": 0.4021,
"mean_token_accuracy": 0.8583230208605528,
"num_tokens": 181351214.0,
"step": 422
},
{
"entropy": 0.41058349609375,
"epoch": 1.6588235294117646,
"grad_norm": 0.6698243839063328,
"learning_rate": 9.135627458681116e-06,
"loss": 0.3967,
"mean_token_accuracy": 0.8596462178975344,
"num_tokens": 181785481.0,
"step": 423
},
{
"entropy": 0.40679931640625,
"epoch": 1.6627450980392156,
"grad_norm": 0.6475108302803604,
"learning_rate": 9.092525016506858e-06,
"loss": 0.3886,
"mean_token_accuracy": 0.8637061798945069,
"num_tokens": 182219067.0,
"step": 424
},
{
"entropy": 0.41473388671875,
"epoch": 1.6666666666666665,
"grad_norm": 0.6675953966752504,
"learning_rate": 9.049439566958176e-06,
"loss": 0.4046,
"mean_token_accuracy": 0.8553493404760957,
"num_tokens": 182636470.0,
"step": 425
},
{
"entropy": 0.40753173828125,
"epoch": 1.6705882352941175,
"grad_norm": 0.6619441579391007,
"learning_rate": 9.006371916817533e-06,
"loss": 0.3942,
"mean_token_accuracy": 0.860662579536438,
"num_tokens": 183060041.0,
"step": 426
},
{
"entropy": 0.408294677734375,
"epoch": 1.6745098039215687,
"grad_norm": 0.6445359135710907,
"learning_rate": 8.963322872534115e-06,
"loss": 0.4007,
"mean_token_accuracy": 0.8577142441645265,
"num_tokens": 183475870.0,
"step": 427
},
{
"entropy": 0.409027099609375,
"epoch": 1.6784313725490196,
"grad_norm": 0.6979215249121394,
"learning_rate": 8.920293240208694e-06,
"loss": 0.4005,
"mean_token_accuracy": 0.8606764739379287,
"num_tokens": 183910388.0,
"step": 428
},
{
"entropy": 0.415252685546875,
"epoch": 1.6823529411764706,
"grad_norm": 0.6899546001964146,
"learning_rate": 8.877283825578554e-06,
"loss": 0.3991,
"mean_token_accuracy": 0.8583256499841809,
"num_tokens": 184319219.0,
"step": 429
},
{
"entropy": 0.40948486328125,
"epoch": 1.6862745098039216,
"grad_norm": 0.6302580019373215,
"learning_rate": 8.83429543400241e-06,
"loss": 0.4009,
"mean_token_accuracy": 0.8594886185601354,
"num_tokens": 184751471.0,
"step": 430
},
{
"entropy": 0.412750244140625,
"epoch": 1.6901960784313725,
"grad_norm": 0.6911336842606688,
"learning_rate": 8.791328870445302e-06,
"loss": 0.399,
"mean_token_accuracy": 0.8586211362853646,
"num_tokens": 185166878.0,
"step": 431
},
{
"entropy": 0.407196044921875,
"epoch": 1.6941176470588235,
"grad_norm": 0.6623519559272918,
"learning_rate": 8.748384939463543e-06,
"loss": 0.401,
"mean_token_accuracy": 0.8593526994809508,
"num_tokens": 185606889.0,
"step": 432
},
{
"entropy": 0.401275634765625,
"epoch": 1.6980392156862745,
"grad_norm": 0.6115072966136919,
"learning_rate": 8.705464445189646e-06,
"loss": 0.3945,
"mean_token_accuracy": 0.8617425132542849,
"num_tokens": 186057141.0,
"step": 433
},
{
"entropy": 0.406341552734375,
"epoch": 1.7019607843137254,
"grad_norm": 0.6563129181567589,
"learning_rate": 8.662568191317273e-06,
"loss": 0.4008,
"mean_token_accuracy": 0.8593833548948169,
"num_tokens": 186496413.0,
"step": 434
},
{
"entropy": 0.4078369140625,
"epoch": 1.7058823529411766,
"grad_norm": 0.6384184762621999,
"learning_rate": 8.619696981086173e-06,
"loss": 0.3878,
"mean_token_accuracy": 0.8609198983758688,
"num_tokens": 186908586.0,
"step": 435
},
{
"entropy": 0.407470703125,
"epoch": 1.7098039215686276,
"grad_norm": 0.6456952863053911,
"learning_rate": 8.576851617267151e-06,
"loss": 0.4054,
"mean_token_accuracy": 0.8583989115431905,
"num_tokens": 187344594.0,
"step": 436
},
{
"entropy": 0.415130615234375,
"epoch": 1.7137254901960786,
"grad_norm": 0.6051085180740002,
"learning_rate": 8.53403290214703e-06,
"loss": 0.3887,
"mean_token_accuracy": 0.8639670200645924,
"num_tokens": 187761423.0,
"step": 437
},
{
"entropy": 0.407196044921875,
"epoch": 1.7176470588235295,
"grad_norm": 0.6231489555374776,
"learning_rate": 8.491241637513644e-06,
"loss": 0.3909,
"mean_token_accuracy": 0.8602004619315267,
"num_tokens": 188191066.0,
"step": 438
},
{
"entropy": 0.407196044921875,
"epoch": 1.7215686274509805,
"grad_norm": 0.6793096084424919,
"learning_rate": 8.448478624640798e-06,
"loss": 0.4076,
"mean_token_accuracy": 0.8562532840296626,
"num_tokens": 188644360.0,
"step": 439
},
{
"entropy": 0.40386962890625,
"epoch": 1.7254901960784315,
"grad_norm": 0.6460668298473338,
"learning_rate": 8.405744664273278e-06,
"loss": 0.3988,
"mean_token_accuracy": 0.8608309207484126,
"num_tokens": 189102349.0,
"step": 440
},
{
"entropy": 0.41302490234375,
"epoch": 1.7294117647058824,
"grad_norm": 0.648333243521565,
"learning_rate": 8.363040556611872e-06,
"loss": 0.3979,
"mean_token_accuracy": 0.8595832930877805,
"num_tokens": 189532865.0,
"step": 441
},
{
"entropy": 0.407989501953125,
"epoch": 1.7333333333333334,
"grad_norm": 0.6671747464688874,
"learning_rate": 8.320367101298351e-06,
"loss": 0.3986,
"mean_token_accuracy": 0.8617238756269217,
"num_tokens": 189973506.0,
"step": 442
},
{
"entropy": 0.404937744140625,
"epoch": 1.7372549019607844,
"grad_norm": 0.6590397435344738,
"learning_rate": 8.277725097400536e-06,
"loss": 0.397,
"mean_token_accuracy": 0.8606142830103636,
"num_tokens": 190421714.0,
"step": 443
},
{
"entropy": 0.406005859375,
"epoch": 1.7411764705882353,
"grad_norm": 0.6530611449981759,
"learning_rate": 8.235115343397295e-06,
"loss": 0.4075,
"mean_token_accuracy": 0.85643027164042,
"num_tokens": 190862770.0,
"step": 444
},
{
"entropy": 0.4041748046875,
"epoch": 1.7450980392156863,
"grad_norm": 0.7027101958624039,
"learning_rate": 8.19253863716362e-06,
"loss": 0.3914,
"mean_token_accuracy": 0.8620561547577381,
"num_tokens": 191304126.0,
"step": 445
},
{
"entropy": 0.41015625,
"epoch": 1.7490196078431373,
"grad_norm": 0.6786856809017515,
"learning_rate": 8.149995775955686e-06,
"loss": 0.3917,
"mean_token_accuracy": 0.8625259781256318,
"num_tokens": 191719523.0,
"step": 446
},
{
"entropy": 0.40704345703125,
"epoch": 1.7529411764705882,
"grad_norm": 0.6615376726535924,
"learning_rate": 8.107487556395902e-06,
"loss": 0.3913,
"mean_token_accuracy": 0.8633209681138396,
"num_tokens": 192142207.0,
"step": 447
},
{
"entropy": 0.409759521484375,
"epoch": 1.7568627450980392,
"grad_norm": 0.6878125438153613,
"learning_rate": 8.065014774458004e-06,
"loss": 0.4007,
"mean_token_accuracy": 0.8581640059128404,
"num_tokens": 192579162.0,
"step": 448
},
{
"entropy": 0.416351318359375,
"epoch": 1.7607843137254902,
"grad_norm": 0.6538402222716845,
"learning_rate": 8.02257822545217e-06,
"loss": 0.3905,
"mean_token_accuracy": 0.8591163596138358,
"num_tokens": 192987107.0,
"step": 449
},
{
"entropy": 0.41094970703125,
"epoch": 1.7647058823529411,
"grad_norm": 0.6971665818355606,
"learning_rate": 7.980178704010089e-06,
"loss": 0.4006,
"mean_token_accuracy": 0.8593992488458753,
"num_tokens": 193421020.0,
"step": 450
},
{
"entropy": 0.409881591796875,
"epoch": 1.768627450980392,
"grad_norm": 0.6409197670335629,
"learning_rate": 7.93781700407012e-06,
"loss": 0.3897,
"mean_token_accuracy": 0.8616615459322929,
"num_tokens": 193846467.0,
"step": 451
},
{
"entropy": 0.41131591796875,
"epoch": 1.772549019607843,
"grad_norm": 0.6931163174376977,
"learning_rate": 7.895493918862395e-06,
"loss": 0.4036,
"mean_token_accuracy": 0.8571218065917492,
"num_tokens": 194287774.0,
"step": 452
},
{
"entropy": 0.403717041015625,
"epoch": 1.776470588235294,
"grad_norm": 0.66666924829856,
"learning_rate": 7.853210240893985e-06,
"loss": 0.3905,
"mean_token_accuracy": 0.8637211918830872,
"num_tokens": 194720263.0,
"step": 453
},
{
"entropy": 0.41131591796875,
"epoch": 1.780392156862745,
"grad_norm": 0.6204235116772743,
"learning_rate": 7.810966761934053e-06,
"loss": 0.3921,
"mean_token_accuracy": 0.8618059307336807,
"num_tokens": 195146105.0,
"step": 454
},
{
"entropy": 0.4031982421875,
"epoch": 1.784313725490196,
"grad_norm": 0.6342484649096467,
"learning_rate": 7.76876427299903e-06,
"loss": 0.4121,
"mean_token_accuracy": 0.8566878782585263,
"num_tokens": 195584542.0,
"step": 455
},
{
"entropy": 0.406951904296875,
"epoch": 1.788235294117647,
"grad_norm": 0.6328100517917241,
"learning_rate": 7.726603564337791e-06,
"loss": 0.386,
"mean_token_accuracy": 0.8630632497370243,
"num_tokens": 195999765.0,
"step": 456
},
{
"entropy": 0.406463623046875,
"epoch": 1.792156862745098,
"grad_norm": 0.6652068952143365,
"learning_rate": 7.684485425416888e-06,
"loss": 0.3955,
"mean_token_accuracy": 0.8625632170587778,
"num_tokens": 196418621.0,
"step": 457
},
{
"entropy": 0.406951904296875,
"epoch": 1.7960784313725489,
"grad_norm": 0.6326591145779984,
"learning_rate": 7.642410644905726e-06,
"loss": 0.3961,
"mean_token_accuracy": 0.860276403836906,
"num_tokens": 196847355.0,
"step": 458
},
{
"entropy": 0.41046142578125,
"epoch": 1.8,
"grad_norm": 0.668308364754529,
"learning_rate": 7.600380010661836e-06,
"loss": 0.3885,
"mean_token_accuracy": 0.8643168518319726,
"num_tokens": 197280641.0,
"step": 459
},
{
"entropy": 0.40557861328125,
"epoch": 1.803921568627451,
"grad_norm": 0.6138313731892276,
"learning_rate": 7.558394309716088e-06,
"loss": 0.3821,
"mean_token_accuracy": 0.86452910117805,
"num_tokens": 197727391.0,
"step": 460
},
{
"entropy": 0.39898681640625,
"epoch": 1.807843137254902,
"grad_norm": 0.6364964632727911,
"learning_rate": 7.516454328257969e-06,
"loss": 0.3952,
"mean_token_accuracy": 0.8590056737884879,
"num_tokens": 198148365.0,
"step": 461
},
{
"entropy": 0.399169921875,
"epoch": 1.811764705882353,
"grad_norm": 0.681759781228122,
"learning_rate": 7.474560851620873e-06,
"loss": 0.4151,
"mean_token_accuracy": 0.8547424823045731,
"num_tokens": 198592222.0,
"step": 462
},
{
"entropy": 0.391326904296875,
"epoch": 1.815686274509804,
"grad_norm": 0.6596255218589232,
"learning_rate": 7.432714664267373e-06,
"loss": 0.3859,
"mean_token_accuracy": 0.8629283830523491,
"num_tokens": 199059847.0,
"step": 463
},
{
"entropy": 0.40460205078125,
"epoch": 1.8196078431372549,
"grad_norm": 0.6799616657323965,
"learning_rate": 7.390916549774536e-06,
"loss": 0.399,
"mean_token_accuracy": 0.8610562225803733,
"num_tokens": 199490320.0,
"step": 464
},
{
"entropy": 0.4024658203125,
"epoch": 1.8235294117647058,
"grad_norm": 0.6357942300751293,
"learning_rate": 7.349167290819274e-06,
"loss": 0.3884,
"mean_token_accuracy": 0.8627661904320121,
"num_tokens": 199926836.0,
"step": 465
},
{
"entropy": 0.40557861328125,
"epoch": 1.8274509803921568,
"grad_norm": 0.6212601820985758,
"learning_rate": 7.307467669163655e-06,
"loss": 0.3806,
"mean_token_accuracy": 0.8631542297080159,
"num_tokens": 200351704.0,
"step": 466
},
{
"entropy": 0.40411376953125,
"epoch": 1.831372549019608,
"grad_norm": 0.6644528066997022,
"learning_rate": 7.265818465640292e-06,
"loss": 0.3898,
"mean_token_accuracy": 0.8605430433526635,
"num_tokens": 200779543.0,
"step": 467
},
{
"entropy": 0.40142822265625,
"epoch": 1.835294117647059,
"grad_norm": 0.6192510699914716,
"learning_rate": 7.224220460137701e-06,
"loss": 0.383,
"mean_token_accuracy": 0.8634469164535403,
"num_tokens": 201219423.0,
"step": 468
},
{
"entropy": 0.411956787109375,
"epoch": 1.83921568627451,
"grad_norm": 0.6758910902237301,
"learning_rate": 7.182674431585703e-06,
"loss": 0.3851,
"mean_token_accuracy": 0.8633188679814339,
"num_tokens": 201629124.0,
"step": 469
},
{
"entropy": 0.3983154296875,
"epoch": 1.843137254901961,
"grad_norm": 0.6240002104477036,
"learning_rate": 7.141181157940859e-06,
"loss": 0.3849,
"mean_token_accuracy": 0.8628418175503612,
"num_tokens": 202050007.0,
"step": 470
},
{
"entropy": 0.39898681640625,
"epoch": 1.8470588235294119,
"grad_norm": 0.6329914255858695,
"learning_rate": 7.099741416171866e-06,
"loss": 0.3853,
"mean_token_accuracy": 0.8619992816820741,
"num_tokens": 202471383.0,
"step": 471
},
{
"entropy": 0.398651123046875,
"epoch": 1.8509803921568628,
"grad_norm": 0.6397271420327242,
"learning_rate": 7.058355982245038e-06,
"loss": 0.394,
"mean_token_accuracy": 0.8587851086631417,
"num_tokens": 202916687.0,
"step": 472
},
{
"entropy": 0.39678955078125,
"epoch": 1.8549019607843138,
"grad_norm": 0.6245793958337541,
"learning_rate": 7.017025631109762e-06,
"loss": 0.3806,
"mean_token_accuracy": 0.8651279462501407,
"num_tokens": 203367706.0,
"step": 473
},
{
"entropy": 0.402496337890625,
"epoch": 1.8588235294117648,
"grad_norm": 0.6480244793330756,
"learning_rate": 6.97575113668399e-06,
"loss": 0.3798,
"mean_token_accuracy": 0.8655670257285237,
"num_tokens": 203794061.0,
"step": 474
},
{
"entropy": 0.396575927734375,
"epoch": 1.8627450980392157,
"grad_norm": 0.668132844077502,
"learning_rate": 6.934533271839751e-06,
"loss": 0.3853,
"mean_token_accuracy": 0.8610855452716351,
"num_tokens": 204218730.0,
"step": 475
},
{
"entropy": 0.39825439453125,
"epoch": 1.8666666666666667,
"grad_norm": 0.6162152767479371,
"learning_rate": 6.893372808388674e-06,
"loss": 0.3959,
"mean_token_accuracy": 0.8632792960852385,
"num_tokens": 204665453.0,
"step": 476
},
{
"entropy": 0.404296875,
"epoch": 1.8705882352941177,
"grad_norm": 0.638832679338457,
"learning_rate": 6.852270517067527e-06,
"loss": 0.3942,
"mean_token_accuracy": 0.8628870220854878,
"num_tokens": 205090169.0,
"step": 477
},
{
"entropy": 0.396942138671875,
"epoch": 1.8745098039215686,
"grad_norm": 0.6191281527349773,
"learning_rate": 6.8112271675238154e-06,
"loss": 0.3947,
"mean_token_accuracy": 0.8603170970454812,
"num_tokens": 205566713.0,
"step": 478
},
{
"entropy": 0.398956298828125,
"epoch": 1.8784313725490196,
"grad_norm": 0.6341770253300271,
"learning_rate": 6.7702435283013315e-06,
"loss": 0.3799,
"mean_token_accuracy": 0.8677375428378582,
"num_tokens": 205992968.0,
"step": 479
},
{
"entropy": 0.400604248046875,
"epoch": 1.8823529411764706,
"grad_norm": 0.6146178497897123,
"learning_rate": 6.729320366825785e-06,
"loss": 0.3793,
"mean_token_accuracy": 0.8669349849224091,
"num_tokens": 206415484.0,
"step": 480
},
{
"entropy": 0.398651123046875,
"epoch": 1.8862745098039215,
"grad_norm": 0.5904407168244195,
"learning_rate": 6.688458449390438e-06,
"loss": 0.3772,
"mean_token_accuracy": 0.8662282424047589,
"num_tokens": 206838020.0,
"step": 481
},
{
"entropy": 0.403839111328125,
"epoch": 1.8901960784313725,
"grad_norm": 0.6138200970304336,
"learning_rate": 6.647658541141735e-06,
"loss": 0.3865,
"mean_token_accuracy": 0.863483252003789,
"num_tokens": 207262730.0,
"step": 482
},
{
"entropy": 0.39947509765625,
"epoch": 1.8941176470588235,
"grad_norm": 0.6411707572669829,
"learning_rate": 6.606921406065003e-06,
"loss": 0.393,
"mean_token_accuracy": 0.8628878751769662,
"num_tokens": 207681374.0,
"step": 483
},
{
"entropy": 0.399810791015625,
"epoch": 1.8980392156862744,
"grad_norm": 0.6242158740652315,
"learning_rate": 6.566247806970119e-06,
"loss": 0.3815,
"mean_token_accuracy": 0.8648646343499422,
"num_tokens": 208111486.0,
"step": 484
},
{
"entropy": 0.4017333984375,
"epoch": 1.9019607843137254,
"grad_norm": 0.6214585655042043,
"learning_rate": 6.525638505477232e-06,
"loss": 0.3961,
"mean_token_accuracy": 0.8618203224614263,
"num_tokens": 208524640.0,
"step": 485
},
{
"entropy": 0.406768798828125,
"epoch": 1.9058823529411764,
"grad_norm": 0.675756751207708,
"learning_rate": 6.485094262002529e-06,
"loss": 0.3834,
"mean_token_accuracy": 0.8650505719706416,
"num_tokens": 208943322.0,
"step": 486
},
{
"entropy": 0.40185546875,
"epoch": 1.9098039215686273,
"grad_norm": 0.6339598234809348,
"learning_rate": 6.444615835743955e-06,
"loss": 0.378,
"mean_token_accuracy": 0.8643421633169055,
"num_tokens": 209372138.0,
"step": 487
},
{
"entropy": 0.399261474609375,
"epoch": 1.9137254901960783,
"grad_norm": 0.6669096574204565,
"learning_rate": 6.404203984667019e-06,
"loss": 0.3937,
"mean_token_accuracy": 0.8616707855835557,
"num_tokens": 209800419.0,
"step": 488
},
{
"entropy": 0.400360107421875,
"epoch": 1.9176470588235293,
"grad_norm": 0.6298220613709057,
"learning_rate": 6.363859465490609e-06,
"loss": 0.3677,
"mean_token_accuracy": 0.8683587471023202,
"num_tokens": 210215161.0,
"step": 489
},
{
"entropy": 0.404510498046875,
"epoch": 1.9215686274509802,
"grad_norm": 0.6584599935418873,
"learning_rate": 6.323583033672799e-06,
"loss": 0.393,
"mean_token_accuracy": 0.8635195046663284,
"num_tokens": 210635827.0,
"step": 490
},
{
"entropy": 0.405609130859375,
"epoch": 1.9254901960784314,
"grad_norm": 0.647939059725834,
"learning_rate": 6.283375443396726e-06,
"loss": 0.3833,
"mean_token_accuracy": 0.8655750313773751,
"num_tokens": 211041992.0,
"step": 491
},
{
"entropy": 0.397003173828125,
"epoch": 1.9294117647058824,
"grad_norm": 0.6137155057965655,
"learning_rate": 6.24323744755645e-06,
"loss": 0.3811,
"mean_token_accuracy": 0.8649553088471293,
"num_tokens": 211477066.0,
"step": 492
},
{
"entropy": 0.399658203125,
"epoch": 1.9333333333333333,
"grad_norm": 0.651612311993538,
"learning_rate": 6.203169797742862e-06,
"loss": 0.3901,
"mean_token_accuracy": 0.8612717455253005,
"num_tokens": 211902746.0,
"step": 493
},
{
"entropy": 0.4049072265625,
"epoch": 1.9372549019607843,
"grad_norm": 0.6550364986232016,
"learning_rate": 6.163173244229618e-06,
"loss": 0.3784,
"mean_token_accuracy": 0.8649454573169351,
"num_tokens": 212330551.0,
"step": 494
},
{
"entropy": 0.4041748046875,
"epoch": 1.9411764705882353,
"grad_norm": 0.6257234356087099,
"learning_rate": 6.123248535959083e-06,
"loss": 0.3883,
"mean_token_accuracy": 0.862628510221839,
"num_tokens": 212750893.0,
"step": 495
},
{
"entropy": 0.396514892578125,
"epoch": 1.9450980392156862,
"grad_norm": 0.6207760034425814,
"learning_rate": 6.083396420528298e-06,
"loss": 0.3915,
"mean_token_accuracy": 0.8619965445250273,
"num_tokens": 213191351.0,
"step": 496
},
{
"entropy": 0.4000244140625,
"epoch": 1.9490196078431372,
"grad_norm": 0.6435317991184669,
"learning_rate": 6.043617644175005e-06,
"loss": 0.3843,
"mean_token_accuracy": 0.8652150267735124,
"num_tokens": 213609485.0,
"step": 497
},
{
"entropy": 0.4049072265625,
"epoch": 1.9529411764705882,
"grad_norm": 0.6445614380022352,
"learning_rate": 6.003912951763644e-06,
"loss": 0.3818,
"mean_token_accuracy": 0.8633128497749567,
"num_tokens": 214021808.0,
"step": 498
},
{
"entropy": 0.40032958984375,
"epoch": 1.9568627450980394,
"grad_norm": 0.6188612791246716,
"learning_rate": 5.964283086771435e-06,
"loss": 0.3763,
"mean_token_accuracy": 0.8673922391608357,
"num_tokens": 214467620.0,
"step": 499
},
{
"entropy": 0.404510498046875,
"epoch": 1.9607843137254903,
"grad_norm": 0.6210830962778174,
"learning_rate": 5.924728791274432e-06,
"loss": 0.3928,
"mean_token_accuracy": 0.8633805690333247,
"num_tokens": 214896426.0,
"step": 500
},
{
"entropy": 0.401397705078125,
"epoch": 1.9647058823529413,
"grad_norm": 0.6310710096571402,
"learning_rate": 5.885250805933636e-06,
"loss": 0.3728,
"mean_token_accuracy": 0.8687439002096653,
"num_tokens": 215328235.0,
"step": 501
},
{
"entropy": 0.399017333984375,
"epoch": 1.9686274509803923,
"grad_norm": 0.6300326675580238,
"learning_rate": 5.845849869981137e-06,
"loss": 0.3742,
"mean_token_accuracy": 0.8672775719314814,
"num_tokens": 215762622.0,
"step": 502
},
{
"entropy": 0.393951416015625,
"epoch": 1.9725490196078432,
"grad_norm": 0.6216931663392556,
"learning_rate": 5.806526721206252e-06,
"loss": 0.3705,
"mean_token_accuracy": 0.8691313751041889,
"num_tokens": 216192091.0,
"step": 503
},
{
"entropy": 0.401214599609375,
"epoch": 1.9764705882352942,
"grad_norm": 0.6530296855077998,
"learning_rate": 5.767282095941725e-06,
"loss": 0.4023,
"mean_token_accuracy": 0.8586861994117498,
"num_tokens": 216628887.0,
"step": 504
},
{
"entropy": 0.403411865234375,
"epoch": 1.9803921568627452,
"grad_norm": 0.6208031961076891,
"learning_rate": 5.728116729049929e-06,
"loss": 0.3675,
"mean_token_accuracy": 0.8702347576618195,
"num_tokens": 217039678.0,
"step": 505
},
{
"entropy": 0.397613525390625,
"epoch": 1.9843137254901961,
"grad_norm": 0.626565179565779,
"learning_rate": 5.68903135390912e-06,
"loss": 0.3779,
"mean_token_accuracy": 0.8659409172832966,
"num_tokens": 217486630.0,
"step": 506
},
{
"entropy": 0.402008056640625,
"epoch": 1.988235294117647,
"grad_norm": 0.6381555753272591,
"learning_rate": 5.65002670239968e-06,
"loss": 0.3852,
"mean_token_accuracy": 0.8616180000826716,
"num_tokens": 217923066.0,
"step": 507
},
{
"entropy": 0.3978271484375,
"epoch": 1.992156862745098,
"grad_norm": 0.6111355854752799,
"learning_rate": 5.611103504890444e-06,
"loss": 0.3864,
"mean_token_accuracy": 0.8644295651465654,
"num_tokens": 218363543.0,
"step": 508
},
{
"entropy": 0.401824951171875,
"epoch": 1.996078431372549,
"grad_norm": 0.6279421120839183,
"learning_rate": 5.57226249022499e-06,
"loss": 0.3733,
"mean_token_accuracy": 0.8667671736329794,
"num_tokens": 218782677.0,
"step": 509
},
{
"entropy": 0.401641845703125,
"epoch": 2.0,
"grad_norm": 0.6235365780243987,
"learning_rate": 5.533504385708024e-06,
"loss": 0.3707,
"mean_token_accuracy": 0.8663612883538008,
"num_tokens": 219186566.0,
"step": 510
},
{
"entropy": 0.396575927734375,
"epoch": 2.003921568627451,
"grad_norm": 0.6850387994534668,
"learning_rate": 5.494829917091733e-06,
"loss": 0.3429,
"mean_token_accuracy": 0.8770228121429682,
"num_tokens": 219628415.0,
"step": 511
},
{
"entropy": 0.401519775390625,
"epoch": 2.007843137254902,
"grad_norm": 0.6382337497649052,
"learning_rate": 5.45623980856221e-06,
"loss": 0.332,
"mean_token_accuracy": 0.8810373740270734,
"num_tokens": 220042043.0,
"step": 512
},
{
"entropy": 0.393768310546875,
"epoch": 2.011764705882353,
"grad_norm": 0.671827878794182,
"learning_rate": 5.417734782725896e-06,
"loss": 0.3253,
"mean_token_accuracy": 0.8824055539444089,
"num_tokens": 220458080.0,
"step": 513
},
{
"entropy": 0.384246826171875,
"epoch": 2.015686274509804,
"grad_norm": 0.7263272169698258,
"learning_rate": 5.379315560596038e-06,
"loss": 0.3357,
"mean_token_accuracy": 0.879870074801147,
"num_tokens": 220905634.0,
"step": 514
},
{
"entropy": 0.385040283203125,
"epoch": 2.019607843137255,
"grad_norm": 0.7920113629895514,
"learning_rate": 5.340982861579199e-06,
"loss": 0.3454,
"mean_token_accuracy": 0.8794268425554037,
"num_tokens": 221352570.0,
"step": 515
},
{
"entropy": 0.39019775390625,
"epoch": 2.023529411764706,
"grad_norm": 0.641834875539887,
"learning_rate": 5.302737403461778e-06,
"loss": 0.3259,
"mean_token_accuracy": 0.8824308048933744,
"num_tokens": 221787156.0,
"step": 516
},
{
"entropy": 0.401153564453125,
"epoch": 2.0274509803921568,
"grad_norm": 0.6849334728948456,
"learning_rate": 5.26457990239657e-06,
"loss": 0.3342,
"mean_token_accuracy": 0.8797427834942937,
"num_tokens": 222201542.0,
"step": 517
},
{
"entropy": 0.39581298828125,
"epoch": 2.0313725490196077,
"grad_norm": 0.7111898891704417,
"learning_rate": 5.226511072889371e-06,
"loss": 0.3313,
"mean_token_accuracy": 0.8803043775260448,
"num_tokens": 222623123.0,
"step": 518
},
{
"entropy": 0.3948974609375,
"epoch": 2.0352941176470587,
"grad_norm": 0.6505100364503669,
"learning_rate": 5.188531627785573e-06,
"loss": 0.3475,
"mean_token_accuracy": 0.8759458484128118,
"num_tokens": 223044097.0,
"step": 519
},
{
"entropy": 0.388336181640625,
"epoch": 2.0392156862745097,
"grad_norm": 0.6256506799827058,
"learning_rate": 5.1506422782568345e-06,
"loss": 0.3429,
"mean_token_accuracy": 0.8781215418130159,
"num_tokens": 223501353.0,
"step": 520
},
{
"entropy": 0.387054443359375,
"epoch": 2.0431372549019606,
"grad_norm": 0.6572960727423126,
"learning_rate": 5.112843733787765e-06,
"loss": 0.3399,
"mean_token_accuracy": 0.8787580663338304,
"num_tokens": 223935868.0,
"step": 521
},
{
"entropy": 0.389312744140625,
"epoch": 2.0470588235294116,
"grad_norm": 0.6372996031263783,
"learning_rate": 5.075136702162622e-06,
"loss": 0.3322,
"mean_token_accuracy": 0.8786961110308766,
"num_tokens": 224355787.0,
"step": 522
},
{
"entropy": 0.380645751953125,
"epoch": 2.0509803921568626,
"grad_norm": 0.6985340105840396,
"learning_rate": 5.037521889452084e-06,
"loss": 0.3299,
"mean_token_accuracy": 0.8814136106520891,
"num_tokens": 224806688.0,
"step": 523
},
{
"entropy": 0.387451171875,
"epoch": 2.0549019607843135,
"grad_norm": 0.6713865414340499,
"learning_rate": 5.000000000000003e-06,
"loss": 0.3529,
"mean_token_accuracy": 0.8738525109365582,
"num_tokens": 225233035.0,
"step": 524
},
{
"entropy": 0.39373779296875,
"epoch": 2.0588235294117645,
"grad_norm": 0.6027577042393886,
"learning_rate": 4.962571736410224e-06,
"loss": 0.3208,
"mean_token_accuracy": 0.8810843704268336,
"num_tokens": 225650532.0,
"step": 525
},
{
"entropy": 0.390838623046875,
"epoch": 2.0627450980392155,
"grad_norm": 0.6529329845320468,
"learning_rate": 4.925237799533445e-06,
"loss": 0.3371,
"mean_token_accuracy": 0.8787398906424642,
"num_tokens": 226080538.0,
"step": 526
},
{
"entropy": 0.388580322265625,
"epoch": 2.066666666666667,
"grad_norm": 0.6777393301419915,
"learning_rate": 4.8879988884540705e-06,
"loss": 0.322,
"mean_token_accuracy": 0.8831636533141136,
"num_tokens": 226508253.0,
"step": 527
},
{
"entropy": 0.390777587890625,
"epoch": 2.070588235294118,
"grad_norm": 0.6437515132444687,
"learning_rate": 4.85085570047713e-06,
"loss": 0.3352,
"mean_token_accuracy": 0.8790671909227967,
"num_tokens": 226944784.0,
"step": 528
},
{
"entropy": 0.385345458984375,
"epoch": 2.074509803921569,
"grad_norm": 0.6602168660987684,
"learning_rate": 4.813808931115228e-06,
"loss": 0.3319,
"mean_token_accuracy": 0.8783455807715654,
"num_tokens": 227377942.0,
"step": 529
},
{
"entropy": 0.382965087890625,
"epoch": 2.0784313725490198,
"grad_norm": 0.6089184508280152,
"learning_rate": 4.776859274075506e-06,
"loss": 0.3182,
"mean_token_accuracy": 0.8840960031375289,
"num_tokens": 227811996.0,
"step": 530
},
{
"entropy": 0.385040283203125,
"epoch": 2.0823529411764707,
"grad_norm": 0.8297306434177946,
"learning_rate": 4.7400074212466705e-06,
"loss": 0.3404,
"mean_token_accuracy": 0.8791711116209626,
"num_tokens": 228240221.0,
"step": 531
},
{
"entropy": 0.38531494140625,
"epoch": 2.0862745098039217,
"grad_norm": 0.6227790902239935,
"learning_rate": 4.703254062686017e-06,
"loss": 0.3299,
"mean_token_accuracy": 0.8817263282835484,
"num_tokens": 228694396.0,
"step": 532
},
{
"entropy": 0.3951416015625,
"epoch": 2.0901960784313727,
"grad_norm": 0.6726552929084186,
"learning_rate": 4.666599886606521e-06,
"loss": 0.321,
"mean_token_accuracy": 0.8835817389190197,
"num_tokens": 229089982.0,
"step": 533
},
{
"entropy": 0.3868408203125,
"epoch": 2.0941176470588236,
"grad_norm": 0.6460262137645605,
"learning_rate": 4.6300455793639565e-06,
"loss": 0.3343,
"mean_token_accuracy": 0.8793362881988287,
"num_tokens": 229517812.0,
"step": 534
},
{
"entropy": 0.386138916015625,
"epoch": 2.0980392156862746,
"grad_norm": 0.6713226114014762,
"learning_rate": 4.593591825444028e-06,
"loss": 0.3238,
"mean_token_accuracy": 0.8822187837213278,
"num_tokens": 229966675.0,
"step": 535
},
{
"entropy": 0.394989013671875,
"epoch": 2.1019607843137256,
"grad_norm": 0.6444115484817178,
"learning_rate": 4.557239307449562e-06,
"loss": 0.3176,
"mean_token_accuracy": 0.883355819620192,
"num_tokens": 230370130.0,
"step": 536
},
{
"entropy": 0.383544921875,
"epoch": 2.1058823529411765,
"grad_norm": 0.623684910415606,
"learning_rate": 4.520988706087731e-06,
"loss": 0.3188,
"mean_token_accuracy": 0.8842975506559014,
"num_tokens": 230798042.0,
"step": 537
},
{
"entropy": 0.3836669921875,
"epoch": 2.1098039215686275,
"grad_norm": 0.6733188037497384,
"learning_rate": 4.4848407001572945e-06,
"loss": 0.3383,
"mean_token_accuracy": 0.8791207261383533,
"num_tokens": 231227119.0,
"step": 538
},
{
"entropy": 0.3836669921875,
"epoch": 2.1137254901960785,
"grad_norm": 0.6527809232837684,
"learning_rate": 4.448795966535903e-06,
"loss": 0.3307,
"mean_token_accuracy": 0.8810558579862118,
"num_tokens": 231653005.0,
"step": 539
},
{
"entropy": 0.38861083984375,
"epoch": 2.1176470588235294,
"grad_norm": 0.6241280769218844,
"learning_rate": 4.412855180167406e-06,
"loss": 0.3401,
"mean_token_accuracy": 0.8785578245297074,
"num_tokens": 232083947.0,
"step": 540
},
{
"entropy": 0.38385009765625,
"epoch": 2.1215686274509804,
"grad_norm": 0.7053422092034839,
"learning_rate": 4.377019014049223e-06,
"loss": 0.3368,
"mean_token_accuracy": 0.8770233364775777,
"num_tokens": 232534306.0,
"step": 541
},
{
"entropy": 0.3924560546875,
"epoch": 2.1254901960784314,
"grad_norm": 0.646525042345649,
"learning_rate": 4.341288139219752e-06,
"loss": 0.3259,
"mean_token_accuracy": 0.8811231376603246,
"num_tokens": 232956999.0,
"step": 542
},
{
"entropy": 0.38153076171875,
"epoch": 2.1294117647058823,
"grad_norm": 0.6353328108356256,
"learning_rate": 4.30566322474578e-06,
"loss": 0.3261,
"mean_token_accuracy": 0.8826108202338219,
"num_tokens": 233389852.0,
"step": 543
},
{
"entropy": 0.385589599609375,
"epoch": 2.1333333333333333,
"grad_norm": 0.6564549054251345,
"learning_rate": 4.270144937709981e-06,
"loss": 0.3289,
"mean_token_accuracy": 0.883696929551661,
"num_tokens": 233811670.0,
"step": 544
},
{
"entropy": 0.385406494140625,
"epoch": 2.1372549019607843,
"grad_norm": 0.6841239113938077,
"learning_rate": 4.234733943198399e-06,
"loss": 0.3443,
"mean_token_accuracy": 0.8776548197492957,
"num_tokens": 234250912.0,
"step": 545
},
{
"entropy": 0.390472412109375,
"epoch": 2.1411764705882352,
"grad_norm": 0.6429128731108159,
"learning_rate": 4.19943090428802e-06,
"loss": 0.3263,
"mean_token_accuracy": 0.8829544661566615,
"num_tokens": 234669494.0,
"step": 546
},
{
"entropy": 0.395050048828125,
"epoch": 2.145098039215686,
"grad_norm": 0.6627579431444133,
"learning_rate": 4.1642364820343276e-06,
"loss": 0.3309,
"mean_token_accuracy": 0.8791402000933886,
"num_tokens": 235089829.0,
"step": 547
},
{
"entropy": 0.39361572265625,
"epoch": 2.149019607843137,
"grad_norm": 0.7016968609489542,
"learning_rate": 4.1291513354589576e-06,
"loss": 0.3213,
"mean_token_accuracy": 0.8828855343163013,
"num_tokens": 235511064.0,
"step": 548
},
{
"entropy": 0.389129638671875,
"epoch": 2.152941176470588,
"grad_norm": 0.6331735006523904,
"learning_rate": 4.094176121537321e-06,
"loss": 0.3352,
"mean_token_accuracy": 0.8795699439942837,
"num_tokens": 235949743.0,
"step": 549
},
{
"entropy": 0.38787841796875,
"epoch": 2.156862745098039,
"grad_norm": 0.6437682670590288,
"learning_rate": 4.059311495186338e-06,
"loss": 0.3148,
"mean_token_accuracy": 0.8851141845807433,
"num_tokens": 236387462.0,
"step": 550
},
{
"entropy": 0.3834228515625,
"epoch": 2.16078431372549,
"grad_norm": 0.6391407058805494,
"learning_rate": 4.024558109252148e-06,
"loss": 0.3305,
"mean_token_accuracy": 0.8804508438333869,
"num_tokens": 236829823.0,
"step": 551
},
{
"entropy": 0.38372802734375,
"epoch": 2.164705882352941,
"grad_norm": 0.6724194226514026,
"learning_rate": 3.989916614497891e-06,
"loss": 0.3391,
"mean_token_accuracy": 0.8799557471647859,
"num_tokens": 237264397.0,
"step": 552
},
{
"entropy": 0.380615234375,
"epoch": 2.168627450980392,
"grad_norm": 0.6683704841557025,
"learning_rate": 3.955387659591538e-06,
"loss": 0.337,
"mean_token_accuracy": 0.8779965220019221,
"num_tokens": 237706025.0,
"step": 553
},
{
"entropy": 0.381622314453125,
"epoch": 2.172549019607843,
"grad_norm": 0.5976323682888441,
"learning_rate": 3.9209718910937174e-06,
"loss": 0.3346,
"mean_token_accuracy": 0.8799093374982476,
"num_tokens": 238132565.0,
"step": 554
},
{
"entropy": 0.38897705078125,
"epoch": 2.176470588235294,
"grad_norm": 0.7021643775294942,
"learning_rate": 3.886669953445638e-06,
"loss": 0.3314,
"mean_token_accuracy": 0.880199084058404,
"num_tokens": 238552169.0,
"step": 555
},
{
"entropy": 0.38323974609375,
"epoch": 2.180392156862745,
"grad_norm": 0.6201242758831783,
"learning_rate": 3.852482488956992e-06,
"loss": 0.3434,
"mean_token_accuracy": 0.8788248943164945,
"num_tokens": 238990692.0,
"step": 556
},
{
"entropy": 0.382843017578125,
"epoch": 2.1843137254901963,
"grad_norm": 0.677214260217929,
"learning_rate": 3.818410137793947e-06,
"loss": 0.3269,
"mean_token_accuracy": 0.8832383183762431,
"num_tokens": 239425631.0,
"step": 557
},
{
"entropy": 0.38092041015625,
"epoch": 2.1882352941176473,
"grad_norm": 0.6209875432176569,
"learning_rate": 3.784453537967161e-06,
"loss": 0.3352,
"mean_token_accuracy": 0.8799733864143491,
"num_tokens": 239882913.0,
"step": 558
},
{
"entropy": 0.38623046875,
"epoch": 2.1921568627450982,
"grad_norm": 0.6614671874297028,
"learning_rate": 3.7506133253198173e-06,
"loss": 0.3296,
"mean_token_accuracy": 0.8816101523116231,
"num_tokens": 240305968.0,
"step": 559
},
{
"entropy": 0.377227783203125,
"epoch": 2.196078431372549,
"grad_norm": 0.6899542910895241,
"learning_rate": 3.7168901335157313e-06,
"loss": 0.3332,
"mean_token_accuracy": 0.8795541236177087,
"num_tokens": 240745419.0,
"step": 560
},
{
"entropy": 0.375732421875,
"epoch": 2.2,
"grad_norm": 0.7037636770557544,
"learning_rate": 3.683284594027492e-06,
"loss": 0.3283,
"mean_token_accuracy": 0.8814961658790708,
"num_tokens": 241182851.0,
"step": 561
},
{
"entropy": 0.380126953125,
"epoch": 2.203921568627451,
"grad_norm": 0.6910588607446845,
"learning_rate": 3.6497973361246153e-06,
"loss": 0.3417,
"mean_token_accuracy": 0.8777891835197806,
"num_tokens": 241625327.0,
"step": 562
},
{
"entropy": 0.379852294921875,
"epoch": 2.207843137254902,
"grad_norm": 0.6505651822651323,
"learning_rate": 3.6164289868617884e-06,
"loss": 0.3221,
"mean_token_accuracy": 0.8816747777163982,
"num_tokens": 242060360.0,
"step": 563
},
{
"entropy": 0.3958740234375,
"epoch": 2.211764705882353,
"grad_norm": 0.650666952366499,
"learning_rate": 3.583180171067101e-06,
"loss": 0.3314,
"mean_token_accuracy": 0.8786816103383899,
"num_tokens": 242470897.0,
"step": 564
},
{
"entropy": 0.38525390625,
"epoch": 2.215686274509804,
"grad_norm": 0.6591628517737783,
"learning_rate": 3.550051511330361e-06,
"loss": 0.3229,
"mean_token_accuracy": 0.8851354466751218,
"num_tokens": 242894132.0,
"step": 565
},
{
"entropy": 0.389678955078125,
"epoch": 2.219607843137255,
"grad_norm": 0.6501659301424558,
"learning_rate": 3.517043627991441e-06,
"loss": 0.3296,
"mean_token_accuracy": 0.8841333854943514,
"num_tokens": 243315484.0,
"step": 566
},
{
"entropy": 0.381011962890625,
"epoch": 2.223529411764706,
"grad_norm": 0.6736372583136302,
"learning_rate": 3.4841571391286466e-06,
"loss": 0.3331,
"mean_token_accuracy": 0.8803358906880021,
"num_tokens": 243761682.0,
"step": 567
},
{
"entropy": 0.384857177734375,
"epoch": 2.227450980392157,
"grad_norm": 0.6385641107317546,
"learning_rate": 3.4513926605471504e-06,
"loss": 0.3232,
"mean_token_accuracy": 0.8852359773591161,
"num_tokens": 244217985.0,
"step": 568
},
{
"entropy": 0.38055419921875,
"epoch": 2.231372549019608,
"grad_norm": 0.6957847971556891,
"learning_rate": 3.418750805767469e-06,
"loss": 0.3213,
"mean_token_accuracy": 0.8834536019712687,
"num_tokens": 244651979.0,
"step": 569
},
{
"entropy": 0.37164306640625,
"epoch": 2.235294117647059,
"grad_norm": 0.6877384491865183,
"learning_rate": 3.3862321860139578e-06,
"loss": 0.3394,
"mean_token_accuracy": 0.8793074004352093,
"num_tokens": 245111895.0,
"step": 570
},
{
"entropy": 0.38189697265625,
"epoch": 2.23921568627451,
"grad_norm": 0.70793552801571,
"learning_rate": 3.3538374102033865e-06,
"loss": 0.3282,
"mean_token_accuracy": 0.883271967060864,
"num_tokens": 245523688.0,
"step": 571
},
{
"entropy": 0.378173828125,
"epoch": 2.243137254901961,
"grad_norm": 0.6522649260770007,
"learning_rate": 3.3215670849335156e-06,
"loss": 0.326,
"mean_token_accuracy": 0.8824318377301097,
"num_tokens": 245976983.0,
"step": 572
},
{
"entropy": 0.38177490234375,
"epoch": 2.2470588235294118,
"grad_norm": 0.6371798326781967,
"learning_rate": 3.2894218144717473e-06,
"loss": 0.336,
"mean_token_accuracy": 0.87877048086375,
"num_tokens": 246399132.0,
"step": 573
},
{
"entropy": 0.379364013671875,
"epoch": 2.2509803921568627,
"grad_norm": 0.6281985651614281,
"learning_rate": 3.257402200743821e-06,
"loss": 0.3314,
"mean_token_accuracy": 0.8802892221137881,
"num_tokens": 246829053.0,
"step": 574
},
{
"entropy": 0.3812255859375,
"epoch": 2.2549019607843137,
"grad_norm": 0.6429952031987357,
"learning_rate": 3.2255088433225246e-06,
"loss": 0.3348,
"mean_token_accuracy": 0.8793577533215284,
"num_tokens": 247284964.0,
"step": 575
},
{
"entropy": 0.38433837890625,
"epoch": 2.2588235294117647,
"grad_norm": 0.6591416460917164,
"learning_rate": 3.19374233941647e-06,
"loss": 0.3315,
"mean_token_accuracy": 0.8827766692265868,
"num_tokens": 247726518.0,
"step": 576
},
{
"entropy": 0.385955810546875,
"epoch": 2.2627450980392156,
"grad_norm": 0.6509477090622025,
"learning_rate": 3.1621032838589307e-06,
"loss": 0.3286,
"mean_token_accuracy": 0.8831933671608567,
"num_tokens": 248131065.0,
"step": 577
},
{
"entropy": 0.387359619140625,
"epoch": 2.2666666666666666,
"grad_norm": 0.6536748739080436,
"learning_rate": 3.1305922690966705e-06,
"loss": 0.3282,
"mean_token_accuracy": 0.8814155226573348,
"num_tokens": 248542637.0,
"step": 578
},
{
"entropy": 0.3798828125,
"epoch": 2.2705882352941176,
"grad_norm": 0.6326768923288004,
"learning_rate": 3.099209885178882e-06,
"loss": 0.3142,
"mean_token_accuracy": 0.8848884990438819,
"num_tokens": 248952209.0,
"step": 579
},
{
"entropy": 0.38568115234375,
"epoch": 2.2745098039215685,
"grad_norm": 0.6988942589342891,
"learning_rate": 3.0679567197461135e-06,
"loss": 0.3245,
"mean_token_accuracy": 0.8818568410351872,
"num_tokens": 249350069.0,
"step": 580
},
{
"entropy": 0.380950927734375,
"epoch": 2.2784313725490195,
"grad_norm": 0.6437532388899305,
"learning_rate": 3.0368333580192734e-06,
"loss": 0.3362,
"mean_token_accuracy": 0.8800135338678956,
"num_tokens": 249790943.0,
"step": 581
},
{
"entropy": 0.38519287109375,
"epoch": 2.2823529411764705,
"grad_norm": 0.6802253424414118,
"learning_rate": 3.005840382788685e-06,
"loss": 0.3285,
"mean_token_accuracy": 0.8817092962563038,
"num_tokens": 250212691.0,
"step": 582
},
{
"entropy": 0.385223388671875,
"epoch": 2.2862745098039214,
"grad_norm": 0.649252155054827,
"learning_rate": 2.974978374403147e-06,
"loss": 0.3395,
"mean_token_accuracy": 0.8796671917662024,
"num_tokens": 250645778.0,
"step": 583
},
{
"entropy": 0.385650634765625,
"epoch": 2.2901960784313724,
"grad_norm": 0.6266536100784984,
"learning_rate": 2.944247910759097e-06,
"loss": 0.3369,
"mean_token_accuracy": 0.8803050145506859,
"num_tokens": 251091472.0,
"step": 584
},
{
"entropy": 0.383941650390625,
"epoch": 2.2941176470588234,
"grad_norm": 0.6384710127584259,
"learning_rate": 2.9136495672897592e-06,
"loss": 0.3253,
"mean_token_accuracy": 0.8817528560757637,
"num_tokens": 251508403.0,
"step": 585
},
{
"entropy": 0.389373779296875,
"epoch": 2.2980392156862743,
"grad_norm": 0.6573963751809176,
"learning_rate": 2.8831839169543998e-06,
"loss": 0.3233,
"mean_token_accuracy": 0.882289039902389,
"num_tokens": 251930049.0,
"step": 586
},
{
"entropy": 0.384124755859375,
"epoch": 2.3019607843137253,
"grad_norm": 0.6252660748375973,
"learning_rate": 2.852851530227566e-06,
"loss": 0.3294,
"mean_token_accuracy": 0.8839052841067314,
"num_tokens": 252346931.0,
"step": 587
},
{
"entropy": 0.382720947265625,
"epoch": 2.3058823529411763,
"grad_norm": 0.8966787930059511,
"learning_rate": 2.8226529750884403e-06,
"loss": 0.3183,
"mean_token_accuracy": 0.8840129124000669,
"num_tokens": 252782839.0,
"step": 588
},
{
"entropy": 0.38275146484375,
"epoch": 2.3098039215686272,
"grad_norm": 0.625666212633788,
"learning_rate": 2.7925888170101667e-06,
"loss": 0.3276,
"mean_token_accuracy": 0.8817493235692382,
"num_tokens": 253210845.0,
"step": 589
},
{
"entropy": 0.384735107421875,
"epoch": 2.313725490196078,
"grad_norm": 0.6415168482799497,
"learning_rate": 2.7626596189492983e-06,
"loss": 0.3315,
"mean_token_accuracy": 0.8816859601065516,
"num_tokens": 253626459.0,
"step": 590
},
{
"entropy": 0.388214111328125,
"epoch": 2.317647058823529,
"grad_norm": 0.7267922599495401,
"learning_rate": 2.7328659413352266e-06,
"loss": 0.3155,
"mean_token_accuracy": 0.8850083062425256,
"num_tokens": 254051003.0,
"step": 591
},
{
"entropy": 0.3841552734375,
"epoch": 2.3215686274509806,
"grad_norm": 0.6591517111208269,
"learning_rate": 2.7032083420597e-06,
"loss": 0.3154,
"mean_token_accuracy": 0.8871585316956043,
"num_tokens": 254456077.0,
"step": 592
},
{
"entropy": 0.377532958984375,
"epoch": 2.3254901960784315,
"grad_norm": 0.6441326100639279,
"learning_rate": 2.673687376466385e-06,
"loss": 0.3077,
"mean_token_accuracy": 0.8879004623740911,
"num_tokens": 254877575.0,
"step": 593
},
{
"entropy": 0.385467529296875,
"epoch": 2.3294117647058825,
"grad_norm": 0.7976063543611124,
"learning_rate": 2.6443035973404497e-06,
"loss": 0.3349,
"mean_token_accuracy": 0.880553713068366,
"num_tokens": 255303970.0,
"step": 594
},
{
"entropy": 0.38482666015625,
"epoch": 2.3333333333333335,
"grad_norm": 0.6519593817176016,
"learning_rate": 2.6150575548982295e-06,
"loss": 0.3286,
"mean_token_accuracy": 0.883098304271698,
"num_tokens": 255742340.0,
"step": 595
},
{
"entropy": 0.380828857421875,
"epoch": 2.3372549019607844,
"grad_norm": 0.6652407912562928,
"learning_rate": 2.585949796776912e-06,
"loss": 0.3393,
"mean_token_accuracy": 0.8775012623518705,
"num_tokens": 256171483.0,
"step": 596
},
{
"entropy": 0.3831787109375,
"epoch": 2.3411764705882354,
"grad_norm": 0.6268195088150889,
"learning_rate": 2.5569808680242826e-06,
"loss": 0.3444,
"mean_token_accuracy": 0.878986076451838,
"num_tokens": 256605019.0,
"step": 597
},
{
"entropy": 0.381011962890625,
"epoch": 2.3450980392156864,
"grad_norm": 0.7284967186382716,
"learning_rate": 2.528151311088537e-06,
"loss": 0.3217,
"mean_token_accuracy": 0.8857952449470758,
"num_tokens": 257047852.0,
"step": 598
},
{
"entropy": 0.3873291015625,
"epoch": 2.3490196078431373,
"grad_norm": 0.6808759200253363,
"learning_rate": 2.499461665808095e-06,
"loss": 0.3361,
"mean_token_accuracy": 0.8788221376016736,
"num_tokens": 257484717.0,
"step": 599
},
{
"entropy": 0.39202880859375,
"epoch": 2.3529411764705883,
"grad_norm": 0.7302214681581166,
"learning_rate": 2.470912469401512e-06,
"loss": 0.3091,
"mean_token_accuracy": 0.8890265924856067,
"num_tokens": 257896029.0,
"step": 600
},
{
"entropy": 0.386993408203125,
"epoch": 2.3568627450980393,
"grad_norm": 0.6447515967872068,
"learning_rate": 2.4425042564574186e-06,
"loss": 0.3182,
"mean_token_accuracy": 0.8860885631293058,
"num_tokens": 258313253.0,
"step": 601
},
{
"entropy": 0.37738037109375,
"epoch": 2.3607843137254902,
"grad_norm": 0.6796476546531216,
"learning_rate": 2.414237558924496e-06,
"loss": 0.3139,
"mean_token_accuracy": 0.8867853293195367,
"num_tokens": 258739041.0,
"step": 602
},
{
"entropy": 0.375518798828125,
"epoch": 2.364705882352941,
"grad_norm": 0.6800822428528028,
"learning_rate": 2.3861129061015355e-06,
"loss": 0.3122,
"mean_token_accuracy": 0.8855758523568511,
"num_tokens": 259174100.0,
"step": 603
},
{
"entropy": 0.378173828125,
"epoch": 2.368627450980392,
"grad_norm": 0.6695828945285421,
"learning_rate": 2.3581308246275103e-06,
"loss": 0.314,
"mean_token_accuracy": 0.8833903381600976,
"num_tokens": 259597011.0,
"step": 604
},
{
"entropy": 0.372467041015625,
"epoch": 2.372549019607843,
"grad_norm": 0.6595833290855022,
"learning_rate": 2.3302918384717175e-06,
"loss": 0.3213,
"mean_token_accuracy": 0.8831760762259364,
"num_tokens": 260072716.0,
"step": 605
},
{
"entropy": 0.379364013671875,
"epoch": 2.376470588235294,
"grad_norm": 0.6213706069462764,
"learning_rate": 2.302596468923981e-06,
"loss": 0.313,
"mean_token_accuracy": 0.8855204563587904,
"num_tokens": 260504261.0,
"step": 606
},
{
"entropy": 0.37933349609375,
"epoch": 2.380392156862745,
"grad_norm": 0.7209555926267776,
"learning_rate": 2.2750452345848684e-06,
"loss": 0.3235,
"mean_token_accuracy": 0.8838987145572901,
"num_tokens": 260929791.0,
"step": 607
},
{
"entropy": 0.380828857421875,
"epoch": 2.384313725490196,
"grad_norm": 0.6636808498029044,
"learning_rate": 2.247638651355991e-06,
"loss": 0.3211,
"mean_token_accuracy": 0.8834717646241188,
"num_tokens": 261373621.0,
"step": 608
},
{
"entropy": 0.377593994140625,
"epoch": 2.388235294117647,
"grad_norm": 0.6570217727767104,
"learning_rate": 2.220377232430353e-06,
"loss": 0.3257,
"mean_token_accuracy": 0.8826047889888287,
"num_tokens": 261814225.0,
"step": 609
},
{
"entropy": 0.388275146484375,
"epoch": 2.392156862745098,
"grad_norm": 0.6547880967559162,
"learning_rate": 2.1932614882827196e-06,
"loss": 0.3324,
"mean_token_accuracy": 0.8816813975572586,
"num_tokens": 262224785.0,
"step": 610
},
{
"entropy": 0.378448486328125,
"epoch": 2.396078431372549,
"grad_norm": 0.6467394539814979,
"learning_rate": 2.1662919266600814e-06,
"loss": 0.3146,
"mean_token_accuracy": 0.8854999002069235,
"num_tokens": 262644460.0,
"step": 611
},
{
"entropy": 0.37994384765625,
"epoch": 2.4,
"grad_norm": 0.6657441544890862,
"learning_rate": 2.1394690525721275e-06,
"loss": 0.314,
"mean_token_accuracy": 0.8874372495338321,
"num_tokens": 263073097.0,
"step": 612
},
{
"entropy": 0.3804931640625,
"epoch": 2.403921568627451,
"grad_norm": 0.674015693139673,
"learning_rate": 2.112793368281799e-06,
"loss": 0.3149,
"mean_token_accuracy": 0.8849289892241359,
"num_tokens": 263502290.0,
"step": 613
},
{
"entropy": 0.380401611328125,
"epoch": 2.407843137254902,
"grad_norm": 0.6232086802467347,
"learning_rate": 2.0862653732958914e-06,
"loss": 0.3199,
"mean_token_accuracy": 0.8842472266405821,
"num_tokens": 263945936.0,
"step": 614
},
{
"entropy": 0.375823974609375,
"epoch": 2.411764705882353,
"grad_norm": 0.671370115531208,
"learning_rate": 2.0598855643556824e-06,
"loss": 0.3132,
"mean_token_accuracy": 0.8874692656099796,
"num_tokens": 264373906.0,
"step": 615
},
{
"entropy": 0.378387451171875,
"epoch": 2.4156862745098038,
"grad_norm": 0.7287126820698647,
"learning_rate": 2.03365443542764e-06,
"loss": 0.3275,
"mean_token_accuracy": 0.8789404109120369,
"num_tokens": 264796869.0,
"step": 616
},
{
"entropy": 0.3797607421875,
"epoch": 2.4196078431372547,
"grad_norm": 0.6769800485762081,
"learning_rate": 2.0075724776941842e-06,
"loss": 0.3175,
"mean_token_accuracy": 0.8853352731093764,
"num_tokens": 265211333.0,
"step": 617
},
{
"entropy": 0.37451171875,
"epoch": 2.4235294117647057,
"grad_norm": 0.6535066383233836,
"learning_rate": 1.9816401795444664e-06,
"loss": 0.3043,
"mean_token_accuracy": 0.8888634694740176,
"num_tokens": 265634712.0,
"step": 618
},
{
"entropy": 0.379425048828125,
"epoch": 2.4274509803921567,
"grad_norm": 0.6238556414580247,
"learning_rate": 1.9558580265652448e-06,
"loss": 0.3214,
"mean_token_accuracy": 0.8860293151810765,
"num_tokens": 266062848.0,
"step": 619
},
{
"entropy": 0.38507080078125,
"epoch": 2.431372549019608,
"grad_norm": 0.6581497114358473,
"learning_rate": 1.93022650153178e-06,
"loss": 0.3211,
"mean_token_accuracy": 0.8834013622254133,
"num_tokens": 266472687.0,
"step": 620
},
{
"entropy": 0.375823974609375,
"epoch": 2.435294117647059,
"grad_norm": 0.6183363507599968,
"learning_rate": 1.9047460843987963e-06,
"loss": 0.3102,
"mean_token_accuracy": 0.8872299622744322,
"num_tokens": 266906318.0,
"step": 621
},
{
"entropy": 0.379852294921875,
"epoch": 2.43921568627451,
"grad_norm": 0.6261417698058828,
"learning_rate": 1.8794172522915022e-06,
"loss": 0.3141,
"mean_token_accuracy": 0.8863415522500873,
"num_tokens": 267329307.0,
"step": 622
},
{
"entropy": 0.3753662109375,
"epoch": 2.443137254901961,
"grad_norm": 0.6350012536405131,
"learning_rate": 1.854240479496643e-06,
"loss": 0.3192,
"mean_token_accuracy": 0.8854794921353459,
"num_tokens": 267762405.0,
"step": 623
},
{
"entropy": 0.3779296875,
"epoch": 2.447058823529412,
"grad_norm": 0.6121205166878543,
"learning_rate": 1.829216237453637e-06,
"loss": 0.3162,
"mean_token_accuracy": 0.884573670104146,
"num_tokens": 268207007.0,
"step": 624
},
{
"entropy": 0.38092041015625,
"epoch": 2.450980392156863,
"grad_norm": 0.6158676556614557,
"learning_rate": 1.804344994745727e-06,
"loss": 0.3261,
"mean_token_accuracy": 0.8815991748124361,
"num_tokens": 268640841.0,
"step": 625
},
{
"entropy": 0.3822021484375,
"epoch": 2.454901960784314,
"grad_norm": 0.6844627149146101,
"learning_rate": 1.7796272170912255e-06,
"loss": 0.3305,
"mean_token_accuracy": 0.882242445833981,
"num_tokens": 269058281.0,
"step": 626
},
{
"entropy": 0.3802490234375,
"epoch": 2.458823529411765,
"grad_norm": 0.6351667381198283,
"learning_rate": 1.7550633673347783e-06,
"loss": 0.3182,
"mean_token_accuracy": 0.8831239556893706,
"num_tokens": 269500856.0,
"step": 627
},
{
"entropy": 0.37982177734375,
"epoch": 2.462745098039216,
"grad_norm": 0.6454334947707677,
"learning_rate": 1.730653905438714e-06,
"loss": 0.3115,
"mean_token_accuracy": 0.8864633357152343,
"num_tokens": 269916404.0,
"step": 628
},
{
"entropy": 0.3763427734375,
"epoch": 2.466666666666667,
"grad_norm": 0.6184143204773618,
"learning_rate": 1.7063992884744096e-06,
"loss": 0.3093,
"mean_token_accuracy": 0.8881733799353242,
"num_tokens": 270355413.0,
"step": 629
},
{
"entropy": 0.381805419921875,
"epoch": 2.4705882352941178,
"grad_norm": 0.631196154024867,
"learning_rate": 1.6822999706137565e-06,
"loss": 0.3189,
"mean_token_accuracy": 0.8858399009332061,
"num_tokens": 270795892.0,
"step": 630
},
{
"entropy": 0.37823486328125,
"epoch": 2.4745098039215687,
"grad_norm": 0.6231692839077786,
"learning_rate": 1.6583564031206357e-06,
"loss": 0.3246,
"mean_token_accuracy": 0.8817324228584766,
"num_tokens": 271230718.0,
"step": 631
},
{
"entropy": 0.384552001953125,
"epoch": 2.4784313725490197,
"grad_norm": 0.6458519012457633,
"learning_rate": 1.6345690343424758e-06,
"loss": 0.3266,
"mean_token_accuracy": 0.8848798228427768,
"num_tokens": 271656053.0,
"step": 632
},
{
"entropy": 0.38848876953125,
"epoch": 2.4823529411764707,
"grad_norm": 0.6081484349151737,
"learning_rate": 1.6109383097018628e-06,
"loss": 0.3043,
"mean_token_accuracy": 0.887602380476892,
"num_tokens": 272080970.0,
"step": 633
},
{
"entropy": 0.387176513671875,
"epoch": 2.4862745098039216,
"grad_norm": 0.6151092693902757,
"learning_rate": 1.587464671688187e-06,
"loss": 0.3147,
"mean_token_accuracy": 0.8866362348198891,
"num_tokens": 272507901.0,
"step": 634
},
{
"entropy": 0.3912353515625,
"epoch": 2.4901960784313726,
"grad_norm": 0.6543393306086803,
"learning_rate": 1.5641485598493744e-06,
"loss": 0.3174,
"mean_token_accuracy": 0.8848397238180041,
"num_tokens": 272903491.0,
"step": 635
},
{
"entropy": 0.378570556640625,
"epoch": 2.4941176470588236,
"grad_norm": 0.6289329862500893,
"learning_rate": 1.540990410783636e-06,
"loss": 0.3145,
"mean_token_accuracy": 0.8862317893654108,
"num_tokens": 273341473.0,
"step": 636
},
{
"entropy": 0.376922607421875,
"epoch": 2.4980392156862745,
"grad_norm": 0.6091966079924904,
"learning_rate": 1.5179906581313063e-06,
"loss": 0.318,
"mean_token_accuracy": 0.8845315454527736,
"num_tokens": 273777511.0,
"step": 637
},
{
"entropy": 0.383026123046875,
"epoch": 2.5019607843137255,
"grad_norm": 0.6690355442288055,
"learning_rate": 1.495149732566723e-06,
"loss": 0.3082,
"mean_token_accuracy": 0.8887853538617492,
"num_tokens": 274187994.0,
"step": 638
},
{
"entropy": 0.3792724609375,
"epoch": 2.5058823529411764,
"grad_norm": 0.5924866327426076,
"learning_rate": 1.4724680617901565e-06,
"loss": 0.3105,
"mean_token_accuracy": 0.8864017892628908,
"num_tokens": 274640249.0,
"step": 639
},
{
"entropy": 0.381591796875,
"epoch": 2.5098039215686274,
"grad_norm": 0.6192344124742932,
"learning_rate": 1.4499460705198e-06,
"loss": 0.3067,
"mean_token_accuracy": 0.8865052172914147,
"num_tokens": 275057297.0,
"step": 640
},
{
"entropy": 0.3802490234375,
"epoch": 2.5137254901960784,
"grad_norm": 0.659901776871247,
"learning_rate": 1.4275841804838298e-06,
"loss": 0.3159,
"mean_token_accuracy": 0.8849191283807158,
"num_tokens": 275471254.0,
"step": 641
},
{
"entropy": 0.37738037109375,
"epoch": 2.5176470588235293,
"grad_norm": 0.6638824013938602,
"learning_rate": 1.4053828104124867e-06,
"loss": 0.3253,
"mean_token_accuracy": 0.8840452143922448,
"num_tokens": 275891640.0,
"step": 642
},
{
"entropy": 0.382904052734375,
"epoch": 2.5215686274509803,
"grad_norm": 0.6641332193014144,
"learning_rate": 1.383342376030261e-06,
"loss": 0.3241,
"mean_token_accuracy": 0.8852192750200629,
"num_tokens": 276307124.0,
"step": 643
},
{
"entropy": 0.3870849609375,
"epoch": 2.5254901960784313,
"grad_norm": 0.6160087628480071,
"learning_rate": 1.361463290048085e-06,
"loss": 0.309,
"mean_token_accuracy": 0.886401055380702,
"num_tokens": 276704241.0,
"step": 644
},
{
"entropy": 0.376007080078125,
"epoch": 2.5294117647058822,
"grad_norm": 0.6571155139010876,
"learning_rate": 1.339745962155613e-06,
"loss": 0.3261,
"mean_token_accuracy": 0.8838352803140879,
"num_tokens": 277154479.0,
"step": 645
},
{
"entropy": 0.381927490234375,
"epoch": 2.533333333333333,
"grad_norm": 0.6297597292306508,
"learning_rate": 1.3181907990135624e-06,
"loss": 0.3073,
"mean_token_accuracy": 0.8882267083972692,
"num_tokens": 277567661.0,
"step": 646
},
{
"entropy": 0.37628173828125,
"epoch": 2.537254901960784,
"grad_norm": 0.6329324728088497,
"learning_rate": 1.2967982042460758e-06,
"loss": 0.3171,
"mean_token_accuracy": 0.8866855762898922,
"num_tokens": 278003814.0,
"step": 647
},
{
"entropy": 0.385406494140625,
"epoch": 2.541176470588235,
"grad_norm": 0.6162701360426496,
"learning_rate": 1.2755685784331784e-06,
"loss": 0.3204,
"mean_token_accuracy": 0.884353194385767,
"num_tokens": 278430116.0,
"step": 648
},
{
"entropy": 0.381744384765625,
"epoch": 2.545098039215686,
"grad_norm": 0.6126382608490024,
"learning_rate": 1.25450231910328e-06,
"loss": 0.3237,
"mean_token_accuracy": 0.8824541233479977,
"num_tokens": 278877688.0,
"step": 649
},
{
"entropy": 0.379852294921875,
"epoch": 2.549019607843137,
"grad_norm": 0.6326607766200493,
"learning_rate": 1.2335998207257138e-06,
"loss": 0.317,
"mean_token_accuracy": 0.8837267532944679,
"num_tokens": 279288243.0,
"step": 650
},
{
"entropy": 0.380035400390625,
"epoch": 2.552941176470588,
"grad_norm": 0.680830740300124,
"learning_rate": 1.2128614747033728e-06,
"loss": 0.3306,
"mean_token_accuracy": 0.8820574702695012,
"num_tokens": 279710264.0,
"step": 651
},
{
"entropy": 0.378814697265625,
"epoch": 2.556862745098039,
"grad_norm": 0.6099946083878783,
"learning_rate": 1.1922876693653584e-06,
"loss": 0.3114,
"mean_token_accuracy": 0.8873716210946441,
"num_tokens": 280131883.0,
"step": 652
},
{
"entropy": 0.377716064453125,
"epoch": 2.56078431372549,
"grad_norm": 0.6295409472221051,
"learning_rate": 1.1718787899597239e-06,
"loss": 0.3002,
"mean_token_accuracy": 0.8915431387722492,
"num_tokens": 280576360.0,
"step": 653
},
{
"entropy": 0.375396728515625,
"epoch": 2.564705882352941,
"grad_norm": 0.6209039798446115,
"learning_rate": 1.1516352186462588e-06,
"loss": 0.3198,
"mean_token_accuracy": 0.8849526178091764,
"num_tokens": 281018212.0,
"step": 654
},
{
"entropy": 0.37811279296875,
"epoch": 2.568627450980392,
"grad_norm": 0.6320912091179962,
"learning_rate": 1.131557334489326e-06,
"loss": 0.3049,
"mean_token_accuracy": 0.8900288715958595,
"num_tokens": 281432844.0,
"step": 655
},
{
"entropy": 0.38299560546875,
"epoch": 2.572549019607843,
"grad_norm": 0.6821011372034809,
"learning_rate": 1.1116455134507665e-06,
"loss": 0.3163,
"mean_token_accuracy": 0.8853369234129786,
"num_tokens": 281847459.0,
"step": 656
},
{
"entropy": 0.37677001953125,
"epoch": 2.576470588235294,
"grad_norm": 0.6235261380861172,
"learning_rate": 1.0919001283828666e-06,
"loss": 0.3133,
"mean_token_accuracy": 0.8864665804430842,
"num_tokens": 282286944.0,
"step": 657
},
{
"entropy": 0.38006591796875,
"epoch": 2.5803921568627453,
"grad_norm": 0.6221303988087854,
"learning_rate": 1.0723215490213635e-06,
"loss": 0.311,
"mean_token_accuracy": 0.8873193822801113,
"num_tokens": 282710819.0,
"step": 658
},
{
"entropy": 0.37469482421875,
"epoch": 2.5843137254901962,
"grad_norm": 0.6574754213052622,
"learning_rate": 1.052910141978537e-06,
"loss": 0.3125,
"mean_token_accuracy": 0.886294306255877,
"num_tokens": 283156052.0,
"step": 659
},
{
"entropy": 0.3814697265625,
"epoch": 2.588235294117647,
"grad_norm": 0.6320334063566375,
"learning_rate": 1.0336662707363287e-06,
"loss": 0.3173,
"mean_token_accuracy": 0.8854138003662229,
"num_tokens": 283595201.0,
"step": 660
},
{
"entropy": 0.376983642578125,
"epoch": 2.592156862745098,
"grad_norm": 0.619569861836383,
"learning_rate": 1.0145902956395449e-06,
"loss": 0.3154,
"mean_token_accuracy": 0.8877551760524511,
"num_tokens": 284021326.0,
"step": 661
},
{
"entropy": 0.378021240234375,
"epoch": 2.596078431372549,
"grad_norm": 0.6178131877623513,
"learning_rate": 9.95682573889114e-07,
"loss": 0.3101,
"mean_token_accuracy": 0.8884122371673584,
"num_tokens": 284444911.0,
"step": 662
},
{
"entropy": 0.374053955078125,
"epoch": 2.6,
"grad_norm": 0.6257874518096788,
"learning_rate": 9.76943459535381e-07,
"loss": 0.3054,
"mean_token_accuracy": 0.8889162968844175,
"num_tokens": 284883556.0,
"step": 663
},
{
"entropy": 0.381072998046875,
"epoch": 2.603921568627451,
"grad_norm": 0.608051038563699,
"learning_rate": 9.583733034714982e-07,
"loss": 0.3196,
"mean_token_accuracy": 0.886277524754405,
"num_tokens": 285330782.0,
"step": 664
},
{
"entropy": 0.3798828125,
"epoch": 2.607843137254902,
"grad_norm": 0.5909285116426348,
"learning_rate": 9.399724534268385e-07,
"loss": 0.3042,
"mean_token_accuracy": 0.8899399247020483,
"num_tokens": 285763663.0,
"step": 665
},
{
"entropy": 0.380279541015625,
"epoch": 2.611764705882353,
"grad_norm": 0.6431264626693568,
"learning_rate": 9.217412539604942e-07,
"loss": 0.2959,
"mean_token_accuracy": 0.8899163901805878,
"num_tokens": 286199562.0,
"step": 666
},
{
"entropy": 0.376007080078125,
"epoch": 2.615686274509804,
"grad_norm": 0.5982503195100611,
"learning_rate": 9.036800464548157e-07,
"loss": 0.3192,
"mean_token_accuracy": 0.8865080736577511,
"num_tokens": 286621995.0,
"step": 667
},
{
"entropy": 0.379364013671875,
"epoch": 2.619607843137255,
"grad_norm": 0.6270853297070971,
"learning_rate": 8.857891691090336e-07,
"loss": 0.3134,
"mean_token_accuracy": 0.8862617230042815,
"num_tokens": 287057155.0,
"step": 668
},
{
"entropy": 0.3865966796875,
"epoch": 2.623529411764706,
"grad_norm": 0.6300575883109812,
"learning_rate": 8.680689569329071e-07,
"loss": 0.3233,
"mean_token_accuracy": 0.8842667685821652,
"num_tokens": 287460969.0,
"step": 669
},
{
"entropy": 0.377655029296875,
"epoch": 2.627450980392157,
"grad_norm": 0.6453389395674596,
"learning_rate": 8.505197417404687e-07,
"loss": 0.3187,
"mean_token_accuracy": 0.8853940945118666,
"num_tokens": 287895047.0,
"step": 670
},
{
"entropy": 0.382720947265625,
"epoch": 2.631372549019608,
"grad_norm": 0.6238765655563359,
"learning_rate": 8.331418521437973e-07,
"loss": 0.3069,
"mean_token_accuracy": 0.8877802221104503,
"num_tokens": 288310065.0,
"step": 671
},
{
"entropy": 0.374603271484375,
"epoch": 2.635294117647059,
"grad_norm": 0.6047173519053458,
"learning_rate": 8.159356135468721e-07,
"loss": 0.3219,
"mean_token_accuracy": 0.8853782610967755,
"num_tokens": 288751087.0,
"step": 672
},
{
"entropy": 0.3780517578125,
"epoch": 2.6392156862745098,
"grad_norm": 0.622327693826135,
"learning_rate": 7.989013481394813e-07,
"loss": 0.304,
"mean_token_accuracy": 0.8917795773595572,
"num_tokens": 289180726.0,
"step": 673
},
{
"entropy": 0.380523681640625,
"epoch": 2.6431372549019607,
"grad_norm": 0.6222821808342242,
"learning_rate": 7.820393748911792e-07,
"loss": 0.3074,
"mean_token_accuracy": 0.8889235118404031,
"num_tokens": 289591818.0,
"step": 674
},
{
"entropy": 0.3770751953125,
"epoch": 2.6470588235294117,
"grad_norm": 0.6125349893982243,
"learning_rate": 7.653500095453248e-07,
"loss": 0.3086,
"mean_token_accuracy": 0.8879644311964512,
"num_tokens": 290041282.0,
"step": 675
},
{
"entropy": 0.3822021484375,
"epoch": 2.6509803921568627,
"grad_norm": 0.6282527306601516,
"learning_rate": 7.488335646131628e-07,
"loss": 0.3051,
"mean_token_accuracy": 0.8894842406734824,
"num_tokens": 290458439.0,
"step": 676
},
{
"entropy": 0.379425048828125,
"epoch": 2.6549019607843136,
"grad_norm": 0.6299084942404871,
"learning_rate": 7.324903493679703e-07,
"loss": 0.3097,
"mean_token_accuracy": 0.8882144782692194,
"num_tokens": 290895655.0,
"step": 677
},
{
"entropy": 0.37530517578125,
"epoch": 2.6588235294117646,
"grad_norm": 0.6069024822726714,
"learning_rate": 7.163206698392744e-07,
"loss": 0.3099,
"mean_token_accuracy": 0.8873086804524064,
"num_tokens": 291353643.0,
"step": 678
},
{
"entropy": 0.382171630859375,
"epoch": 2.6627450980392156,
"grad_norm": 0.6334982938868678,
"learning_rate": 7.003248288071118e-07,
"loss": 0.3144,
"mean_token_accuracy": 0.887144822627306,
"num_tokens": 291762961.0,
"step": 679
},
{
"entropy": 0.38092041015625,
"epoch": 2.6666666666666665,
"grad_norm": 0.6488389669112627,
"learning_rate": 6.845031257963619e-07,
"loss": 0.316,
"mean_token_accuracy": 0.8869073716923594,
"num_tokens": 292193427.0,
"step": 680
},
{
"entropy": 0.38385009765625,
"epoch": 2.6705882352941175,
"grad_norm": 0.6227550674311402,
"learning_rate": 6.688558570711468e-07,
"loss": 0.3166,
"mean_token_accuracy": 0.8865640126168728,
"num_tokens": 292607942.0,
"step": 681
},
{
"entropy": 0.38037109375,
"epoch": 2.674509803921569,
"grad_norm": 0.6169942122797281,
"learning_rate": 6.53383315629268e-07,
"loss": 0.3048,
"mean_token_accuracy": 0.8906151866540313,
"num_tokens": 293040009.0,
"step": 682
},
{
"entropy": 0.375091552734375,
"epoch": 2.67843137254902,
"grad_norm": 0.615932545560767,
"learning_rate": 6.380857911967364e-07,
"loss": 0.3065,
"mean_token_accuracy": 0.888974635861814,
"num_tokens": 293489302.0,
"step": 683
},
{
"entropy": 0.3773193359375,
"epoch": 2.682352941176471,
"grad_norm": 0.6224114881397005,
"learning_rate": 6.229635702223325e-07,
"loss": 0.3114,
"mean_token_accuracy": 0.8888672534376383,
"num_tokens": 293918626.0,
"step": 684
},
{
"entropy": 0.381591796875,
"epoch": 2.686274509803922,
"grad_norm": 0.6075798679022492,
"learning_rate": 6.08016935872251e-07,
"loss": 0.3071,
"mean_token_accuracy": 0.8886892907321453,
"num_tokens": 294344050.0,
"step": 685
},
{
"entropy": 0.375885009765625,
"epoch": 2.6901960784313728,
"grad_norm": 0.6348423219100143,
"learning_rate": 5.932461680248014e-07,
"loss": 0.3022,
"mean_token_accuracy": 0.8917614500969648,
"num_tokens": 294782851.0,
"step": 686
},
{
"entropy": 0.384307861328125,
"epoch": 2.6941176470588237,
"grad_norm": 0.627257222117316,
"learning_rate": 5.786515432651563e-07,
"loss": 0.3033,
"mean_token_accuracy": 0.8891853494569659,
"num_tokens": 295179548.0,
"step": 687
},
{
"entropy": 0.37713623046875,
"epoch": 2.6980392156862747,
"grad_norm": 0.6233347416359626,
"learning_rate": 5.64233334880181e-07,
"loss": 0.3184,
"mean_token_accuracy": 0.8849873188883066,
"num_tokens": 295611827.0,
"step": 688
},
{
"entropy": 0.3787841796875,
"epoch": 2.7019607843137257,
"grad_norm": 0.6121392174435754,
"learning_rate": 5.499918128533155e-07,
"loss": 0.3126,
"mean_token_accuracy": 0.8860092610120773,
"num_tokens": 296030475.0,
"step": 689
},
{
"entropy": 0.37786865234375,
"epoch": 2.7058823529411766,
"grad_norm": 0.6016793737256022,
"learning_rate": 5.359272438595153e-07,
"loss": 0.3069,
"mean_token_accuracy": 0.8904592543840408,
"num_tokens": 296453853.0,
"step": 690
},
{
"entropy": 0.377716064453125,
"epoch": 2.7098039215686276,
"grad_norm": 0.6620926577713818,
"learning_rate": 5.22039891260262e-07,
"loss": 0.3257,
"mean_token_accuracy": 0.8827581582590938,
"num_tokens": 296901649.0,
"step": 691
},
{
"entropy": 0.38031005859375,
"epoch": 2.7137254901960786,
"grad_norm": 0.6209236794788132,
"learning_rate": 5.083300150986259e-07,
"loss": 0.3133,
"mean_token_accuracy": 0.8865346414968371,
"num_tokens": 297319515.0,
"step": 692
},
{
"entropy": 0.38470458984375,
"epoch": 2.7176470588235295,
"grad_norm": 0.6169264345057406,
"learning_rate": 4.947978720944025e-07,
"loss": 0.3281,
"mean_token_accuracy": 0.883204753510654,
"num_tokens": 297749112.0,
"step": 693
},
{
"entropy": 0.3839111328125,
"epoch": 2.7215686274509805,
"grad_norm": 0.7143983827383057,
"learning_rate": 4.814437156393048e-07,
"loss": 0.3114,
"mean_token_accuracy": 0.8880084175616503,
"num_tokens": 298166481.0,
"step": 694
},
{
"entropy": 0.375640869140625,
"epoch": 2.7254901960784315,
"grad_norm": 0.6088784931751031,
"learning_rate": 4.682677957922155e-07,
"loss": 0.3069,
"mean_token_accuracy": 0.8904805909842253,
"num_tokens": 298606282.0,
"step": 695
},
{
"entropy": 0.378326416015625,
"epoch": 2.7294117647058824,
"grad_norm": 0.6317368332658094,
"learning_rate": 4.5527035927450337e-07,
"loss": 0.31,
"mean_token_accuracy": 0.887597025372088,
"num_tokens": 299050219.0,
"step": 696
},
{
"entropy": 0.379425048828125,
"epoch": 2.7333333333333334,
"grad_norm": 0.6095714519363978,
"learning_rate": 4.424516494654119e-07,
"loss": 0.3155,
"mean_token_accuracy": 0.8861374296247959,
"num_tokens": 299491767.0,
"step": 697
},
{
"entropy": 0.376434326171875,
"epoch": 2.7372549019607844,
"grad_norm": 0.5957575584979452,
"learning_rate": 4.298119063974915e-07,
"loss": 0.3075,
"mean_token_accuracy": 0.8884414276108146,
"num_tokens": 299937012.0,
"step": 698
},
{
"entropy": 0.38043212890625,
"epoch": 2.7411764705882353,
"grad_norm": 0.6145754054259317,
"learning_rate": 4.173513667521123e-07,
"loss": 0.3231,
"mean_token_accuracy": 0.8824452431872487,
"num_tokens": 300366708.0,
"step": 699
},
{
"entropy": 0.377410888671875,
"epoch": 2.7450980392156863,
"grad_norm": 0.596226526796468,
"learning_rate": 4.0507026385502747e-07,
"loss": 0.3185,
"mean_token_accuracy": 0.8844813704490662,
"num_tokens": 300808950.0,
"step": 700
},
{
"entropy": 0.387542724609375,
"epoch": 2.7490196078431373,
"grad_norm": 0.6125484910454695,
"learning_rate": 3.929688276720045e-07,
"loss": 0.3061,
"mean_token_accuracy": 0.8885686565190554,
"num_tokens": 301222241.0,
"step": 701
},
{
"entropy": 0.3780517578125,
"epoch": 2.7529411764705882,
"grad_norm": 0.624916010504038,
"learning_rate": 3.810472848045266e-07,
"loss": 0.3177,
"mean_token_accuracy": 0.8858394585549831,
"num_tokens": 301663064.0,
"step": 702
},
{
"entropy": 0.376312255859375,
"epoch": 2.756862745098039,
"grad_norm": 0.6058817794906675,
"learning_rate": 3.693058584855369e-07,
"loss": 0.3151,
"mean_token_accuracy": 0.8896542508155107,
"num_tokens": 302117732.0,
"step": 703
},
{
"entropy": 0.379486083984375,
"epoch": 2.76078431372549,
"grad_norm": 0.6464785466948662,
"learning_rate": 3.5774476857527107e-07,
"loss": 0.3153,
"mean_token_accuracy": 0.8852464202791452,
"num_tokens": 302565320.0,
"step": 704
},
{
"entropy": 0.3858642578125,
"epoch": 2.764705882352941,
"grad_norm": 0.6403154212673284,
"learning_rate": 3.463642315571292e-07,
"loss": 0.3057,
"mean_token_accuracy": 0.8876703213900328,
"num_tokens": 302985307.0,
"step": 705
},
{
"entropy": 0.3782958984375,
"epoch": 2.768627450980392,
"grad_norm": 0.6095940050206947,
"learning_rate": 3.3516446053363015e-07,
"loss": 0.3082,
"mean_token_accuracy": 0.8882761783897877,
"num_tokens": 303424607.0,
"step": 706
},
{
"entropy": 0.379974365234375,
"epoch": 2.772549019607843,
"grad_norm": 0.6161476280093703,
"learning_rate": 3.241456652224184e-07,
"loss": 0.3138,
"mean_token_accuracy": 0.8859195495024323,
"num_tokens": 303860684.0,
"step": 707
},
{
"entropy": 0.379730224609375,
"epoch": 2.776470588235294,
"grad_norm": 0.5960959951045524,
"learning_rate": 3.1330805195233684e-07,
"loss": 0.3058,
"mean_token_accuracy": 0.8890456901863217,
"num_tokens": 304293142.0,
"step": 708
},
{
"entropy": 0.38690185546875,
"epoch": 2.780392156862745,
"grad_norm": 0.6160682077130808,
"learning_rate": 3.0265182365956213e-07,
"loss": 0.309,
"mean_token_accuracy": 0.8865752117708325,
"num_tokens": 304706003.0,
"step": 709
},
{
"entropy": 0.38427734375,
"epoch": 2.784313725490196,
"grad_norm": 0.6111529979519884,
"learning_rate": 2.921771798838069e-07,
"loss": 0.3237,
"mean_token_accuracy": 0.8845190508291125,
"num_tokens": 305131254.0,
"step": 710
},
{
"entropy": 0.377349853515625,
"epoch": 2.788235294117647,
"grad_norm": 0.6099627522064227,
"learning_rate": 2.818843167645835e-07,
"loss": 0.3129,
"mean_token_accuracy": 0.8868130994960666,
"num_tokens": 305567833.0,
"step": 711
},
{
"entropy": 0.385406494140625,
"epoch": 2.792156862745098,
"grad_norm": 0.6105851628336647,
"learning_rate": 2.717734270375272e-07,
"loss": 0.3197,
"mean_token_accuracy": 0.8870526794344187,
"num_tokens": 305990304.0,
"step": 712
},
{
"entropy": 0.386505126953125,
"epoch": 2.796078431372549,
"grad_norm": 0.6247273295141441,
"learning_rate": 2.618447000307922e-07,
"loss": 0.3137,
"mean_token_accuracy": 0.8874122854322195,
"num_tokens": 306395161.0,
"step": 713
},
{
"entropy": 0.381256103515625,
"epoch": 2.8,
"grad_norm": 0.635299701134664,
"learning_rate": 2.520983216615047e-07,
"loss": 0.298,
"mean_token_accuracy": 0.8914454290643334,
"num_tokens": 306825012.0,
"step": 714
},
{
"entropy": 0.38519287109375,
"epoch": 2.803921568627451,
"grad_norm": 0.595748483597398,
"learning_rate": 2.4253447443228106e-07,
"loss": 0.3045,
"mean_token_accuracy": 0.8908979864791036,
"num_tokens": 307253222.0,
"step": 715
},
{
"entropy": 0.381011962890625,
"epoch": 2.8078431372549018,
"grad_norm": 0.6134283304279732,
"learning_rate": 2.3315333742780942e-07,
"loss": 0.3175,
"mean_token_accuracy": 0.8856520559638739,
"num_tokens": 307689012.0,
"step": 716
},
{
"entropy": 0.379150390625,
"epoch": 2.8117647058823527,
"grad_norm": 0.6057712094651984,
"learning_rate": 2.23955086311497e-07,
"loss": 0.3126,
"mean_token_accuracy": 0.8885315740481019,
"num_tokens": 308125986.0,
"step": 717
},
{
"entropy": 0.3779296875,
"epoch": 2.8156862745098037,
"grad_norm": 0.635688295650433,
"learning_rate": 2.1493989332218468e-07,
"loss": 0.306,
"mean_token_accuracy": 0.8884778618812561,
"num_tokens": 308545495.0,
"step": 718
},
{
"entropy": 0.380035400390625,
"epoch": 2.8196078431372547,
"grad_norm": 0.6261639298694902,
"learning_rate": 2.0610792727091434e-07,
"loss": 0.3156,
"mean_token_accuracy": 0.886909999884665,
"num_tokens": 308975486.0,
"step": 719
},
{
"entropy": 0.38824462890625,
"epoch": 2.8235294117647056,
"grad_norm": 0.6134392371302576,
"learning_rate": 1.9745935353777222e-07,
"loss": 0.3067,
"mean_token_accuracy": 0.8894705669954419,
"num_tokens": 309390246.0,
"step": 720
},
{
"entropy": 0.376953125,
"epoch": 2.8274509803921566,
"grad_norm": 0.6215029371259475,
"learning_rate": 1.889943340687961e-07,
"loss": 0.3069,
"mean_token_accuracy": 0.8888574000447989,
"num_tokens": 309818332.0,
"step": 721
},
{
"entropy": 0.376678466796875,
"epoch": 2.831372549019608,
"grad_norm": 0.6070304102792409,
"learning_rate": 1.8071302737293294e-07,
"loss": 0.3059,
"mean_token_accuracy": 0.8882985869422555,
"num_tokens": 310258005.0,
"step": 722
},
{
"entropy": 0.38250732421875,
"epoch": 2.835294117647059,
"grad_norm": 0.611400797022777,
"learning_rate": 1.7261558851908056e-07,
"loss": 0.3103,
"mean_token_accuracy": 0.8894185777753592,
"num_tokens": 310676709.0,
"step": 723
},
{
"entropy": 0.377838134765625,
"epoch": 2.83921568627451,
"grad_norm": 0.6356223613609566,
"learning_rate": 1.6470216913317628e-07,
"loss": 0.3143,
"mean_token_accuracy": 0.8873595865443349,
"num_tokens": 311108400.0,
"step": 724
},
{
"entropy": 0.383544921875,
"epoch": 2.843137254901961,
"grad_norm": 0.6229792377890674,
"learning_rate": 1.569729173953638e-07,
"loss": 0.3033,
"mean_token_accuracy": 0.8884387910366058,
"num_tokens": 311510481.0,
"step": 725
},
{
"entropy": 0.37615966796875,
"epoch": 2.847058823529412,
"grad_norm": 0.6431143365231876,
"learning_rate": 1.4942797803721543e-07,
"loss": 0.3218,
"mean_token_accuracy": 0.8845628844574094,
"num_tokens": 311942786.0,
"step": 726
},
{
"entropy": 0.38525390625,
"epoch": 2.850980392156863,
"grad_norm": 0.6206033497042358,
"learning_rate": 1.4206749233902084e-07,
"loss": 0.3054,
"mean_token_accuracy": 0.8902052519842982,
"num_tokens": 312370010.0,
"step": 727
},
{
"entropy": 0.3773193359375,
"epoch": 2.854901960784314,
"grad_norm": 0.6498627060531488,
"learning_rate": 1.348915981271437e-07,
"loss": 0.3177,
"mean_token_accuracy": 0.885548148304224,
"num_tokens": 312798692.0,
"step": 728
},
{
"entropy": 0.3800048828125,
"epoch": 2.8588235294117648,
"grad_norm": 0.6411848862187207,
"learning_rate": 1.2790042977144256e-07,
"loss": 0.3082,
"mean_token_accuracy": 0.8862299472093582,
"num_tokens": 313206172.0,
"step": 729
},
{
"entropy": 0.379302978515625,
"epoch": 2.8627450980392157,
"grad_norm": 0.626796859270999,
"learning_rate": 1.2109411818274851e-07,
"loss": 0.3128,
"mean_token_accuracy": 0.8862822419032454,
"num_tokens": 313626626.0,
"step": 730
},
{
"entropy": 0.376983642578125,
"epoch": 2.8666666666666667,
"grad_norm": 0.6173793755783579,
"learning_rate": 1.1447279081042262e-07,
"loss": 0.3091,
"mean_token_accuracy": 0.8877417473122478,
"num_tokens": 314049995.0,
"step": 731
},
{
"entropy": 0.3756103515625,
"epoch": 2.8705882352941177,
"grad_norm": 0.6153282901073235,
"learning_rate": 1.0803657163995896e-07,
"loss": 0.3135,
"mean_token_accuracy": 0.8866222072392702,
"num_tokens": 314479750.0,
"step": 732
},
{
"entropy": 0.380462646484375,
"epoch": 2.8745098039215686,
"grad_norm": 0.6837019195802836,
"learning_rate": 1.0178558119067316e-07,
"loss": 0.3243,
"mean_token_accuracy": 0.886231679469347,
"num_tokens": 314909938.0,
"step": 733
},
{
"entropy": 0.377532958984375,
"epoch": 2.8784313725490196,
"grad_norm": 0.599153352101819,
"learning_rate": 9.571993651343869e-08,
"loss": 0.3156,
"mean_token_accuracy": 0.8857722133398056,
"num_tokens": 315360267.0,
"step": 734
},
{
"entropy": 0.387359619140625,
"epoch": 2.8823529411764706,
"grad_norm": 0.6417439052424432,
"learning_rate": 8.983975118849853e-08,
"loss": 0.32,
"mean_token_accuracy": 0.8861173130571842,
"num_tokens": 315778855.0,
"step": 735
},
{
"entropy": 0.3782958984375,
"epoch": 2.8862745098039215,
"grad_norm": 0.6249131544678768,
"learning_rate": 8.41451353233369e-08,
"loss": 0.3139,
"mean_token_accuracy": 0.8885722318664193,
"num_tokens": 316222091.0,
"step": 736
},
{
"entropy": 0.378143310546875,
"epoch": 2.8901960784313725,
"grad_norm": 0.6129592636225704,
"learning_rate": 7.863619555061874e-08,
"loss": 0.2987,
"mean_token_accuracy": 0.8897266024723649,
"num_tokens": 316647632.0,
"step": 737
},
{
"entropy": 0.378021240234375,
"epoch": 2.8941176470588235,
"grad_norm": 0.62004735541224,
"learning_rate": 7.331303502618903e-08,
"loss": 0.3088,
"mean_token_accuracy": 0.8891623057425022,
"num_tokens": 317083092.0,
"step": 738
},
{
"entropy": 0.379150390625,
"epoch": 2.8980392156862744,
"grad_norm": 0.6220730786367578,
"learning_rate": 6.817575342714988e-08,
"loss": 0.3165,
"mean_token_accuracy": 0.8854466071352363,
"num_tokens": 317521244.0,
"step": 739
},
{
"entropy": 0.378814697265625,
"epoch": 2.9019607843137254,
"grad_norm": 0.6132017832098181,
"learning_rate": 6.32244469499832e-08,
"loss": 0.3166,
"mean_token_accuracy": 0.8851985028013587,
"num_tokens": 317962973.0,
"step": 740
},
{
"entropy": 0.381866455078125,
"epoch": 2.9058823529411764,
"grad_norm": 0.602046192463936,
"learning_rate": 5.845920830875651e-08,
"loss": 0.3097,
"mean_token_accuracy": 0.8887276640161872,
"num_tokens": 318402805.0,
"step": 741
},
{
"entropy": 0.38330078125,
"epoch": 2.9098039215686273,
"grad_norm": 0.6594166630680239,
"learning_rate": 5.388012673338661e-08,
"loss": 0.3263,
"mean_token_accuracy": 0.8835462033748627,
"num_tokens": 318832764.0,
"step": 742
},
{
"entropy": 0.3841552734375,
"epoch": 2.9137254901960783,
"grad_norm": 0.6003852014770814,
"learning_rate": 4.9487287967964206e-08,
"loss": 0.3139,
"mean_token_accuracy": 0.8860819693654776,
"num_tokens": 319258968.0,
"step": 743
},
{
"entropy": 0.372955322265625,
"epoch": 2.9176470588235293,
"grad_norm": 0.6042410386322835,
"learning_rate": 4.528077426915412e-08,
"loss": 0.309,
"mean_token_accuracy": 0.8899551248177886,
"num_tokens": 319697667.0,
"step": 744
},
{
"entropy": 0.37725830078125,
"epoch": 2.9215686274509802,
"grad_norm": 0.6075840713659405,
"learning_rate": 4.126066440464982e-08,
"loss": 0.309,
"mean_token_accuracy": 0.8887394778430462,
"num_tokens": 320116941.0,
"step": 745
},
{
"entropy": 0.385833740234375,
"epoch": 2.9254901960784316,
"grad_norm": 0.6025272498394463,
"learning_rate": 3.7427033651702414e-08,
"loss": 0.3113,
"mean_token_accuracy": 0.8870739573612809,
"num_tokens": 320545709.0,
"step": 746
},
{
"entropy": 0.38299560546875,
"epoch": 2.9294117647058826,
"grad_norm": 0.6081689478979662,
"learning_rate": 3.377995379570731e-08,
"loss": 0.3274,
"mean_token_accuracy": 0.8839912796393037,
"num_tokens": 320960157.0,
"step": 747
},
{
"entropy": 0.3834228515625,
"epoch": 2.9333333333333336,
"grad_norm": 0.5859210580796687,
"learning_rate": 3.03194931288664e-08,
"loss": 0.3094,
"mean_token_accuracy": 0.8882154319435358,
"num_tokens": 321386558.0,
"step": 748
},
{
"entropy": 0.3800048828125,
"epoch": 2.9372549019607845,
"grad_norm": 0.6840561170761486,
"learning_rate": 2.7045716448901305e-08,
"loss": 0.3073,
"mean_token_accuracy": 0.8912063157185912,
"num_tokens": 321808424.0,
"step": 749
},
{
"entropy": 0.38714599609375,
"epoch": 2.9411764705882355,
"grad_norm": 0.6504399582680688,
"learning_rate": 2.3958685057844378e-08,
"loss": 0.3176,
"mean_token_accuracy": 0.8845123695209622,
"num_tokens": 322199235.0,
"step": 750
},
{
"entropy": 0.379180908203125,
"epoch": 2.9450980392156865,
"grad_norm": 0.6102222267932175,
"learning_rate": 2.10584567608918e-08,
"loss": 0.3143,
"mean_token_accuracy": 0.8865124061703682,
"num_tokens": 322626166.0,
"step": 751
},
{
"entropy": 0.38189697265625,
"epoch": 2.9490196078431374,
"grad_norm": 0.5963276311619458,
"learning_rate": 1.83450858653178e-08,
"loss": 0.3147,
"mean_token_accuracy": 0.8874166300520301,
"num_tokens": 323061501.0,
"step": 752
},
{
"entropy": 0.384246826171875,
"epoch": 2.9529411764705884,
"grad_norm": 0.6117631243948043,
"learning_rate": 1.5818623179459924e-08,
"loss": 0.308,
"mean_token_accuracy": 0.8874607440084219,
"num_tokens": 323481673.0,
"step": 753
},
{
"entropy": 0.38275146484375,
"epoch": 2.9568627450980394,
"grad_norm": 0.6167324543611288,
"learning_rate": 1.3479116011769766e-08,
"loss": 0.3286,
"mean_token_accuracy": 0.8801386319100857,
"num_tokens": 323906728.0,
"step": 754
},
{
"entropy": 0.3828125,
"epoch": 2.9607843137254903,
"grad_norm": 0.5960943457045179,
"learning_rate": 1.1326608169920373e-08,
"loss": 0.3375,
"mean_token_accuracy": 0.8788588084280491,
"num_tokens": 324319349.0,
"step": 755
},
{
"entropy": 0.383697509765625,
"epoch": 2.9647058823529413,
"grad_norm": 0.5965957357845432,
"learning_rate": 9.361139959993549e-09,
"loss": 0.3146,
"mean_token_accuracy": 0.8867331743240356,
"num_tokens": 324746953.0,
"step": 756
},
{
"entropy": 0.379486083984375,
"epoch": 2.9686274509803923,
"grad_norm": 0.636712886908032,
"learning_rate": 7.582748185719357e-09,
"loss": 0.3261,
"mean_token_accuracy": 0.8830597475171089,
"num_tokens": 325186860.0,
"step": 757
},
{
"entropy": 0.388946533203125,
"epoch": 2.9725490196078432,
"grad_norm": 0.6118740609991475,
"learning_rate": 5.991466147791114e-09,
"loss": 0.312,
"mean_token_accuracy": 0.8873140625655651,
"num_tokens": 325595646.0,
"step": 758
},
{
"entropy": 0.38189697265625,
"epoch": 2.976470588235294,
"grad_norm": 0.6001174174999216,
"learning_rate": 4.587323643240327e-09,
"loss": 0.3126,
"mean_token_accuracy": 0.885263648815453,
"num_tokens": 326034848.0,
"step": 759
},
{
"entropy": 0.37396240234375,
"epoch": 2.980392156862745,
"grad_norm": 0.6464429544279401,
"learning_rate": 3.3703469648760367e-09,
"loss": 0.3078,
"mean_token_accuracy": 0.8882996095344424,
"num_tokens": 326467480.0,
"step": 760
},
{
"entropy": 0.375457763671875,
"epoch": 2.984313725490196,
"grad_norm": 0.6289692164207805,
"learning_rate": 2.340558900796319e-09,
"loss": 0.3076,
"mean_token_accuracy": 0.8890138473361731,
"num_tokens": 326902334.0,
"step": 761
},
{
"entropy": 0.384246826171875,
"epoch": 2.988235294117647,
"grad_norm": 0.6949597130048252,
"learning_rate": 1.497978733961958e-09,
"loss": 0.3198,
"mean_token_accuracy": 0.8849008204415441,
"num_tokens": 327332137.0,
"step": 762
},
{
"entropy": 0.38165283203125,
"epoch": 2.992156862745098,
"grad_norm": 0.630348147185592,
"learning_rate": 8.426222418311814e-10,
"loss": 0.3024,
"mean_token_accuracy": 0.8867057710886002,
"num_tokens": 327746883.0,
"step": 763
},
{
"entropy": 0.383331298828125,
"epoch": 2.996078431372549,
"grad_norm": 0.6041240098580235,
"learning_rate": 3.745016960665648e-10,
"loss": 0.3177,
"mean_token_accuracy": 0.8889250578358769,
"num_tokens": 328189791.0,
"step": 764
},
{
"entropy": 0.379364013671875,
"epoch": 3.0,
"grad_norm": 0.5940158938944181,
"learning_rate": 9.362586230632353e-11,
"loss": 0.3202,
"mean_token_accuracy": 0.8860361827537417,
"num_tokens": 328632895.0,
"step": 765
},
{
"epoch": 3.0,
"step": 765,
"total_flos": 607742654087168.0,
"train_loss": 0.4391140220601574,
"train_runtime": 58765.7127,
"train_samples_per_second": 1.258,
"train_steps_per_second": 0.013
}
],
"logging_steps": 1,
"max_steps": 765,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 64,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 607742654087168.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}