{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 765, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.58074951171875, "epoch": 0.00392156862745098, "grad_norm": 5.946658172831002, "learning_rate": 0.0, "loss": 1.4047, "mean_token_accuracy": 0.6541401473805308, "num_tokens": 394099.0, "step": 1 }, { "entropy": 0.57122802734375, "epoch": 0.00784313725490196, "grad_norm": 5.621097040674523, "learning_rate": 5.128205128205128e-07, "loss": 1.3784, "mean_token_accuracy": 0.6572609562426805, "num_tokens": 836994.0, "step": 2 }, { "entropy": 0.568328857421875, "epoch": 0.011764705882352941, "grad_norm": 5.89101271491728, "learning_rate": 1.0256410256410257e-06, "loss": 1.3794, "mean_token_accuracy": 0.6572064198553562, "num_tokens": 1266293.0, "step": 3 }, { "entropy": 0.559295654296875, "epoch": 0.01568627450980392, "grad_norm": 5.7409378697761975, "learning_rate": 1.5384615384615387e-06, "loss": 1.4025, "mean_token_accuracy": 0.6549294730648398, "num_tokens": 1704596.0, "step": 4 }, { "entropy": 0.5743408203125, "epoch": 0.0196078431372549, "grad_norm": 5.80997969854973, "learning_rate": 2.0512820512820513e-06, "loss": 1.3921, "mean_token_accuracy": 0.6552458582445979, "num_tokens": 2124104.0, "step": 5 }, { "entropy": 0.5599365234375, "epoch": 0.023529411764705882, "grad_norm": 5.314201402415645, "learning_rate": 2.564102564102564e-06, "loss": 1.3713, "mean_token_accuracy": 0.6574546648189425, "num_tokens": 2587214.0, "step": 6 }, { "entropy": 0.565338134765625, "epoch": 0.027450980392156862, "grad_norm": 5.22023313812165, "learning_rate": 3.0769230769230774e-06, "loss": 1.3643, "mean_token_accuracy": 0.6583243450149894, "num_tokens": 3036984.0, "step": 7 }, { "entropy": 0.565338134765625, "epoch": 0.03137254901960784, "grad_norm": 4.414498696886829, "learning_rate": 3.58974358974359e-06, "loss": 1.2991, "mean_token_accuracy": 0.671512926928699, "num_tokens": 3486417.0, "step": 8 }, { "entropy": 0.557159423828125, "epoch": 0.03529411764705882, "grad_norm": 4.436540831930959, "learning_rate": 4.102564102564103e-06, "loss": 1.2795, "mean_token_accuracy": 0.670446545816958, "num_tokens": 3929660.0, "step": 9 }, { "entropy": 0.591888427734375, "epoch": 0.0392156862745098, "grad_norm": 3.712979430380243, "learning_rate": 4.615384615384616e-06, "loss": 1.1742, "mean_token_accuracy": 0.6933586550876498, "num_tokens": 4322786.0, "step": 10 }, { "entropy": 0.5743408203125, "epoch": 0.043137254901960784, "grad_norm": 3.424125055764591, "learning_rate": 5.128205128205128e-06, "loss": 1.1474, "mean_token_accuracy": 0.6944275395944715, "num_tokens": 4734477.0, "step": 11 }, { "entropy": 0.555755615234375, "epoch": 0.047058823529411764, "grad_norm": 3.226923962462494, "learning_rate": 5.641025641025641e-06, "loss": 1.1461, "mean_token_accuracy": 0.6922548627480865, "num_tokens": 5179678.0, "step": 12 }, { "entropy": 0.535308837890625, "epoch": 0.050980392156862744, "grad_norm": 4.36070110135058, "learning_rate": 6.153846153846155e-06, "loss": 1.0418, "mean_token_accuracy": 0.7161724548786879, "num_tokens": 5610766.0, "step": 13 }, { "entropy": 0.535797119140625, "epoch": 0.054901960784313725, "grad_norm": 4.551396130956159, "learning_rate": 6.666666666666667e-06, "loss": 1.0402, "mean_token_accuracy": 0.7155537949874997, "num_tokens": 6047549.0, "step": 14 }, { "entropy": 0.55712890625, "epoch": 0.058823529411764705, "grad_norm": 3.8573225106802878, "learning_rate": 7.17948717948718e-06, "loss": 0.9984, "mean_token_accuracy": 0.7221314841881394, "num_tokens": 6462160.0, "step": 15 }, { "entropy": 0.533477783203125, "epoch": 0.06274509803921569, "grad_norm": 3.288341958711061, "learning_rate": 7.692307692307694e-06, "loss": 0.9651, "mean_token_accuracy": 0.7295441031455994, "num_tokens": 6903680.0, "step": 16 }, { "entropy": 0.540863037109375, "epoch": 0.06666666666666667, "grad_norm": 2.7586452982074854, "learning_rate": 8.205128205128205e-06, "loss": 0.9166, "mean_token_accuracy": 0.7403897074982524, "num_tokens": 7336790.0, "step": 17 }, { "entropy": 0.542144775390625, "epoch": 0.07058823529411765, "grad_norm": 3.9060092823437444, "learning_rate": 8.717948717948719e-06, "loss": 0.9228, "mean_token_accuracy": 0.7334894333034754, "num_tokens": 7778939.0, "step": 18 }, { "entropy": 0.54852294921875, "epoch": 0.07450980392156863, "grad_norm": 3.6701854828060054, "learning_rate": 9.230769230769232e-06, "loss": 0.8874, "mean_token_accuracy": 0.7458300339058042, "num_tokens": 8178997.0, "step": 19 }, { "entropy": 0.542633056640625, "epoch": 0.0784313725490196, "grad_norm": 2.959831958866965, "learning_rate": 9.743589743589744e-06, "loss": 0.8613, "mean_token_accuracy": 0.7516397852450609, "num_tokens": 8590073.0, "step": 20 }, { "entropy": 0.53009033203125, "epoch": 0.08235294117647059, "grad_norm": 2.5206963996671266, "learning_rate": 1.0256410256410256e-05, "loss": 0.8448, "mean_token_accuracy": 0.7533183237537742, "num_tokens": 9023810.0, "step": 21 }, { "entropy": 0.545440673828125, "epoch": 0.08627450980392157, "grad_norm": 2.6387540158464966, "learning_rate": 1.076923076923077e-05, "loss": 0.8468, "mean_token_accuracy": 0.7532298862934113, "num_tokens": 9454262.0, "step": 22 }, { "entropy": 0.530426025390625, "epoch": 0.09019607843137255, "grad_norm": 2.2953463674608385, "learning_rate": 1.1282051282051283e-05, "loss": 0.8381, "mean_token_accuracy": 0.7586212726309896, "num_tokens": 9903204.0, "step": 23 }, { "entropy": 0.530609130859375, "epoch": 0.09411764705882353, "grad_norm": 2.233239119821126, "learning_rate": 1.1794871794871796e-05, "loss": 0.7955, "mean_token_accuracy": 0.7648084424436092, "num_tokens": 10332966.0, "step": 24 }, { "entropy": 0.549896240234375, "epoch": 0.09803921568627451, "grad_norm": 2.298568482502839, "learning_rate": 1.230769230769231e-05, "loss": 0.7833, "mean_token_accuracy": 0.7674612868577242, "num_tokens": 10758034.0, "step": 25 }, { "entropy": 0.532623291015625, "epoch": 0.10196078431372549, "grad_norm": 2.1793696589064484, "learning_rate": 1.2820512820512823e-05, "loss": 0.7821, "mean_token_accuracy": 0.7678138092160225, "num_tokens": 11188541.0, "step": 26 }, { "entropy": 0.5162353515625, "epoch": 0.10588235294117647, "grad_norm": 1.8946351853054721, "learning_rate": 1.3333333333333333e-05, "loss": 0.7349, "mean_token_accuracy": 0.7782322531566024, "num_tokens": 11604712.0, "step": 27 }, { "entropy": 0.52069091796875, "epoch": 0.10980392156862745, "grad_norm": 1.773459568207231, "learning_rate": 1.3846153846153847e-05, "loss": 0.7433, "mean_token_accuracy": 0.7758480096235871, "num_tokens": 12040052.0, "step": 28 }, { "entropy": 0.514068603515625, "epoch": 0.11372549019607843, "grad_norm": 1.721966950909518, "learning_rate": 1.435897435897436e-05, "loss": 0.7389, "mean_token_accuracy": 0.7765677766874433, "num_tokens": 12466998.0, "step": 29 }, { "entropy": 0.512847900390625, "epoch": 0.11764705882352941, "grad_norm": 2.021391596891171, "learning_rate": 1.4871794871794874e-05, "loss": 0.7247, "mean_token_accuracy": 0.7790881004184484, "num_tokens": 12901157.0, "step": 30 }, { "entropy": 0.517608642578125, "epoch": 0.12156862745098039, "grad_norm": 1.9325766267617275, "learning_rate": 1.5384615384615387e-05, "loss": 0.7287, "mean_token_accuracy": 0.7784575093537569, "num_tokens": 13338766.0, "step": 31 }, { "entropy": 0.5120849609375, "epoch": 0.12549019607843137, "grad_norm": 1.9477974950463575, "learning_rate": 1.5897435897435897e-05, "loss": 0.6998, "mean_token_accuracy": 0.7857342725619674, "num_tokens": 13766779.0, "step": 32 }, { "entropy": 0.519744873046875, "epoch": 0.12941176470588237, "grad_norm": 2.348720028802442, "learning_rate": 1.641025641025641e-05, "loss": 0.6994, "mean_token_accuracy": 0.7838903805240989, "num_tokens": 14181903.0, "step": 33 }, { "entropy": 0.501556396484375, "epoch": 0.13333333333333333, "grad_norm": 1.693617785569785, "learning_rate": 1.6923076923076924e-05, "loss": 0.7034, "mean_token_accuracy": 0.7820147024467587, "num_tokens": 14621503.0, "step": 34 }, { "entropy": 0.50091552734375, "epoch": 0.13725490196078433, "grad_norm": 1.8273930967809657, "learning_rate": 1.7435897435897438e-05, "loss": 0.693, "mean_token_accuracy": 0.7863769382238388, "num_tokens": 15047501.0, "step": 35 }, { "entropy": 0.506439208984375, "epoch": 0.1411764705882353, "grad_norm": 1.7334306818931762, "learning_rate": 1.794871794871795e-05, "loss": 0.6676, "mean_token_accuracy": 0.789363824762404, "num_tokens": 15469986.0, "step": 36 }, { "entropy": 0.50177001953125, "epoch": 0.1450980392156863, "grad_norm": 1.9378805916268533, "learning_rate": 1.8461538461538465e-05, "loss": 0.6858, "mean_token_accuracy": 0.7883838685229421, "num_tokens": 15911435.0, "step": 37 }, { "entropy": 0.502349853515625, "epoch": 0.14901960784313725, "grad_norm": 1.7939770238739534, "learning_rate": 1.8974358974358975e-05, "loss": 0.6574, "mean_token_accuracy": 0.7959268698468804, "num_tokens": 16329945.0, "step": 38 }, { "entropy": 0.50067138671875, "epoch": 0.15294117647058825, "grad_norm": 1.7417461015153004, "learning_rate": 1.9487179487179488e-05, "loss": 0.6562, "mean_token_accuracy": 0.7929449649527669, "num_tokens": 16759988.0, "step": 39 }, { "entropy": 0.504241943359375, "epoch": 0.1568627450980392, "grad_norm": 1.7893435723544215, "learning_rate": 2e-05, "loss": 0.6543, "mean_token_accuracy": 0.7933545038104057, "num_tokens": 17192249.0, "step": 40 }, { "entropy": 0.497528076171875, "epoch": 0.1607843137254902, "grad_norm": 1.6271906758881933, "learning_rate": 1.9999906374137693e-05, "loss": 0.667, "mean_token_accuracy": 0.7928741071373224, "num_tokens": 17635747.0, "step": 41 }, { "entropy": 0.499053955078125, "epoch": 0.16470588235294117, "grad_norm": 1.5398454598257563, "learning_rate": 1.9999625498303936e-05, "loss": 0.6406, "mean_token_accuracy": 0.7975211972370744, "num_tokens": 18069109.0, "step": 42 }, { "entropy": 0.494903564453125, "epoch": 0.16862745098039217, "grad_norm": 1.5523620609569904, "learning_rate": 1.999915737775817e-05, "loss": 0.661, "mean_token_accuracy": 0.791739515028894, "num_tokens": 18529022.0, "step": 43 }, { "entropy": 0.5147705078125, "epoch": 0.17254901960784313, "grad_norm": 1.7516517318464624, "learning_rate": 1.999850202126604e-05, "loss": 0.6412, "mean_token_accuracy": 0.7992083700373769, "num_tokens": 18950266.0, "step": 44 }, { "entropy": 0.493988037109375, "epoch": 0.17647058823529413, "grad_norm": 1.443212801622469, "learning_rate": 1.9997659441099205e-05, "loss": 0.6292, "mean_token_accuracy": 0.8008020855486393, "num_tokens": 19382389.0, "step": 45 }, { "entropy": 0.493621826171875, "epoch": 0.1803921568627451, "grad_norm": 1.8120426269144319, "learning_rate": 1.9996629653035128e-05, "loss": 0.6301, "mean_token_accuracy": 0.7986322436481714, "num_tokens": 19814774.0, "step": 46 }, { "entropy": 0.483428955078125, "epoch": 0.1843137254901961, "grad_norm": 1.504152484077655, "learning_rate": 1.999541267635676e-05, "loss": 0.6146, "mean_token_accuracy": 0.8071837406605482, "num_tokens": 20252726.0, "step": 47 }, { "entropy": 0.48736572265625, "epoch": 0.18823529411764706, "grad_norm": 1.5994176249090368, "learning_rate": 1.999400853385221e-05, "loss": 0.6079, "mean_token_accuracy": 0.8068799478933215, "num_tokens": 20693231.0, "step": 48 }, { "entropy": 0.503387451171875, "epoch": 0.19215686274509805, "grad_norm": 1.6595523586084884, "learning_rate": 1.999241725181428e-05, "loss": 0.622, "mean_token_accuracy": 0.8015562687069178, "num_tokens": 21098332.0, "step": 49 }, { "entropy": 0.4970703125, "epoch": 0.19607843137254902, "grad_norm": 1.7649764098243288, "learning_rate": 1.9990638860040007e-05, "loss": 0.6198, "mean_token_accuracy": 0.8020276734605432, "num_tokens": 21516995.0, "step": 50 }, { "entropy": 0.49176025390625, "epoch": 0.2, "grad_norm": 1.5530289762258092, "learning_rate": 1.9988673391830082e-05, "loss": 0.6094, "mean_token_accuracy": 0.805065156891942, "num_tokens": 21940900.0, "step": 51 }, { "entropy": 0.486602783203125, "epoch": 0.20392156862745098, "grad_norm": 1.5600492363872476, "learning_rate": 1.9986520883988233e-05, "loss": 0.6072, "mean_token_accuracy": 0.8033135803416371, "num_tokens": 22373270.0, "step": 52 }, { "entropy": 0.49688720703125, "epoch": 0.20784313725490197, "grad_norm": 1.3244940248118993, "learning_rate": 1.9984181376820542e-05, "loss": 0.6056, "mean_token_accuracy": 0.804581237025559, "num_tokens": 22795686.0, "step": 53 }, { "entropy": 0.477386474609375, "epoch": 0.21176470588235294, "grad_norm": 1.4188422513941414, "learning_rate": 1.9981654914134684e-05, "loss": 0.613, "mean_token_accuracy": 0.8030998045578599, "num_tokens": 23237348.0, "step": 54 }, { "entropy": 0.491363525390625, "epoch": 0.21568627450980393, "grad_norm": 1.3877977869185212, "learning_rate": 1.997894154323911e-05, "loss": 0.5934, "mean_token_accuracy": 0.8087702514603734, "num_tokens": 23658608.0, "step": 55 }, { "entropy": 0.485931396484375, "epoch": 0.2196078431372549, "grad_norm": 1.3154982477831079, "learning_rate": 1.9976041314942156e-05, "loss": 0.5871, "mean_token_accuracy": 0.810174492187798, "num_tokens": 24088562.0, "step": 56 }, { "entropy": 0.4814453125, "epoch": 0.2235294117647059, "grad_norm": 1.4183101521509853, "learning_rate": 1.99729542835511e-05, "loss": 0.61, "mean_token_accuracy": 0.8048477824777365, "num_tokens": 24525442.0, "step": 57 }, { "entropy": 0.48516845703125, "epoch": 0.22745098039215686, "grad_norm": 1.3527504787456999, "learning_rate": 1.9969680506871138e-05, "loss": 0.5784, "mean_token_accuracy": 0.8116851877421141, "num_tokens": 24950326.0, "step": 58 }, { "entropy": 0.482147216796875, "epoch": 0.23137254901960785, "grad_norm": 1.4718082614826202, "learning_rate": 1.9966220046204295e-05, "loss": 0.5971, "mean_token_accuracy": 0.8080776166170835, "num_tokens": 25389519.0, "step": 59 }, { "entropy": 0.48907470703125, "epoch": 0.23529411764705882, "grad_norm": 1.4367294679131557, "learning_rate": 1.99625729663483e-05, "loss": 0.599, "mean_token_accuracy": 0.8051031418144703, "num_tokens": 25815656.0, "step": 60 }, { "entropy": 0.485626220703125, "epoch": 0.23921568627450981, "grad_norm": 1.5409462289066693, "learning_rate": 1.995873933559535e-05, "loss": 0.5969, "mean_token_accuracy": 0.8079076996073127, "num_tokens": 26249330.0, "step": 61 }, { "entropy": 0.47406005859375, "epoch": 0.24313725490196078, "grad_norm": 1.2353178316165938, "learning_rate": 1.9954719225730847e-05, "loss": 0.5713, "mean_token_accuracy": 0.8120078714564443, "num_tokens": 26683828.0, "step": 62 }, { "entropy": 0.474151611328125, "epoch": 0.24705882352941178, "grad_norm": 1.2967444220306512, "learning_rate": 1.9950512712032038e-05, "loss": 0.5836, "mean_token_accuracy": 0.810034915804863, "num_tokens": 27116083.0, "step": 63 }, { "entropy": 0.474334716796875, "epoch": 0.25098039215686274, "grad_norm": 1.3949569129085198, "learning_rate": 1.9946119873266615e-05, "loss": 0.5853, "mean_token_accuracy": 0.8108502132818103, "num_tokens": 27564582.0, "step": 64 }, { "entropy": 0.477752685546875, "epoch": 0.2549019607843137, "grad_norm": 1.1933833335696828, "learning_rate": 1.9941540791691245e-05, "loss": 0.5797, "mean_token_accuracy": 0.8111855685710907, "num_tokens": 28010664.0, "step": 65 }, { "entropy": 0.485443115234375, "epoch": 0.25882352941176473, "grad_norm": 1.263992447554956, "learning_rate": 1.9936775553050017e-05, "loss": 0.5616, "mean_token_accuracy": 0.8182382667437196, "num_tokens": 28423217.0, "step": 66 }, { "entropy": 0.474945068359375, "epoch": 0.2627450980392157, "grad_norm": 1.327268023915043, "learning_rate": 1.993182424657285e-05, "loss": 0.5833, "mean_token_accuracy": 0.8107379814609885, "num_tokens": 28859600.0, "step": 67 }, { "entropy": 0.47674560546875, "epoch": 0.26666666666666666, "grad_norm": 1.424999013681543, "learning_rate": 1.9926686964973813e-05, "loss": 0.5725, "mean_token_accuracy": 0.8136901557445526, "num_tokens": 29274182.0, "step": 68 }, { "entropy": 0.480072021484375, "epoch": 0.27058823529411763, "grad_norm": 1.418343251105222, "learning_rate": 1.9921363804449383e-05, "loss": 0.5875, "mean_token_accuracy": 0.8093108516186476, "num_tokens": 29705582.0, "step": 69 }, { "entropy": 0.47283935546875, "epoch": 0.27450980392156865, "grad_norm": 1.4521724256504855, "learning_rate": 1.9915854864676665e-05, "loss": 0.5891, "mean_token_accuracy": 0.8094787122681737, "num_tokens": 30137961.0, "step": 70 }, { "entropy": 0.466949462890625, "epoch": 0.2784313725490196, "grad_norm": 1.2792010927676387, "learning_rate": 1.9910160248811502e-05, "loss": 0.5723, "mean_token_accuracy": 0.8133273124694824, "num_tokens": 30574142.0, "step": 71 }, { "entropy": 0.474884033203125, "epoch": 0.2823529411764706, "grad_norm": 1.6473953355135174, "learning_rate": 1.9904280063486563e-05, "loss": 0.5673, "mean_token_accuracy": 0.8135095341131091, "num_tokens": 31007161.0, "step": 72 }, { "entropy": 0.480682373046875, "epoch": 0.28627450980392155, "grad_norm": 1.4162141272460542, "learning_rate": 1.989821441880933e-05, "loss": 0.5749, "mean_token_accuracy": 0.8136226320639253, "num_tokens": 31434909.0, "step": 73 }, { "entropy": 0.4854736328125, "epoch": 0.2901960784313726, "grad_norm": 1.4648510763716496, "learning_rate": 1.9891963428360043e-05, "loss": 0.5757, "mean_token_accuracy": 0.8133104396983981, "num_tokens": 31858663.0, "step": 74 }, { "entropy": 0.486297607421875, "epoch": 0.29411764705882354, "grad_norm": 1.3901996305935094, "learning_rate": 1.9885527209189577e-05, "loss": 0.5591, "mean_token_accuracy": 0.815345604903996, "num_tokens": 32263979.0, "step": 75 }, { "entropy": 0.474151611328125, "epoch": 0.2980392156862745, "grad_norm": 1.416507682950556, "learning_rate": 1.9878905881817254e-05, "loss": 0.5565, "mean_token_accuracy": 0.8163448050618172, "num_tokens": 32681289.0, "step": 76 }, { "entropy": 0.46917724609375, "epoch": 0.30196078431372547, "grad_norm": 1.1900128401920054, "learning_rate": 1.9872099570228556e-05, "loss": 0.5545, "mean_token_accuracy": 0.819454850628972, "num_tokens": 33117944.0, "step": 77 }, { "entropy": 0.469085693359375, "epoch": 0.3058823529411765, "grad_norm": 1.273205865013161, "learning_rate": 1.9865108401872856e-05, "loss": 0.5757, "mean_token_accuracy": 0.8083239402621984, "num_tokens": 33569669.0, "step": 78 }, { "entropy": 0.469879150390625, "epoch": 0.30980392156862746, "grad_norm": 1.3425586659856636, "learning_rate": 1.9857932507660983e-05, "loss": 0.5601, "mean_token_accuracy": 0.8155460730195045, "num_tokens": 34000102.0, "step": 79 }, { "entropy": 0.469207763671875, "epoch": 0.3137254901960784, "grad_norm": 1.1089356147226235, "learning_rate": 1.9850572021962788e-05, "loss": 0.5536, "mean_token_accuracy": 0.8189804637804627, "num_tokens": 34433110.0, "step": 80 }, { "entropy": 0.474700927734375, "epoch": 0.3176470588235294, "grad_norm": 1.2801670983454294, "learning_rate": 1.984302708260464e-05, "loss": 0.5681, "mean_token_accuracy": 0.8136863615363836, "num_tokens": 34884713.0, "step": 81 }, { "entropy": 0.470916748046875, "epoch": 0.3215686274509804, "grad_norm": 1.168115984160054, "learning_rate": 1.9835297830866827e-05, "loss": 0.5551, "mean_token_accuracy": 0.8168244622647762, "num_tokens": 35326962.0, "step": 82 }, { "entropy": 0.480133056640625, "epoch": 0.3254901960784314, "grad_norm": 1.3647458163316752, "learning_rate": 1.9827384411480924e-05, "loss": 0.5445, "mean_token_accuracy": 0.8184509659186006, "num_tokens": 35734884.0, "step": 83 }, { "entropy": 0.479736328125, "epoch": 0.32941176470588235, "grad_norm": 1.319678355325207, "learning_rate": 1.9819286972627066e-05, "loss": 0.5498, "mean_token_accuracy": 0.8164876466616988, "num_tokens": 36156751.0, "step": 84 }, { "entropy": 0.475982666015625, "epoch": 0.3333333333333333, "grad_norm": 1.2319398960631225, "learning_rate": 1.9811005665931205e-05, "loss": 0.5606, "mean_token_accuracy": 0.8145118253305554, "num_tokens": 36585317.0, "step": 85 }, { "entropy": 0.4764404296875, "epoch": 0.33725490196078434, "grad_norm": 1.2931356300554002, "learning_rate": 1.980254064646223e-05, "loss": 0.5549, "mean_token_accuracy": 0.8165107127279043, "num_tokens": 37018661.0, "step": 86 }, { "entropy": 0.4676513671875, "epoch": 0.3411764705882353, "grad_norm": 1.2256666633113913, "learning_rate": 1.9793892072729087e-05, "loss": 0.5453, "mean_token_accuracy": 0.8220384856685996, "num_tokens": 37444396.0, "step": 87 }, { "entropy": 0.464141845703125, "epoch": 0.34509803921568627, "grad_norm": 1.37296362447334, "learning_rate": 1.9785060106677818e-05, "loss": 0.5549, "mean_token_accuracy": 0.8157011214643717, "num_tokens": 37867417.0, "step": 88 }, { "entropy": 0.465057373046875, "epoch": 0.34901960784313724, "grad_norm": 1.3470877230759366, "learning_rate": 1.9776044913688503e-05, "loss": 0.5546, "mean_token_accuracy": 0.817422934807837, "num_tokens": 38323018.0, "step": 89 }, { "entropy": 0.467376708984375, "epoch": 0.35294117647058826, "grad_norm": 1.2559034842680823, "learning_rate": 1.976684666257219e-05, "loss": 0.5691, "mean_token_accuracy": 0.8121252795681357, "num_tokens": 38780992.0, "step": 90 }, { "entropy": 0.47125244140625, "epoch": 0.3568627450980392, "grad_norm": 1.2755650974437212, "learning_rate": 1.975746552556772e-05, "loss": 0.544, "mean_token_accuracy": 0.8212292147800326, "num_tokens": 39200876.0, "step": 91 }, { "entropy": 0.467437744140625, "epoch": 0.3607843137254902, "grad_norm": 1.0816461876931776, "learning_rate": 1.9747901678338496e-05, "loss": 0.5494, "mean_token_accuracy": 0.8188506988808513, "num_tokens": 39637981.0, "step": 92 }, { "entropy": 0.4659423828125, "epoch": 0.36470588235294116, "grad_norm": 1.0361403686428494, "learning_rate": 1.9738155299969207e-05, "loss": 0.5368, "mean_token_accuracy": 0.8218570798635483, "num_tokens": 40075528.0, "step": 93 }, { "entropy": 0.469573974609375, "epoch": 0.3686274509803922, "grad_norm": 1.2736306848315126, "learning_rate": 1.9728226572962474e-05, "loss": 0.5592, "mean_token_accuracy": 0.818149627186358, "num_tokens": 40513207.0, "step": 94 }, { "entropy": 0.4595947265625, "epoch": 0.37254901960784315, "grad_norm": 1.1806358194501028, "learning_rate": 1.9718115683235418e-05, "loss": 0.5355, "mean_token_accuracy": 0.8224614998325706, "num_tokens": 40936903.0, "step": 95 }, { "entropy": 0.46954345703125, "epoch": 0.3764705882352941, "grad_norm": 1.2155743490435709, "learning_rate": 1.9707822820116193e-05, "loss": 0.5347, "mean_token_accuracy": 0.8223023796454072, "num_tokens": 41351554.0, "step": 96 }, { "entropy": 0.45892333984375, "epoch": 0.3803921568627451, "grad_norm": 1.2902102201493495, "learning_rate": 1.9697348176340442e-05, "loss": 0.5605, "mean_token_accuracy": 0.8165110582485795, "num_tokens": 41818184.0, "step": 97 }, { "entropy": 0.458892822265625, "epoch": 0.3843137254901961, "grad_norm": 1.1739046775651587, "learning_rate": 1.9686691948047665e-05, "loss": 0.5268, "mean_token_accuracy": 0.8239621166139841, "num_tokens": 42253116.0, "step": 98 }, { "entropy": 0.4661865234375, "epoch": 0.38823529411764707, "grad_norm": 1.1935459708374951, "learning_rate": 1.9675854334777585e-05, "loss": 0.5605, "mean_token_accuracy": 0.8175614606589079, "num_tokens": 42700051.0, "step": 99 }, { "entropy": 0.45660400390625, "epoch": 0.39215686274509803, "grad_norm": 1.2019966132878526, "learning_rate": 1.966483553946637e-05, "loss": 0.5513, "mean_token_accuracy": 0.8168433653190732, "num_tokens": 43137034.0, "step": 100 }, { "entropy": 0.47357177734375, "epoch": 0.396078431372549, "grad_norm": 1.2701212654163299, "learning_rate": 1.9653635768442872e-05, "loss": 0.5448, "mean_token_accuracy": 0.8182221315801144, "num_tokens": 43550710.0, "step": 101 }, { "entropy": 0.469451904296875, "epoch": 0.4, "grad_norm": 1.191074358150651, "learning_rate": 1.964225523142473e-05, "loss": 0.5408, "mean_token_accuracy": 0.8197409231215715, "num_tokens": 43976597.0, "step": 102 }, { "entropy": 0.4630126953125, "epoch": 0.403921568627451, "grad_norm": 1.1845159595637624, "learning_rate": 1.9630694141514467e-05, "loss": 0.5391, "mean_token_accuracy": 0.8227794086560607, "num_tokens": 44386677.0, "step": 103 }, { "entropy": 0.45684814453125, "epoch": 0.40784313725490196, "grad_norm": 0.9700158907451113, "learning_rate": 1.9618952715195476e-05, "loss": 0.5366, "mean_token_accuracy": 0.8220989098772407, "num_tokens": 44824398.0, "step": 104 }, { "entropy": 0.466583251953125, "epoch": 0.4117647058823529, "grad_norm": 1.4454386759696154, "learning_rate": 1.9607031172327998e-05, "loss": 0.5436, "mean_token_accuracy": 0.8189141182228923, "num_tokens": 45253415.0, "step": 105 }, { "entropy": 0.46417236328125, "epoch": 0.41568627450980394, "grad_norm": 1.020453739389049, "learning_rate": 1.9594929736144978e-05, "loss": 0.5221, "mean_token_accuracy": 0.8261368265375495, "num_tokens": 45662349.0, "step": 106 }, { "entropy": 0.456329345703125, "epoch": 0.4196078431372549, "grad_norm": 1.2120811793285806, "learning_rate": 1.958264863324789e-05, "loss": 0.5345, "mean_token_accuracy": 0.8214567014947534, "num_tokens": 46116282.0, "step": 107 }, { "entropy": 0.451568603515625, "epoch": 0.4235294117647059, "grad_norm": 1.1449867557458162, "learning_rate": 1.9570188093602512e-05, "loss": 0.5368, "mean_token_accuracy": 0.8221517028287053, "num_tokens": 46561211.0, "step": 108 }, { "entropy": 0.46954345703125, "epoch": 0.42745098039215684, "grad_norm": 1.1463970481434735, "learning_rate": 1.955754835053459e-05, "loss": 0.5444, "mean_token_accuracy": 0.821798668242991, "num_tokens": 46996031.0, "step": 109 }, { "entropy": 0.46063232421875, "epoch": 0.43137254901960786, "grad_norm": 1.328722695820361, "learning_rate": 1.95447296407255e-05, "loss": 0.5332, "mean_token_accuracy": 0.8222680538892746, "num_tokens": 47440121.0, "step": 110 }, { "entropy": 0.468475341796875, "epoch": 0.43529411764705883, "grad_norm": 1.109295062470751, "learning_rate": 1.9531732204207787e-05, "loss": 0.548, "mean_token_accuracy": 0.8184197628870606, "num_tokens": 47877065.0, "step": 111 }, { "entropy": 0.47601318359375, "epoch": 0.4392156862745098, "grad_norm": 1.1607526849937826, "learning_rate": 1.9518556284360696e-05, "loss": 0.5298, "mean_token_accuracy": 0.822773078456521, "num_tokens": 48290264.0, "step": 112 }, { "entropy": 0.471038818359375, "epoch": 0.44313725490196076, "grad_norm": 1.020865323457694, "learning_rate": 1.95052021279056e-05, "loss": 0.521, "mean_token_accuracy": 0.8239856511354446, "num_tokens": 48704339.0, "step": 113 }, { "entropy": 0.461517333984375, "epoch": 0.4470588235294118, "grad_norm": 1.1750318383641127, "learning_rate": 1.9491669984901377e-05, "loss": 0.5309, "mean_token_accuracy": 0.8254398861899972, "num_tokens": 49132806.0, "step": 114 }, { "entropy": 0.45526123046875, "epoch": 0.45098039215686275, "grad_norm": 0.9995281650726873, "learning_rate": 1.947796010873974e-05, "loss": 0.5249, "mean_token_accuracy": 0.8260870166122913, "num_tokens": 49578930.0, "step": 115 }, { "entropy": 0.469482421875, "epoch": 0.4549019607843137, "grad_norm": 1.0746114295572882, "learning_rate": 1.9464072756140487e-05, "loss": 0.5328, "mean_token_accuracy": 0.8212811881676316, "num_tokens": 49996034.0, "step": 116 }, { "entropy": 0.472015380859375, "epoch": 0.4588235294117647, "grad_norm": 0.962126687778494, "learning_rate": 1.9450008187146685e-05, "loss": 0.5341, "mean_token_accuracy": 0.8235579496249557, "num_tokens": 50416951.0, "step": 117 }, { "entropy": 0.47576904296875, "epoch": 0.4627450980392157, "grad_norm": 1.0768479231849244, "learning_rate": 1.9435766665119823e-05, "loss": 0.5223, "mean_token_accuracy": 0.8253951445221901, "num_tokens": 50831955.0, "step": 118 }, { "entropy": 0.46588134765625, "epoch": 0.4666666666666667, "grad_norm": 1.1169527281919984, "learning_rate": 1.9421348456734844e-05, "loss": 0.5344, "mean_token_accuracy": 0.8200209140777588, "num_tokens": 51269078.0, "step": 119 }, { "entropy": 0.466888427734375, "epoch": 0.47058823529411764, "grad_norm": 1.0439601462864598, "learning_rate": 1.9406753831975202e-05, "loss": 0.5452, "mean_token_accuracy": 0.8209556825459003, "num_tokens": 51716754.0, "step": 120 }, { "entropy": 0.46319580078125, "epoch": 0.4745098039215686, "grad_norm": 1.0226579656321964, "learning_rate": 1.939198306412775e-05, "loss": 0.5218, "mean_token_accuracy": 0.8236983455717564, "num_tokens": 52135573.0, "step": 121 }, { "entropy": 0.458648681640625, "epoch": 0.47843137254901963, "grad_norm": 0.9029783127472882, "learning_rate": 1.9377036429777673e-05, "loss": 0.5149, "mean_token_accuracy": 0.8285743938758969, "num_tokens": 52566553.0, "step": 122 }, { "entropy": 0.458740234375, "epoch": 0.4823529411764706, "grad_norm": 1.0908467464044844, "learning_rate": 1.9361914208803264e-05, "loss": 0.5437, "mean_token_accuracy": 0.8188828593119979, "num_tokens": 53005267.0, "step": 123 }, { "entropy": 0.470458984375, "epoch": 0.48627450980392156, "grad_norm": 1.0725220647845308, "learning_rate": 1.934661668437073e-05, "loss": 0.5107, "mean_token_accuracy": 0.8296375386416912, "num_tokens": 53421602.0, "step": 124 }, { "entropy": 0.470245361328125, "epoch": 0.49019607843137253, "grad_norm": 1.133065623203244, "learning_rate": 1.9331144142928853e-05, "loss": 0.5308, "mean_token_accuracy": 0.8249672232195735, "num_tokens": 53830156.0, "step": 125 }, { "entropy": 0.458160400390625, "epoch": 0.49411764705882355, "grad_norm": 1.043613743096017, "learning_rate": 1.9315496874203637e-05, "loss": 0.5269, "mean_token_accuracy": 0.8225527862086892, "num_tokens": 54260269.0, "step": 126 }, { "entropy": 0.454193115234375, "epoch": 0.4980392156862745, "grad_norm": 1.0411321135052143, "learning_rate": 1.929967517119289e-05, "loss": 0.526, "mean_token_accuracy": 0.8245013160631061, "num_tokens": 54692550.0, "step": 127 }, { "entropy": 0.45831298828125, "epoch": 0.5019607843137255, "grad_norm": 1.153644894412413, "learning_rate": 1.9283679330160726e-05, "loss": 0.5372, "mean_token_accuracy": 0.8229446588084102, "num_tokens": 55129663.0, "step": 128 }, { "entropy": 0.454833984375, "epoch": 0.5058823529411764, "grad_norm": 1.0680654714661098, "learning_rate": 1.926750965063203e-05, "loss": 0.5218, "mean_token_accuracy": 0.8256676206365228, "num_tokens": 55558261.0, "step": 129 }, { "entropy": 0.45941162109375, "epoch": 0.5098039215686274, "grad_norm": 0.9504005715051883, "learning_rate": 1.9251166435386837e-05, "loss": 0.5363, "mean_token_accuracy": 0.8229048978537321, "num_tokens": 55998017.0, "step": 130 }, { "entropy": 0.46295166015625, "epoch": 0.5137254901960784, "grad_norm": 1.095073383543885, "learning_rate": 1.9234649990454678e-05, "loss": 0.5188, "mean_token_accuracy": 0.8264925237745047, "num_tokens": 56425710.0, "step": 131 }, { "entropy": 0.466278076171875, "epoch": 0.5176470588235295, "grad_norm": 1.0136365133134169, "learning_rate": 1.921796062510882e-05, "loss": 0.521, "mean_token_accuracy": 0.8278189906850457, "num_tokens": 56852573.0, "step": 132 }, { "entropy": 0.465911865234375, "epoch": 0.5215686274509804, "grad_norm": 1.0774587353071405, "learning_rate": 1.920109865186052e-05, "loss": 0.5321, "mean_token_accuracy": 0.8233054708689451, "num_tokens": 57303340.0, "step": 133 }, { "entropy": 0.455169677734375, "epoch": 0.5254901960784314, "grad_norm": 1.1845295561863336, "learning_rate": 1.9184064386453127e-05, "loss": 0.5279, "mean_token_accuracy": 0.8221144881099463, "num_tokens": 57749444.0, "step": 134 }, { "entropy": 0.455780029296875, "epoch": 0.5294117647058824, "grad_norm": 1.0251726517418989, "learning_rate": 1.9166858147856204e-05, "loss": 0.5169, "mean_token_accuracy": 0.8259268430992961, "num_tokens": 58175991.0, "step": 135 }, { "entropy": 0.454925537109375, "epoch": 0.5333333333333333, "grad_norm": 1.1572076691548627, "learning_rate": 1.9149480258259535e-05, "loss": 0.5255, "mean_token_accuracy": 0.8251020405441523, "num_tokens": 58609070.0, "step": 136 }, { "entropy": 0.464202880859375, "epoch": 0.5372549019607843, "grad_norm": 1.1385262369773776, "learning_rate": 1.9131931043067092e-05, "loss": 0.5208, "mean_token_accuracy": 0.824607603251934, "num_tokens": 59034229.0, "step": 137 }, { "entropy": 0.462188720703125, "epoch": 0.5411764705882353, "grad_norm": 1.0421945208646466, "learning_rate": 1.911421083089097e-05, "loss": 0.5161, "mean_token_accuracy": 0.8247706349939108, "num_tokens": 59474279.0, "step": 138 }, { "entropy": 0.455352783203125, "epoch": 0.5450980392156862, "grad_norm": 0.9905728313025403, "learning_rate": 1.9096319953545186e-05, "loss": 0.5145, "mean_token_accuracy": 0.826055621728301, "num_tokens": 59909618.0, "step": 139 }, { "entropy": 0.4610595703125, "epoch": 0.5490196078431373, "grad_norm": 1.2224526998602292, "learning_rate": 1.907825874603951e-05, "loss": 0.516, "mean_token_accuracy": 0.827407187782228, "num_tokens": 60333313.0, "step": 140 }, { "entropy": 0.45953369140625, "epoch": 0.5529411764705883, "grad_norm": 1.031383956090397, "learning_rate": 1.9060027546573164e-05, "loss": 0.5084, "mean_token_accuracy": 0.8289670627564192, "num_tokens": 60769519.0, "step": 141 }, { "entropy": 0.4581298828125, "epoch": 0.5568627450980392, "grad_norm": 0.9286455749135041, "learning_rate": 1.9041626696528503e-05, "loss": 0.5159, "mean_token_accuracy": 0.8258885480463505, "num_tokens": 61197717.0, "step": 142 }, { "entropy": 0.46160888671875, "epoch": 0.5607843137254902, "grad_norm": 1.043604648324428, "learning_rate": 1.9023056540464622e-05, "loss": 0.5088, "mean_token_accuracy": 0.8298035254701972, "num_tokens": 61623020.0, "step": 143 }, { "entropy": 0.45831298828125, "epoch": 0.5647058823529412, "grad_norm": 0.9853779681682011, "learning_rate": 1.9004317426110888e-05, "loss": 0.5081, "mean_token_accuracy": 0.8284985879436135, "num_tokens": 62041651.0, "step": 144 }, { "entropy": 0.45892333984375, "epoch": 0.5686274509803921, "grad_norm": 1.0256172993248214, "learning_rate": 1.8985409704360457e-05, "loss": 0.5237, "mean_token_accuracy": 0.8231825344264507, "num_tokens": 62478366.0, "step": 145 }, { "entropy": 0.4647216796875, "epoch": 0.5725490196078431, "grad_norm": 1.0172104304899114, "learning_rate": 1.8966333729263674e-05, "loss": 0.5017, "mean_token_accuracy": 0.8279380407184362, "num_tokens": 62893575.0, "step": 146 }, { "entropy": 0.457427978515625, "epoch": 0.5764705882352941, "grad_norm": 0.9671420496140763, "learning_rate": 1.8947089858021465e-05, "loss": 0.5169, "mean_token_accuracy": 0.8288818374276161, "num_tokens": 63347404.0, "step": 147 }, { "entropy": 0.46197509765625, "epoch": 0.5803921568627451, "grad_norm": 0.9552903663153749, "learning_rate": 1.892767845097864e-05, "loss": 0.5218, "mean_token_accuracy": 0.8258289452642202, "num_tokens": 63781233.0, "step": 148 }, { "entropy": 0.458984375, "epoch": 0.5843137254901961, "grad_norm": 0.8972556890492476, "learning_rate": 1.8908099871617137e-05, "loss": 0.5101, "mean_token_accuracy": 0.827708194963634, "num_tokens": 64205982.0, "step": 149 }, { "entropy": 0.45684814453125, "epoch": 0.5882352941176471, "grad_norm": 0.9940426982366383, "learning_rate": 1.8888354486549238e-05, "loss": 0.5292, "mean_token_accuracy": 0.8226973535493016, "num_tokens": 64620518.0, "step": 150 }, { "entropy": 0.451263427734375, "epoch": 0.592156862745098, "grad_norm": 1.0209199020984132, "learning_rate": 1.886844266551068e-05, "loss": 0.5037, "mean_token_accuracy": 0.8314398853108287, "num_tokens": 65055500.0, "step": 151 }, { "entropy": 0.450714111328125, "epoch": 0.596078431372549, "grad_norm": 1.0567665577574286, "learning_rate": 1.8848364781353744e-05, "loss": 0.5009, "mean_token_accuracy": 0.828608175739646, "num_tokens": 65478309.0, "step": 152 }, { "entropy": 0.44635009765625, "epoch": 0.6, "grad_norm": 1.0017296774811835, "learning_rate": 1.882812121004028e-05, "loss": 0.4949, "mean_token_accuracy": 0.8324599312618375, "num_tokens": 65910742.0, "step": 153 }, { "entropy": 0.453643798828125, "epoch": 0.6039215686274509, "grad_norm": 1.0657800071579415, "learning_rate": 1.8807712330634645e-05, "loss": 0.4935, "mean_token_accuracy": 0.8339864388108253, "num_tokens": 66319397.0, "step": 154 }, { "entropy": 0.455413818359375, "epoch": 0.6078431372549019, "grad_norm": 1.0028724888261045, "learning_rate": 1.878713852529663e-05, "loss": 0.524, "mean_token_accuracy": 0.8232726603746414, "num_tokens": 66754574.0, "step": 155 }, { "entropy": 0.455322265625, "epoch": 0.611764705882353, "grad_norm": 0.8811386198134418, "learning_rate": 1.8766400179274287e-05, "loss": 0.4927, "mean_token_accuracy": 0.8332832558080554, "num_tokens": 67168621.0, "step": 156 }, { "entropy": 0.45269775390625, "epoch": 0.615686274509804, "grad_norm": 0.9617989903466428, "learning_rate": 1.8745497680896722e-05, "loss": 0.5261, "mean_token_accuracy": 0.825565142557025, "num_tokens": 67617448.0, "step": 157 }, { "entropy": 0.46160888671875, "epoch": 0.6196078431372549, "grad_norm": 0.9722737962073038, "learning_rate": 1.8724431421566822e-05, "loss": 0.5062, "mean_token_accuracy": 0.8278914587572217, "num_tokens": 68042404.0, "step": 158 }, { "entropy": 0.4639892578125, "epoch": 0.6235294117647059, "grad_norm": 0.9314083104367087, "learning_rate": 1.870320179575393e-05, "loss": 0.5115, "mean_token_accuracy": 0.8274906072765589, "num_tokens": 68459798.0, "step": 159 }, { "entropy": 0.44720458984375, "epoch": 0.6274509803921569, "grad_norm": 0.9602745230104547, "learning_rate": 1.868180920098644e-05, "loss": 0.5033, "mean_token_accuracy": 0.8310739854350686, "num_tokens": 68889446.0, "step": 160 }, { "entropy": 0.4462890625, "epoch": 0.6313725490196078, "grad_norm": 0.9314028085759819, "learning_rate": 1.866025403784439e-05, "loss": 0.5136, "mean_token_accuracy": 0.8256695969030261, "num_tokens": 69299743.0, "step": 161 }, { "entropy": 0.442535400390625, "epoch": 0.6352941176470588, "grad_norm": 0.9736165597315848, "learning_rate": 1.8638536709951916e-05, "loss": 0.499, "mean_token_accuracy": 0.8318196469917893, "num_tokens": 69732061.0, "step": 162 }, { "entropy": 0.445587158203125, "epoch": 0.6392156862745098, "grad_norm": 1.101418386080784, "learning_rate": 1.861665762396974e-05, "loss": 0.5002, "mean_token_accuracy": 0.8316458566114306, "num_tokens": 70173908.0, "step": 163 }, { "entropy": 0.455230712890625, "epoch": 0.6431372549019608, "grad_norm": 0.9161407634822796, "learning_rate": 1.8594617189587515e-05, "loss": 0.4924, "mean_token_accuracy": 0.8328952239826322, "num_tokens": 70582274.0, "step": 164 }, { "entropy": 0.44940185546875, "epoch": 0.6470588235294118, "grad_norm": 0.9578481443615958, "learning_rate": 1.8572415819516174e-05, "loss": 0.5026, "mean_token_accuracy": 0.8308584401383996, "num_tokens": 71020923.0, "step": 165 }, { "entropy": 0.447174072265625, "epoch": 0.6509803921568628, "grad_norm": 0.8859183038806147, "learning_rate": 1.8550053929480202e-05, "loss": 0.5044, "mean_token_accuracy": 0.8281080648303032, "num_tokens": 71456687.0, "step": 166 }, { "entropy": 0.4407958984375, "epoch": 0.6549019607843137, "grad_norm": 1.073002915476322, "learning_rate": 1.8527531938209847e-05, "loss": 0.5064, "mean_token_accuracy": 0.8275053184479475, "num_tokens": 71895633.0, "step": 167 }, { "entropy": 0.44793701171875, "epoch": 0.6588235294117647, "grad_norm": 0.8588167816577594, "learning_rate": 1.8504850267433278e-05, "loss": 0.502, "mean_token_accuracy": 0.8324728878214955, "num_tokens": 72315957.0, "step": 168 }, { "entropy": 0.440399169921875, "epoch": 0.6627450980392157, "grad_norm": 0.9162787056521714, "learning_rate": 1.8482009341868696e-05, "loss": 0.5111, "mean_token_accuracy": 0.829299739561975, "num_tokens": 72758849.0, "step": 169 }, { "entropy": 0.449249267578125, "epoch": 0.6666666666666666, "grad_norm": 0.8975412677782875, "learning_rate": 1.8459009589216364e-05, "loss": 0.4852, "mean_token_accuracy": 0.8351985597983003, "num_tokens": 73180765.0, "step": 170 }, { "entropy": 0.444366455078125, "epoch": 0.6705882352941176, "grad_norm": 1.0823871467715829, "learning_rate": 1.843585144015063e-05, "loss": 0.495, "mean_token_accuracy": 0.8346499102190137, "num_tokens": 73588923.0, "step": 171 }, { "entropy": 0.443878173828125, "epoch": 0.6745098039215687, "grad_norm": 0.9614941949925443, "learning_rate": 1.8412535328311813e-05, "loss": 0.5077, "mean_token_accuracy": 0.8277076184749603, "num_tokens": 74045150.0, "step": 172 }, { "entropy": 0.442962646484375, "epoch": 0.6784313725490196, "grad_norm": 0.8983449509382112, "learning_rate": 1.838906169029814e-05, "loss": 0.4842, "mean_token_accuracy": 0.8344194013625383, "num_tokens": 74455581.0, "step": 173 }, { "entropy": 0.440032958984375, "epoch": 0.6823529411764706, "grad_norm": 0.8921437173146779, "learning_rate": 1.8365430965657527e-05, "loss": 0.4891, "mean_token_accuracy": 0.8333101095631719, "num_tokens": 74901722.0, "step": 174 }, { "entropy": 0.44647216796875, "epoch": 0.6862745098039216, "grad_norm": 0.9018886281577161, "learning_rate": 1.834164359687937e-05, "loss": 0.5169, "mean_token_accuracy": 0.8270498281344771, "num_tokens": 75346539.0, "step": 175 }, { "entropy": 0.46038818359375, "epoch": 0.6901960784313725, "grad_norm": 0.9208123316136007, "learning_rate": 1.8317700029386245e-05, "loss": 0.5019, "mean_token_accuracy": 0.8328892076388001, "num_tokens": 75755098.0, "step": 176 }, { "entropy": 0.454498291015625, "epoch": 0.6941176470588235, "grad_norm": 0.9713253889644098, "learning_rate": 1.829360071152559e-05, "loss": 0.4813, "mean_token_accuracy": 0.8358625834807754, "num_tokens": 76155127.0, "step": 177 }, { "entropy": 0.45330810546875, "epoch": 0.6980392156862745, "grad_norm": 0.9928686889194862, "learning_rate": 1.826934609456129e-05, "loss": 0.4968, "mean_token_accuracy": 0.8308405773714185, "num_tokens": 76581278.0, "step": 178 }, { "entropy": 0.448089599609375, "epoch": 0.7019607843137254, "grad_norm": 0.8728503339525145, "learning_rate": 1.8244936632665223e-05, "loss": 0.4943, "mean_token_accuracy": 0.8327812422066927, "num_tokens": 76984426.0, "step": 179 }, { "entropy": 0.442962646484375, "epoch": 0.7058823529411765, "grad_norm": 1.0407682234756923, "learning_rate": 1.8220372782908778e-05, "loss": 0.4951, "mean_token_accuracy": 0.8317067986354232, "num_tokens": 77421977.0, "step": 180 }, { "entropy": 0.440216064453125, "epoch": 0.7098039215686275, "grad_norm": 0.8599102112771374, "learning_rate": 1.8195655005254274e-05, "loss": 0.513, "mean_token_accuracy": 0.8280303832143545, "num_tokens": 77866296.0, "step": 181 }, { "entropy": 0.450531005859375, "epoch": 0.7137254901960784, "grad_norm": 0.89596271187072, "learning_rate": 1.8170783762546363e-05, "loss": 0.5075, "mean_token_accuracy": 0.8298424258828163, "num_tokens": 78304256.0, "step": 182 }, { "entropy": 0.4423828125, "epoch": 0.7176470588235294, "grad_norm": 0.8933849619381508, "learning_rate": 1.814575952050336e-05, "loss": 0.4934, "mean_token_accuracy": 0.8312811693176627, "num_tokens": 78730340.0, "step": 183 }, { "entropy": 0.449249267578125, "epoch": 0.7215686274509804, "grad_norm": 0.9127697862713822, "learning_rate": 1.8120582747708503e-05, "loss": 0.494, "mean_token_accuracy": 0.8302021278068423, "num_tokens": 79156747.0, "step": 184 }, { "entropy": 0.4403076171875, "epoch": 0.7254901960784313, "grad_norm": 0.9995234569515845, "learning_rate": 1.8095253915601207e-05, "loss": 0.486, "mean_token_accuracy": 0.8339323159307241, "num_tokens": 79582648.0, "step": 185 }, { "entropy": 0.4400634765625, "epoch": 0.7294117647058823, "grad_norm": 0.9755695900508942, "learning_rate": 1.8069773498468224e-05, "loss": 0.4915, "mean_token_accuracy": 0.8324770601466298, "num_tokens": 80006926.0, "step": 186 }, { "entropy": 0.43646240234375, "epoch": 0.7333333333333333, "grad_norm": 1.0501557093529328, "learning_rate": 1.804414197343476e-05, "loss": 0.4938, "mean_token_accuracy": 0.8323720088228583, "num_tokens": 80445925.0, "step": 187 }, { "entropy": 0.43475341796875, "epoch": 0.7372549019607844, "grad_norm": 0.9747183754878954, "learning_rate": 1.8018359820455535e-05, "loss": 0.5005, "mean_token_accuracy": 0.8321980787441134, "num_tokens": 80884102.0, "step": 188 }, { "entropy": 0.43695068359375, "epoch": 0.7411764705882353, "grad_norm": 1.013802709267682, "learning_rate": 1.799242752230582e-05, "loss": 0.4867, "mean_token_accuracy": 0.8352240175008774, "num_tokens": 81315280.0, "step": 189 }, { "entropy": 0.437896728515625, "epoch": 0.7450980392156863, "grad_norm": 0.8354277367263349, "learning_rate": 1.796634556457236e-05, "loss": 0.4958, "mean_token_accuracy": 0.8291263459250331, "num_tokens": 81748605.0, "step": 190 }, { "entropy": 0.4354248046875, "epoch": 0.7490196078431373, "grad_norm": 0.9816747402124396, "learning_rate": 1.794011443564432e-05, "loss": 0.483, "mean_token_accuracy": 0.8366444101557136, "num_tokens": 82175348.0, "step": 191 }, { "entropy": 0.44696044921875, "epoch": 0.7529411764705882, "grad_norm": 0.9513646004972339, "learning_rate": 1.791373462670411e-05, "loss": 0.4937, "mean_token_accuracy": 0.8334898129105568, "num_tokens": 82613145.0, "step": 192 }, { "entropy": 0.444091796875, "epoch": 0.7568627450980392, "grad_norm": 0.8255350035205672, "learning_rate": 1.7887206631718202e-05, "loss": 0.4876, "mean_token_accuracy": 0.8342526415362954, "num_tokens": 83047052.0, "step": 193 }, { "entropy": 0.44970703125, "epoch": 0.7607843137254902, "grad_norm": 0.8736860391365483, "learning_rate": 1.7860530947427878e-05, "loss": 0.4976, "mean_token_accuracy": 0.8308942569419742, "num_tokens": 83466934.0, "step": 194 }, { "entropy": 0.447509765625, "epoch": 0.7647058823529411, "grad_norm": 0.8782495436576085, "learning_rate": 1.7833708073339924e-05, "loss": 0.4943, "mean_token_accuracy": 0.832427485845983, "num_tokens": 83900582.0, "step": 195 }, { "entropy": 0.443267822265625, "epoch": 0.7686274509803922, "grad_norm": 0.863039317348048, "learning_rate": 1.780673851171728e-05, "loss": 0.5083, "mean_token_accuracy": 0.8297006255015731, "num_tokens": 84337148.0, "step": 196 }, { "entropy": 0.4403076171875, "epoch": 0.7725490196078432, "grad_norm": 0.8456444047616816, "learning_rate": 1.777962276756965e-05, "loss": 0.4791, "mean_token_accuracy": 0.8360334020107985, "num_tokens": 84757172.0, "step": 197 }, { "entropy": 0.442047119140625, "epoch": 0.7764705882352941, "grad_norm": 0.8417951704771196, "learning_rate": 1.7752361348644012e-05, "loss": 0.4849, "mean_token_accuracy": 0.8353043273091316, "num_tokens": 85186171.0, "step": 198 }, { "entropy": 0.44305419921875, "epoch": 0.7803921568627451, "grad_norm": 0.9116590851768235, "learning_rate": 1.7724954765415137e-05, "loss": 0.4834, "mean_token_accuracy": 0.8365112636238337, "num_tokens": 85616924.0, "step": 199 }, { "entropy": 0.445159912109375, "epoch": 0.7843137254901961, "grad_norm": 0.8973441814276175, "learning_rate": 1.769740353107602e-05, "loss": 0.4932, "mean_token_accuracy": 0.8347958726808429, "num_tokens": 86029797.0, "step": 200 }, { "entropy": 0.452972412109375, "epoch": 0.788235294117647, "grad_norm": 0.9120006674689222, "learning_rate": 1.766970816152828e-05, "loss": 0.4858, "mean_token_accuracy": 0.8320359215140343, "num_tokens": 86441674.0, "step": 201 }, { "entropy": 0.45050048828125, "epoch": 0.792156862745098, "grad_norm": 0.9530574679205193, "learning_rate": 1.7641869175372493e-05, "loss": 0.4986, "mean_token_accuracy": 0.8322794009000063, "num_tokens": 86871318.0, "step": 202 }, { "entropy": 0.449432373046875, "epoch": 0.796078431372549, "grad_norm": 0.7899417130805441, "learning_rate": 1.7613887093898466e-05, "loss": 0.479, "mean_token_accuracy": 0.8354767337441444, "num_tokens": 87301013.0, "step": 203 }, { "entropy": 0.4456787109375, "epoch": 0.8, "grad_norm": 0.8983278002195033, "learning_rate": 1.7585762441075504e-05, "loss": 0.4989, "mean_token_accuracy": 0.8304571500048041, "num_tokens": 87749700.0, "step": 204 }, { "entropy": 0.449981689453125, "epoch": 0.803921568627451, "grad_norm": 0.8482340735997149, "learning_rate": 1.7557495743542586e-05, "loss": 0.4913, "mean_token_accuracy": 0.8334829676896334, "num_tokens": 88201140.0, "step": 205 }, { "entropy": 0.446044921875, "epoch": 0.807843137254902, "grad_norm": 0.964208745237549, "learning_rate": 1.752908753059849e-05, "loss": 0.4923, "mean_token_accuracy": 0.8305018618702888, "num_tokens": 88633660.0, "step": 206 }, { "entropy": 0.443023681640625, "epoch": 0.8117647058823529, "grad_norm": 0.8368396835127176, "learning_rate": 1.7500538334191906e-05, "loss": 0.4852, "mean_token_accuracy": 0.8355229757726192, "num_tokens": 89048406.0, "step": 207 }, { "entropy": 0.4366455078125, "epoch": 0.8156862745098039, "grad_norm": 1.542876567727518, "learning_rate": 1.7471848688911465e-05, "loss": 0.482, "mean_token_accuracy": 0.8349215844646096, "num_tokens": 89469026.0, "step": 208 }, { "entropy": 0.441131591796875, "epoch": 0.8196078431372549, "grad_norm": 0.9628327965048106, "learning_rate": 1.7443019131975716e-05, "loss": 0.484, "mean_token_accuracy": 0.8347738357260823, "num_tokens": 89890734.0, "step": 209 }, { "entropy": 0.43792724609375, "epoch": 0.8235294117647058, "grad_norm": 0.8674843259606271, "learning_rate": 1.7414050203223092e-05, "loss": 0.4797, "mean_token_accuracy": 0.8360974583774805, "num_tokens": 90331223.0, "step": 210 }, { "entropy": 0.455902099609375, "epoch": 0.8274509803921568, "grad_norm": 0.8644106725290759, "learning_rate": 1.7384942445101772e-05, "loss": 0.4888, "mean_token_accuracy": 0.8351241312921047, "num_tokens": 90744739.0, "step": 211 }, { "entropy": 0.44989013671875, "epoch": 0.8313725490196079, "grad_norm": 0.8302621076190317, "learning_rate": 1.735569640265955e-05, "loss": 0.4739, "mean_token_accuracy": 0.8376046605408192, "num_tokens": 91158198.0, "step": 212 }, { "entropy": 0.454345703125, "epoch": 0.8352941176470589, "grad_norm": 0.870764317553708, "learning_rate": 1.7326312623533617e-05, "loss": 0.4778, "mean_token_accuracy": 0.8371938867494464, "num_tokens": 91579459.0, "step": 213 }, { "entropy": 0.446258544921875, "epoch": 0.8392156862745098, "grad_norm": 0.9379049791651398, "learning_rate": 1.72967916579403e-05, "loss": 0.4784, "mean_token_accuracy": 0.8384145144373178, "num_tokens": 92005351.0, "step": 214 }, { "entropy": 0.438934326171875, "epoch": 0.8431372549019608, "grad_norm": 0.8537995734738039, "learning_rate": 1.7267134058664776e-05, "loss": 0.4652, "mean_token_accuracy": 0.8381716851145029, "num_tokens": 92443937.0, "step": 215 }, { "entropy": 0.434356689453125, "epoch": 0.8470588235294118, "grad_norm": 0.7898971606485905, "learning_rate": 1.72373403810507e-05, "loss": 0.4908, "mean_token_accuracy": 0.8318126108497381, "num_tokens": 92892784.0, "step": 216 }, { "entropy": 0.439666748046875, "epoch": 0.8509803921568627, "grad_norm": 0.7535497265357768, "learning_rate": 1.7207411182989834e-05, "loss": 0.4663, "mean_token_accuracy": 0.8424786748364568, "num_tokens": 93325929.0, "step": 217 }, { "entropy": 0.440216064453125, "epoch": 0.8549019607843137, "grad_norm": 0.8363493018404706, "learning_rate": 1.7177347024911562e-05, "loss": 0.4757, "mean_token_accuracy": 0.8364362018182874, "num_tokens": 93745474.0, "step": 218 }, { "entropy": 0.434539794921875, "epoch": 0.8588235294117647, "grad_norm": 0.7460393470424544, "learning_rate": 1.7147148469772433e-05, "loss": 0.4741, "mean_token_accuracy": 0.8371732924133539, "num_tokens": 94189419.0, "step": 219 }, { "entropy": 0.433502197265625, "epoch": 0.8627450980392157, "grad_norm": 0.7927531784250542, "learning_rate": 1.7116816083045603e-05, "loss": 0.4687, "mean_token_accuracy": 0.841255315579474, "num_tokens": 94625785.0, "step": 220 }, { "entropy": 0.439544677734375, "epoch": 0.8666666666666667, "grad_norm": 0.847951723708805, "learning_rate": 1.7086350432710243e-05, "loss": 0.4729, "mean_token_accuracy": 0.8400851683691144, "num_tokens": 95045825.0, "step": 221 }, { "entropy": 0.439727783203125, "epoch": 0.8705882352941177, "grad_norm": 0.8653958965315521, "learning_rate": 1.7055752089240907e-05, "loss": 0.4872, "mean_token_accuracy": 0.8322295276448131, "num_tokens": 95485894.0, "step": 222 }, { "entropy": 0.436309814453125, "epoch": 0.8745098039215686, "grad_norm": 0.8139892412838203, "learning_rate": 1.7025021625596852e-05, "loss": 0.4877, "mean_token_accuracy": 0.8352080434560776, "num_tokens": 95903775.0, "step": 223 }, { "entropy": 0.43829345703125, "epoch": 0.8784313725490196, "grad_norm": 0.9137046083892381, "learning_rate": 1.6994159617211318e-05, "loss": 0.4757, "mean_token_accuracy": 0.8375628003850579, "num_tokens": 96324996.0, "step": 224 }, { "entropy": 0.44122314453125, "epoch": 0.8823529411764706, "grad_norm": 0.8019525849533575, "learning_rate": 1.6963166641980732e-05, "loss": 0.4815, "mean_token_accuracy": 0.8376647494733334, "num_tokens": 96751142.0, "step": 225 }, { "entropy": 0.438232421875, "epoch": 0.8862745098039215, "grad_norm": 0.7661466594667296, "learning_rate": 1.6932043280253892e-05, "loss": 0.4741, "mean_token_accuracy": 0.8382897274568677, "num_tokens": 97183056.0, "step": 226 }, { "entropy": 0.4429931640625, "epoch": 0.8901960784313725, "grad_norm": 0.9246954784051347, "learning_rate": 1.6900790114821122e-05, "loss": 0.4733, "mean_token_accuracy": 0.8375686015933752, "num_tokens": 97611237.0, "step": 227 }, { "entropy": 0.44525146484375, "epoch": 0.8941176470588236, "grad_norm": 0.9222883646984189, "learning_rate": 1.686940773090333e-05, "loss": 0.466, "mean_token_accuracy": 0.8396632606163621, "num_tokens": 98013920.0, "step": 228 }, { "entropy": 0.436859130859375, "epoch": 0.8980392156862745, "grad_norm": 0.7494759382545816, "learning_rate": 1.683789671614107e-05, "loss": 0.4784, "mean_token_accuracy": 0.8365710796788335, "num_tokens": 98445616.0, "step": 229 }, { "entropy": 0.44287109375, "epoch": 0.9019607843137255, "grad_norm": 0.8481872394661378, "learning_rate": 1.6806257660583534e-05, "loss": 0.4752, "mean_token_accuracy": 0.8378980001434684, "num_tokens": 98879880.0, "step": 230 }, { "entropy": 0.443023681640625, "epoch": 0.9058823529411765, "grad_norm": 0.7342093494551309, "learning_rate": 1.6774491156677482e-05, "loss": 0.4678, "mean_token_accuracy": 0.8390908362343907, "num_tokens": 99304994.0, "step": 231 }, { "entropy": 0.439727783203125, "epoch": 0.9098039215686274, "grad_norm": 0.7975903074062608, "learning_rate": 1.6742597799256182e-05, "loss": 0.4694, "mean_token_accuracy": 0.8397601125761867, "num_tokens": 99740204.0, "step": 232 }, { "entropy": 0.440093994140625, "epoch": 0.9137254901960784, "grad_norm": 0.8294915259809778, "learning_rate": 1.6710578185528254e-05, "loss": 0.4875, "mean_token_accuracy": 0.8345666192471981, "num_tokens": 100200826.0, "step": 233 }, { "entropy": 0.4527587890625, "epoch": 0.9176470588235294, "grad_norm": 0.8101787502262441, "learning_rate": 1.6678432915066488e-05, "loss": 0.4735, "mean_token_accuracy": 0.8401564313098788, "num_tokens": 100610122.0, "step": 234 }, { "entropy": 0.451934814453125, "epoch": 0.9215686274509803, "grad_norm": 0.7660857033302512, "learning_rate": 1.6646162589796616e-05, "loss": 0.4823, "mean_token_accuracy": 0.835520800203085, "num_tokens": 101053523.0, "step": 235 }, { "entropy": 0.440185546875, "epoch": 0.9254901960784314, "grad_norm": 0.8122888604746737, "learning_rate": 1.6613767813986045e-05, "loss": 0.4848, "mean_token_accuracy": 0.8362872619181871, "num_tokens": 101485940.0, "step": 236 }, { "entropy": 0.43414306640625, "epoch": 0.9294117647058824, "grad_norm": 0.7831540627887795, "learning_rate": 1.6581249194232533e-05, "loss": 0.4634, "mean_token_accuracy": 0.8410006538033485, "num_tokens": 101920333.0, "step": 237 }, { "entropy": 0.4376220703125, "epoch": 0.9333333333333333, "grad_norm": 0.8091781971460889, "learning_rate": 1.6548607339452853e-05, "loss": 0.4746, "mean_token_accuracy": 0.8403336834162474, "num_tokens": 102362931.0, "step": 238 }, { "entropy": 0.439788818359375, "epoch": 0.9372549019607843, "grad_norm": 0.7677541604132057, "learning_rate": 1.6515842860871355e-05, "loss": 0.4711, "mean_token_accuracy": 0.8392134476453066, "num_tokens": 102791458.0, "step": 239 }, { "entropy": 0.4378662109375, "epoch": 0.9411764705882353, "grad_norm": 0.7630280457517558, "learning_rate": 1.648295637200856e-05, "loss": 0.4694, "mean_token_accuracy": 0.8395261112600565, "num_tokens": 103217546.0, "step": 240 }, { "entropy": 0.44439697265625, "epoch": 0.9450980392156862, "grad_norm": 0.7151353860934048, "learning_rate": 1.644994848866964e-05, "loss": 0.4704, "mean_token_accuracy": 0.8399761924520135, "num_tokens": 103654329.0, "step": 241 }, { "entropy": 0.445831298828125, "epoch": 0.9490196078431372, "grad_norm": 0.7886114163810609, "learning_rate": 1.64168198289329e-05, "loss": 0.4628, "mean_token_accuracy": 0.8427135553210974, "num_tokens": 104065772.0, "step": 242 }, { "entropy": 0.437042236328125, "epoch": 0.9529411764705882, "grad_norm": 0.7506289883748392, "learning_rate": 1.6383571013138214e-05, "loss": 0.464, "mean_token_accuracy": 0.8399297744035721, "num_tokens": 104485121.0, "step": 243 }, { "entropy": 0.43280029296875, "epoch": 0.9568627450980393, "grad_norm": 0.738503159574171, "learning_rate": 1.6350202663875385e-05, "loss": 0.466, "mean_token_accuracy": 0.840507653541863, "num_tokens": 104911315.0, "step": 244 }, { "entropy": 0.4429931640625, "epoch": 0.9607843137254902, "grad_norm": 0.7698793504804927, "learning_rate": 1.631671540597251e-05, "loss": 0.4691, "mean_token_accuracy": 0.8367423294112086, "num_tokens": 105322868.0, "step": 245 }, { "entropy": 0.44146728515625, "epoch": 0.9647058823529412, "grad_norm": 0.7708943980471168, "learning_rate": 1.628310986648427e-05, "loss": 0.4815, "mean_token_accuracy": 0.8389245234429836, "num_tokens": 105754753.0, "step": 246 }, { "entropy": 0.45343017578125, "epoch": 0.9686274509803922, "grad_norm": 0.7584700182022692, "learning_rate": 1.6249386674680186e-05, "loss": 0.4738, "mean_token_accuracy": 0.8398635992780328, "num_tokens": 106148914.0, "step": 247 }, { "entropy": 0.438385009765625, "epoch": 0.9725490196078431, "grad_norm": 0.8570809330325667, "learning_rate": 1.621554646203284e-05, "loss": 0.4727, "mean_token_accuracy": 0.8388482462614775, "num_tokens": 106592260.0, "step": 248 }, { "entropy": 0.429840087890625, "epoch": 0.9764705882352941, "grad_norm": 0.7930657950269568, "learning_rate": 1.6181589862206053e-05, "loss": 0.477, "mean_token_accuracy": 0.8356762723997235, "num_tokens": 107045387.0, "step": 249 }, { "entropy": 0.447479248046875, "epoch": 0.9803921568627451, "grad_norm": 0.7855461075816964, "learning_rate": 1.614751751104301e-05, "loss": 0.4626, "mean_token_accuracy": 0.8429703069850802, "num_tokens": 107450555.0, "step": 250 }, { "entropy": 0.44061279296875, "epoch": 0.984313725490196, "grad_norm": 0.8480693241060412, "learning_rate": 1.6113330046554363e-05, "loss": 0.4725, "mean_token_accuracy": 0.8400067826732993, "num_tokens": 107869202.0, "step": 251 }, { "entropy": 0.437591552734375, "epoch": 0.9882352941176471, "grad_norm": 0.7799681659592422, "learning_rate": 1.607902810890628e-05, "loss": 0.4756, "mean_token_accuracy": 0.838361494243145, "num_tokens": 108294767.0, "step": 252 }, { "entropy": 0.440704345703125, "epoch": 0.9921568627450981, "grad_norm": 0.7683369799036914, "learning_rate": 1.6044612340408466e-05, "loss": 0.4727, "mean_token_accuracy": 0.8381105521693826, "num_tokens": 108700118.0, "step": 253 }, { "entropy": 0.43280029296875, "epoch": 0.996078431372549, "grad_norm": 0.8122897127695787, "learning_rate": 1.601008338550211e-05, "loss": 0.4692, "mean_token_accuracy": 0.8396132970228791, "num_tokens": 109111197.0, "step": 254 }, { "entropy": 0.431365966796875, "epoch": 1.0, "grad_norm": 0.6954505238166176, "learning_rate": 1.5975441890747855e-05, "loss": 0.4774, "mean_token_accuracy": 0.8396214628592134, "num_tokens": 109546636.0, "step": 255 }, { "entropy": 0.442596435546875, "epoch": 1.003921568627451, "grad_norm": 0.6963377343270832, "learning_rate": 1.5940688504813664e-05, "loss": 0.4379, "mean_token_accuracy": 0.8488357551395893, "num_tokens": 109971679.0, "step": 256 }, { "entropy": 0.442474365234375, "epoch": 1.007843137254902, "grad_norm": 0.8143419375331571, "learning_rate": 1.590582387846268e-05, "loss": 0.4392, "mean_token_accuracy": 0.8477622792124748, "num_tokens": 110393426.0, "step": 257 }, { "entropy": 0.429779052734375, "epoch": 1.011764705882353, "grad_norm": 0.7190829117341659, "learning_rate": 1.5870848664541046e-05, "loss": 0.4288, "mean_token_accuracy": 0.84952078666538, "num_tokens": 110824443.0, "step": 258 }, { "entropy": 0.41986083984375, "epoch": 1.0156862745098039, "grad_norm": 0.7240913276129756, "learning_rate": 1.5835763517965676e-05, "loss": 0.4313, "mean_token_accuracy": 0.8483589226379991, "num_tokens": 111260666.0, "step": 259 }, { "entropy": 0.425994873046875, "epoch": 1.0196078431372548, "grad_norm": 0.7709654036315373, "learning_rate": 1.5800569095711983e-05, "loss": 0.4379, "mean_token_accuracy": 0.8487503128126264, "num_tokens": 111695842.0, "step": 260 }, { "entropy": 0.417755126953125, "epoch": 1.0235294117647058, "grad_norm": 0.6752329019748353, "learning_rate": 1.5765266056801603e-05, "loss": 0.4154, "mean_token_accuracy": 0.8561578085646033, "num_tokens": 112126448.0, "step": 261 }, { "entropy": 0.4171142578125, "epoch": 1.0274509803921568, "grad_norm": 0.7858284344679825, "learning_rate": 1.5729855062290024e-05, "loss": 0.4284, "mean_token_accuracy": 0.8497958136722445, "num_tokens": 112561721.0, "step": 262 }, { "entropy": 0.429656982421875, "epoch": 1.0313725490196077, "grad_norm": 0.7086470475884443, "learning_rate": 1.569433677525422e-05, "loss": 0.4252, "mean_token_accuracy": 0.8517554877325892, "num_tokens": 112979380.0, "step": 263 }, { "entropy": 0.430023193359375, "epoch": 1.035294117647059, "grad_norm": 0.7246552088677546, "learning_rate": 1.565871186078025e-05, "loss": 0.4307, "mean_token_accuracy": 0.849640991538763, "num_tokens": 113406744.0, "step": 264 }, { "entropy": 0.4281005859375, "epoch": 1.0392156862745099, "grad_norm": 0.7560945351744308, "learning_rate": 1.562298098595078e-05, "loss": 0.4333, "mean_token_accuracy": 0.8482805853709579, "num_tokens": 113841273.0, "step": 265 }, { "entropy": 0.427642822265625, "epoch": 1.0431372549019609, "grad_norm": 0.7398785092951623, "learning_rate": 1.55871448198326e-05, "loss": 0.4262, "mean_token_accuracy": 0.8510823482647538, "num_tokens": 114273278.0, "step": 266 }, { "entropy": 0.43505859375, "epoch": 1.0470588235294118, "grad_norm": 0.7577350777991329, "learning_rate": 1.5551204033464102e-05, "loss": 0.4236, "mean_token_accuracy": 0.8518886985257268, "num_tokens": 114690533.0, "step": 267 }, { "entropy": 0.431854248046875, "epoch": 1.0509803921568628, "grad_norm": 0.7383675662288005, "learning_rate": 1.551515929984271e-05, "loss": 0.4289, "mean_token_accuracy": 0.850223665125668, "num_tokens": 115119453.0, "step": 268 }, { "entropy": 0.42645263671875, "epoch": 1.0549019607843138, "grad_norm": 0.7881284075354964, "learning_rate": 1.5479011293912273e-05, "loss": 0.4228, "mean_token_accuracy": 0.8536723731085658, "num_tokens": 115560457.0, "step": 269 }, { "entropy": 0.4232177734375, "epoch": 1.0588235294117647, "grad_norm": 0.7954689841424749, "learning_rate": 1.5442760692550443e-05, "loss": 0.4343, "mean_token_accuracy": 0.8489060252904892, "num_tokens": 115989927.0, "step": 270 }, { "entropy": 0.418365478515625, "epoch": 1.0627450980392157, "grad_norm": 0.7491371820505106, "learning_rate": 1.5406408174555978e-05, "loss": 0.4385, "mean_token_accuracy": 0.8481973754242063, "num_tokens": 116426232.0, "step": 271 }, { "entropy": 0.42626953125, "epoch": 1.0666666666666667, "grad_norm": 0.783768787078027, "learning_rate": 1.5369954420636048e-05, "loss": 0.4397, "mean_token_accuracy": 0.8476524073630571, "num_tokens": 116838678.0, "step": 272 }, { "entropy": 0.428741455078125, "epoch": 1.0705882352941176, "grad_norm": 0.7549673317013068, "learning_rate": 1.533340011339348e-05, "loss": 0.4245, "mean_token_accuracy": 0.8504898408427835, "num_tokens": 117262142.0, "step": 273 }, { "entropy": 0.42266845703125, "epoch": 1.0745098039215686, "grad_norm": 0.7760483061593083, "learning_rate": 1.529674593731399e-05, "loss": 0.4238, "mean_token_accuracy": 0.8512313133105636, "num_tokens": 117688261.0, "step": 274 }, { "entropy": 0.429931640625, "epoch": 1.0784313725490196, "grad_norm": 0.7346729703955766, "learning_rate": 1.5259992578753335e-05, "loss": 0.4303, "mean_token_accuracy": 0.8502482092007995, "num_tokens": 118118588.0, "step": 275 }, { "entropy": 0.43316650390625, "epoch": 1.0823529411764705, "grad_norm": 0.7610949914654637, "learning_rate": 1.5223140725924494e-05, "loss": 0.428, "mean_token_accuracy": 0.8525876356288791, "num_tokens": 118533618.0, "step": 276 }, { "entropy": 0.416259765625, "epoch": 1.0862745098039215, "grad_norm": 0.7586040410611942, "learning_rate": 1.5186191068884774e-05, "loss": 0.4172, "mean_token_accuracy": 0.8536369241774082, "num_tokens": 118986771.0, "step": 277 }, { "entropy": 0.419158935546875, "epoch": 1.0901960784313725, "grad_norm": 0.8122333245465506, "learning_rate": 1.5149144299522874e-05, "loss": 0.4353, "mean_token_accuracy": 0.8489217078313231, "num_tokens": 119418848.0, "step": 278 }, { "entropy": 0.4161376953125, "epoch": 1.0941176470588236, "grad_norm": 0.7971027405941981, "learning_rate": 1.5112001111545933e-05, "loss": 0.4328, "mean_token_accuracy": 0.8489033579826355, "num_tokens": 119840841.0, "step": 279 }, { "entropy": 0.4166259765625, "epoch": 1.0980392156862746, "grad_norm": 0.7894844623974637, "learning_rate": 1.5074762200466557e-05, "loss": 0.4314, "mean_token_accuracy": 0.8496137168258429, "num_tokens": 120279680.0, "step": 280 }, { "entropy": 0.420928955078125, "epoch": 1.1019607843137256, "grad_norm": 0.7956959838533788, "learning_rate": 1.5037428263589778e-05, "loss": 0.4318, "mean_token_accuracy": 0.8514725975692272, "num_tokens": 120700317.0, "step": 281 }, { "entropy": 0.423797607421875, "epoch": 1.1058823529411765, "grad_norm": 0.825458569699761, "learning_rate": 1.5000000000000002e-05, "loss": 0.4277, "mean_token_accuracy": 0.8525010421872139, "num_tokens": 121126491.0, "step": 282 }, { "entropy": 0.43621826171875, "epoch": 1.1098039215686275, "grad_norm": 0.7166411546479006, "learning_rate": 1.4962478110547918e-05, "loss": 0.429, "mean_token_accuracy": 0.8502249773591757, "num_tokens": 121555610.0, "step": 283 }, { "entropy": 0.434906005859375, "epoch": 1.1137254901960785, "grad_norm": 0.7706765433868406, "learning_rate": 1.4924863297837378e-05, "loss": 0.4267, "mean_token_accuracy": 0.8501248694956303, "num_tokens": 121983963.0, "step": 284 }, { "entropy": 0.438873291015625, "epoch": 1.1176470588235294, "grad_norm": 0.7683977578682482, "learning_rate": 1.4887156266212237e-05, "loss": 0.4341, "mean_token_accuracy": 0.8469060966745019, "num_tokens": 122390376.0, "step": 285 }, { "entropy": 0.430755615234375, "epoch": 1.1215686274509804, "grad_norm": 0.7288640506851769, "learning_rate": 1.4849357721743169e-05, "loss": 0.4312, "mean_token_accuracy": 0.8482475150376558, "num_tokens": 122832305.0, "step": 286 }, { "entropy": 0.423187255859375, "epoch": 1.1254901960784314, "grad_norm": 0.7112151204180048, "learning_rate": 1.4811468372214432e-05, "loss": 0.4407, "mean_token_accuracy": 0.84721062425524, "num_tokens": 123272477.0, "step": 287 }, { "entropy": 0.425872802734375, "epoch": 1.1294117647058823, "grad_norm": 0.7543951967031106, "learning_rate": 1.4773488927110633e-05, "loss": 0.427, "mean_token_accuracy": 0.8514616079628468, "num_tokens": 123716296.0, "step": 288 }, { "entropy": 0.4237060546875, "epoch": 1.1333333333333333, "grad_norm": 0.7220973122261882, "learning_rate": 1.473542009760343e-05, "loss": 0.4256, "mean_token_accuracy": 0.8516914714127779, "num_tokens": 124149993.0, "step": 289 }, { "entropy": 0.4173583984375, "epoch": 1.1372549019607843, "grad_norm": 0.7187248533832408, "learning_rate": 1.4697262596538227e-05, "loss": 0.424, "mean_token_accuracy": 0.8525945777073503, "num_tokens": 124575738.0, "step": 290 }, { "entropy": 0.427032470703125, "epoch": 1.1411764705882352, "grad_norm": 0.6579326862882529, "learning_rate": 1.4659017138420804e-05, "loss": 0.4294, "mean_token_accuracy": 0.852800234220922, "num_tokens": 125012512.0, "step": 291 }, { "entropy": 0.434814453125, "epoch": 1.1450980392156862, "grad_norm": 0.724724576082981, "learning_rate": 1.4620684439403962e-05, "loss": 0.4401, "mean_token_accuracy": 0.8472226839512587, "num_tokens": 125437893.0, "step": 292 }, { "entropy": 0.428192138671875, "epoch": 1.1490196078431372, "grad_norm": 0.7183600555478822, "learning_rate": 1.4582265217274105e-05, "loss": 0.4229, "mean_token_accuracy": 0.8529646163806319, "num_tokens": 125877544.0, "step": 293 }, { "entropy": 0.429595947265625, "epoch": 1.1529411764705881, "grad_norm": 0.6896120937907955, "learning_rate": 1.454376019143779e-05, "loss": 0.4441, "mean_token_accuracy": 0.8463943209499121, "num_tokens": 126315970.0, "step": 294 }, { "entropy": 0.43206787109375, "epoch": 1.156862745098039, "grad_norm": 0.7105359525136665, "learning_rate": 1.4505170082908269e-05, "loss": 0.4363, "mean_token_accuracy": 0.8492165962234139, "num_tokens": 126749309.0, "step": 295 }, { "entropy": 0.424591064453125, "epoch": 1.1607843137254903, "grad_norm": 0.6660202625171524, "learning_rate": 1.4466495614291977e-05, "loss": 0.4246, "mean_token_accuracy": 0.8515631575137377, "num_tokens": 127191013.0, "step": 296 }, { "entropy": 0.42242431640625, "epoch": 1.1647058823529413, "grad_norm": 0.7632586900497342, "learning_rate": 1.4427737509775008e-05, "loss": 0.4233, "mean_token_accuracy": 0.8523411825299263, "num_tokens": 127628779.0, "step": 297 }, { "entropy": 0.420318603515625, "epoch": 1.1686274509803922, "grad_norm": 0.7099430978314399, "learning_rate": 1.438889649510956e-05, "loss": 0.4249, "mean_token_accuracy": 0.8517980761826038, "num_tokens": 128055765.0, "step": 298 }, { "entropy": 0.415679931640625, "epoch": 1.1725490196078432, "grad_norm": 0.7169933749455566, "learning_rate": 1.4349973297600321e-05, "loss": 0.4274, "mean_token_accuracy": 0.8488310389220715, "num_tokens": 128485889.0, "step": 299 }, { "entropy": 0.416351318359375, "epoch": 1.1764705882352942, "grad_norm": 0.722515066306239, "learning_rate": 1.4310968646090884e-05, "loss": 0.4279, "mean_token_accuracy": 0.8511504996567965, "num_tokens": 128915737.0, "step": 300 }, { "entropy": 0.421173095703125, "epoch": 1.1803921568627451, "grad_norm": 0.6798210825809636, "learning_rate": 1.4271883270950073e-05, "loss": 0.4275, "mean_token_accuracy": 0.8499457621946931, "num_tokens": 129337557.0, "step": 301 }, { "entropy": 0.418731689453125, "epoch": 1.184313725490196, "grad_norm": 0.706619653888489, "learning_rate": 1.423271790405828e-05, "loss": 0.4117, "mean_token_accuracy": 0.8567122034728527, "num_tokens": 129771578.0, "step": 302 }, { "entropy": 0.418914794921875, "epoch": 1.188235294117647, "grad_norm": 0.7361824434334385, "learning_rate": 1.419347327879375e-05, "loss": 0.4291, "mean_token_accuracy": 0.850643745623529, "num_tokens": 130196995.0, "step": 303 }, { "entropy": 0.412841796875, "epoch": 1.192156862745098, "grad_norm": 0.6661584399171278, "learning_rate": 1.4154150130018867e-05, "loss": 0.4345, "mean_token_accuracy": 0.8520185491070151, "num_tokens": 130628897.0, "step": 304 }, { "entropy": 0.417449951171875, "epoch": 1.196078431372549, "grad_norm": 0.7846231723266166, "learning_rate": 1.4114749194066364e-05, "loss": 0.428, "mean_token_accuracy": 0.8515145275741816, "num_tokens": 131045663.0, "step": 305 }, { "entropy": 0.414306640625, "epoch": 1.2, "grad_norm": 0.6574696580235325, "learning_rate": 1.4075271208725572e-05, "loss": 0.411, "mean_token_accuracy": 0.8561296090483665, "num_tokens": 131464562.0, "step": 306 }, { "entropy": 0.410675048828125, "epoch": 1.203921568627451, "grad_norm": 0.6673289170228338, "learning_rate": 1.4035716913228568e-05, "loss": 0.431, "mean_token_accuracy": 0.8509927401319146, "num_tokens": 131908062.0, "step": 307 }, { "entropy": 0.41754150390625, "epoch": 1.2078431372549019, "grad_norm": 0.7427627721794107, "learning_rate": 1.3996087048236357e-05, "loss": 0.425, "mean_token_accuracy": 0.8522914592176676, "num_tokens": 132321024.0, "step": 308 }, { "entropy": 0.405853271484375, "epoch": 1.2117647058823529, "grad_norm": 0.7168551440712705, "learning_rate": 1.3956382355824999e-05, "loss": 0.4325, "mean_token_accuracy": 0.850104920566082, "num_tokens": 132758451.0, "step": 309 }, { "entropy": 0.41082763671875, "epoch": 1.215686274509804, "grad_norm": 0.679207878607794, "learning_rate": 1.3916603579471705e-05, "loss": 0.4181, "mean_token_accuracy": 0.8522440018132329, "num_tokens": 133200509.0, "step": 310 }, { "entropy": 0.41253662109375, "epoch": 1.219607843137255, "grad_norm": 0.8035559374142178, "learning_rate": 1.3876751464040924e-05, "loss": 0.4213, "mean_token_accuracy": 0.8530607046559453, "num_tokens": 133630952.0, "step": 311 }, { "entropy": 0.40924072265625, "epoch": 1.223529411764706, "grad_norm": 0.6881590152098038, "learning_rate": 1.3836826755770386e-05, "loss": 0.4146, "mean_token_accuracy": 0.8529272833839059, "num_tokens": 134044313.0, "step": 312 }, { "entropy": 0.4156494140625, "epoch": 1.227450980392157, "grad_norm": 0.720307099360712, "learning_rate": 1.3796830202257141e-05, "loss": 0.4253, "mean_token_accuracy": 0.8524025613442063, "num_tokens": 134476638.0, "step": 313 }, { "entropy": 0.4110107421875, "epoch": 1.231372549019608, "grad_norm": 0.7218239013473345, "learning_rate": 1.3756762552443555e-05, "loss": 0.4307, "mean_token_accuracy": 0.8505604760721326, "num_tokens": 134910566.0, "step": 314 }, { "entropy": 0.414794921875, "epoch": 1.2352941176470589, "grad_norm": 0.6637567115140405, "learning_rate": 1.3716624556603275e-05, "loss": 0.4198, "mean_token_accuracy": 0.8546493574976921, "num_tokens": 135349522.0, "step": 315 }, { "entropy": 0.41571044921875, "epoch": 1.2392156862745098, "grad_norm": 0.68587837287742, "learning_rate": 1.3676416966327201e-05, "loss": 0.4142, "mean_token_accuracy": 0.8545364672318101, "num_tokens": 135759530.0, "step": 316 }, { "entropy": 0.423095703125, "epoch": 1.2431372549019608, "grad_norm": 0.7041633347426386, "learning_rate": 1.3636140534509392e-05, "loss": 0.4301, "mean_token_accuracy": 0.8494144305586815, "num_tokens": 136191290.0, "step": 317 }, { "entropy": 0.41656494140625, "epoch": 1.2470588235294118, "grad_norm": 0.6645492409494638, "learning_rate": 1.3595796015332986e-05, "loss": 0.4127, "mean_token_accuracy": 0.8538919584825635, "num_tokens": 136621679.0, "step": 318 }, { "entropy": 0.41436767578125, "epoch": 1.2509803921568627, "grad_norm": 0.6809848141129548, "learning_rate": 1.3555384164256048e-05, "loss": 0.4113, "mean_token_accuracy": 0.8555595567449927, "num_tokens": 137072932.0, "step": 319 }, { "entropy": 0.420654296875, "epoch": 1.2549019607843137, "grad_norm": 0.6860893890330958, "learning_rate": 1.3514905737997474e-05, "loss": 0.4257, "mean_token_accuracy": 0.8516252571716905, "num_tokens": 137498595.0, "step": 320 }, { "entropy": 0.41839599609375, "epoch": 1.2588235294117647, "grad_norm": 0.7209641987545784, "learning_rate": 1.3474361494522769e-05, "loss": 0.4135, "mean_token_accuracy": 0.8547264030203223, "num_tokens": 137929823.0, "step": 321 }, { "entropy": 0.41461181640625, "epoch": 1.2627450980392156, "grad_norm": 0.6642408443800759, "learning_rate": 1.3433752193029888e-05, "loss": 0.4227, "mean_token_accuracy": 0.8537326790392399, "num_tokens": 138366548.0, "step": 322 }, { "entropy": 0.414642333984375, "epoch": 1.2666666666666666, "grad_norm": 0.7129550559327684, "learning_rate": 1.3393078593934998e-05, "loss": 0.4252, "mean_token_accuracy": 0.8525771573185921, "num_tokens": 138805069.0, "step": 323 }, { "entropy": 0.422607421875, "epoch": 1.2705882352941176, "grad_norm": 0.7470458148795028, "learning_rate": 1.3352341458858264e-05, "loss": 0.4308, "mean_token_accuracy": 0.8501967024058104, "num_tokens": 139222262.0, "step": 324 }, { "entropy": 0.41412353515625, "epoch": 1.2745098039215685, "grad_norm": 0.7073090732730448, "learning_rate": 1.3311541550609566e-05, "loss": 0.4326, "mean_token_accuracy": 0.8521415013819933, "num_tokens": 139655569.0, "step": 325 }, { "entropy": 0.42816162109375, "epoch": 1.2784313725490195, "grad_norm": 0.6632060012055889, "learning_rate": 1.3270679633174219e-05, "loss": 0.4196, "mean_token_accuracy": 0.8533567879348993, "num_tokens": 140063044.0, "step": 326 }, { "entropy": 0.42047119140625, "epoch": 1.2823529411764705, "grad_norm": 0.7145450450096429, "learning_rate": 1.3229756471698674e-05, "loss": 0.4264, "mean_token_accuracy": 0.8511630315333605, "num_tokens": 140491506.0, "step": 327 }, { "entropy": 0.427001953125, "epoch": 1.2862745098039214, "grad_norm": 0.7417491681577237, "learning_rate": 1.318877283247619e-05, "loss": 0.4199, "mean_token_accuracy": 0.8536494439467788, "num_tokens": 140915505.0, "step": 328 }, { "entropy": 0.42401123046875, "epoch": 1.2901960784313726, "grad_norm": 0.7026578815553888, "learning_rate": 1.3147729482932473e-05, "loss": 0.4182, "mean_token_accuracy": 0.8537089116871357, "num_tokens": 141348827.0, "step": 329 }, { "entropy": 0.420562744140625, "epoch": 1.2941176470588236, "grad_norm": 0.7920529959240524, "learning_rate": 1.3106627191611333e-05, "loss": 0.4175, "mean_token_accuracy": 0.8526282785460353, "num_tokens": 141763227.0, "step": 330 }, { "entropy": 0.404693603515625, "epoch": 1.2980392156862746, "grad_norm": 0.7187972107180246, "learning_rate": 1.3065466728160253e-05, "loss": 0.4332, "mean_token_accuracy": 0.8482584049925208, "num_tokens": 142216480.0, "step": 331 }, { "entropy": 0.4119873046875, "epoch": 1.3019607843137255, "grad_norm": 0.7174309524641856, "learning_rate": 1.3024248863316012e-05, "loss": 0.423, "mean_token_accuracy": 0.8517096359282732, "num_tokens": 142657049.0, "step": 332 }, { "entropy": 0.409637451171875, "epoch": 1.3058823529411765, "grad_norm": 0.7028509712861, "learning_rate": 1.2982974368890243e-05, "loss": 0.4051, "mean_token_accuracy": 0.8556926595047116, "num_tokens": 143105947.0, "step": 333 }, { "entropy": 0.413299560546875, "epoch": 1.3098039215686275, "grad_norm": 0.7056529897906295, "learning_rate": 1.2941644017754964e-05, "loss": 0.4207, "mean_token_accuracy": 0.851080933585763, "num_tokens": 143536222.0, "step": 334 }, { "entropy": 0.4180908203125, "epoch": 1.3137254901960784, "grad_norm": 0.7476049940926148, "learning_rate": 1.2900258583828138e-05, "loss": 0.4287, "mean_token_accuracy": 0.8512825155630708, "num_tokens": 143975464.0, "step": 335 }, { "entropy": 0.419647216796875, "epoch": 1.3176470588235294, "grad_norm": 0.7336378531937496, "learning_rate": 1.2858818842059145e-05, "loss": 0.4152, "mean_token_accuracy": 0.8559805741533637, "num_tokens": 144399794.0, "step": 336 }, { "entropy": 0.416168212890625, "epoch": 1.3215686274509804, "grad_norm": 0.7405211465935665, "learning_rate": 1.2817325568414299e-05, "loss": 0.4246, "mean_token_accuracy": 0.8506488613784313, "num_tokens": 144828576.0, "step": 337 }, { "entropy": 0.42138671875, "epoch": 1.3254901960784313, "grad_norm": 0.7122042109511074, "learning_rate": 1.2775779539862305e-05, "loss": 0.4152, "mean_token_accuracy": 0.8549979459494352, "num_tokens": 145258310.0, "step": 338 }, { "entropy": 0.41534423828125, "epoch": 1.3294117647058823, "grad_norm": 0.7320013767426466, "learning_rate": 1.273418153435971e-05, "loss": 0.4269, "mean_token_accuracy": 0.8519812626764178, "num_tokens": 145683977.0, "step": 339 }, { "entropy": 0.417236328125, "epoch": 1.3333333333333333, "grad_norm": 0.6927031984828272, "learning_rate": 1.2692532330836346e-05, "loss": 0.428, "mean_token_accuracy": 0.8527764491736889, "num_tokens": 146122446.0, "step": 340 }, { "entropy": 0.416900634765625, "epoch": 1.3372549019607844, "grad_norm": 0.7155795792658018, "learning_rate": 1.2650832709180727e-05, "loss": 0.4058, "mean_token_accuracy": 0.856788550503552, "num_tokens": 146522149.0, "step": 341 }, { "entropy": 0.410552978515625, "epoch": 1.3411764705882354, "grad_norm": 0.6848398168854253, "learning_rate": 1.2609083450225468e-05, "loss": 0.4248, "mean_token_accuracy": 0.8541076770052314, "num_tokens": 146983682.0, "step": 342 }, { "entropy": 0.417816162109375, "epoch": 1.3450980392156864, "grad_norm": 0.6766196619423954, "learning_rate": 1.2567285335732633e-05, "loss": 0.4117, "mean_token_accuracy": 0.8540923977270722, "num_tokens": 147407175.0, "step": 343 }, { "entropy": 0.42071533203125, "epoch": 1.3490196078431373, "grad_norm": 0.7039047785450393, "learning_rate": 1.2525439148379127e-05, "loss": 0.4046, "mean_token_accuracy": 0.8577220821753144, "num_tokens": 147803720.0, "step": 344 }, { "entropy": 0.417327880859375, "epoch": 1.3529411764705883, "grad_norm": 0.699655445151693, "learning_rate": 1.248354567174203e-05, "loss": 0.4151, "mean_token_accuracy": 0.8557805633172393, "num_tokens": 148249099.0, "step": 345 }, { "entropy": 0.416839599609375, "epoch": 1.3568627450980393, "grad_norm": 0.6808551312339953, "learning_rate": 1.2441605690283915e-05, "loss": 0.4178, "mean_token_accuracy": 0.8543298495933414, "num_tokens": 148679725.0, "step": 346 }, { "entropy": 0.4178466796875, "epoch": 1.3607843137254902, "grad_norm": 0.6714585744701463, "learning_rate": 1.2399619989338165e-05, "loss": 0.4176, "mean_token_accuracy": 0.853199539706111, "num_tokens": 149105797.0, "step": 347 }, { "entropy": 0.419891357421875, "epoch": 1.3647058823529412, "grad_norm": 0.6847257289307138, "learning_rate": 1.2357589355094275e-05, "loss": 0.4176, "mean_token_accuracy": 0.8529877169057727, "num_tokens": 149521643.0, "step": 348 }, { "entropy": 0.42413330078125, "epoch": 1.3686274509803922, "grad_norm": 0.7120004884140041, "learning_rate": 1.2315514574583113e-05, "loss": 0.4181, "mean_token_accuracy": 0.8546496015042067, "num_tokens": 149936921.0, "step": 349 }, { "entropy": 0.413299560546875, "epoch": 1.3725490196078431, "grad_norm": 0.6593385779430017, "learning_rate": 1.2273396435662212e-05, "loss": 0.4077, "mean_token_accuracy": 0.8568513067439198, "num_tokens": 150358247.0, "step": 350 }, { "entropy": 0.416351318359375, "epoch": 1.3764705882352941, "grad_norm": 0.6748945354600052, "learning_rate": 1.2231235727000977e-05, "loss": 0.412, "mean_token_accuracy": 0.8564546350389719, "num_tokens": 150781639.0, "step": 351 }, { "entropy": 0.412689208984375, "epoch": 1.380392156862745, "grad_norm": 0.6627273883982908, "learning_rate": 1.218903323806595e-05, "loss": 0.4101, "mean_token_accuracy": 0.8563144765794277, "num_tokens": 151210244.0, "step": 352 }, { "entropy": 0.41314697265625, "epoch": 1.384313725490196, "grad_norm": 0.6804939839140359, "learning_rate": 1.2146789759106016e-05, "loss": 0.4297, "mean_token_accuracy": 0.8509758925065398, "num_tokens": 151652694.0, "step": 353 }, { "entropy": 0.409027099609375, "epoch": 1.388235294117647, "grad_norm": 0.7551814729523321, "learning_rate": 1.2104506081137608e-05, "loss": 0.4171, "mean_token_accuracy": 0.8555029472336173, "num_tokens": 152069524.0, "step": 354 }, { "entropy": 0.411651611328125, "epoch": 1.392156862745098, "grad_norm": 0.6638398446727486, "learning_rate": 1.2062182995929883e-05, "loss": 0.4133, "mean_token_accuracy": 0.8564111962914467, "num_tokens": 152496092.0, "step": 355 }, { "entropy": 0.4066162109375, "epoch": 1.396078431372549, "grad_norm": 0.6956702354585098, "learning_rate": 1.2019821295989913e-05, "loss": 0.4141, "mean_token_accuracy": 0.8558093551546335, "num_tokens": 152936586.0, "step": 356 }, { "entropy": 0.412200927734375, "epoch": 1.4, "grad_norm": 0.7356334108107045, "learning_rate": 1.1977421774547832e-05, "loss": 0.4142, "mean_token_accuracy": 0.8541790386661887, "num_tokens": 153371852.0, "step": 357 }, { "entropy": 0.4122314453125, "epoch": 1.4039215686274509, "grad_norm": 0.7014777687549074, "learning_rate": 1.1934985225541998e-05, "loss": 0.4125, "mean_token_accuracy": 0.8557530920952559, "num_tokens": 153789570.0, "step": 358 }, { "entropy": 0.40606689453125, "epoch": 1.4078431372549018, "grad_norm": 0.6865162084151079, "learning_rate": 1.1892512443604103e-05, "loss": 0.4116, "mean_token_accuracy": 0.854659709148109, "num_tokens": 154208764.0, "step": 359 }, { "entropy": 0.408111572265625, "epoch": 1.4117647058823528, "grad_norm": 0.712196123175708, "learning_rate": 1.1850004224044315e-05, "loss": 0.3975, "mean_token_accuracy": 0.8598070461302996, "num_tokens": 154630249.0, "step": 360 }, { "entropy": 0.405975341796875, "epoch": 1.415686274509804, "grad_norm": 0.6986703398021826, "learning_rate": 1.1807461362836382e-05, "loss": 0.4112, "mean_token_accuracy": 0.8554151114076376, "num_tokens": 155063669.0, "step": 361 }, { "entropy": 0.40484619140625, "epoch": 1.419607843137255, "grad_norm": 0.7011227779262905, "learning_rate": 1.1764884656602711e-05, "loss": 0.4155, "mean_token_accuracy": 0.8543020207434893, "num_tokens": 155502743.0, "step": 362 }, { "entropy": 0.41925048828125, "epoch": 1.423529411764706, "grad_norm": 0.7388781102918118, "learning_rate": 1.1722274902599469e-05, "loss": 0.4188, "mean_token_accuracy": 0.8535276213660836, "num_tokens": 155911398.0, "step": 363 }, { "entropy": 0.409423828125, "epoch": 1.427450980392157, "grad_norm": 0.6903468704485393, "learning_rate": 1.1679632898701649e-05, "loss": 0.428, "mean_token_accuracy": 0.849765595048666, "num_tokens": 156361652.0, "step": 364 }, { "entropy": 0.41168212890625, "epoch": 1.4313725490196079, "grad_norm": 0.6778766361688789, "learning_rate": 1.1636959443388131e-05, "loss": 0.4015, "mean_token_accuracy": 0.8571812696754932, "num_tokens": 156807343.0, "step": 365 }, { "entropy": 0.412109375, "epoch": 1.4352941176470588, "grad_norm": 0.642733238897495, "learning_rate": 1.1594255335726725e-05, "loss": 0.4145, "mean_token_accuracy": 0.853903261013329, "num_tokens": 157249464.0, "step": 366 }, { "entropy": 0.424652099609375, "epoch": 1.4392156862745098, "grad_norm": 0.6483544400990201, "learning_rate": 1.1551521375359207e-05, "loss": 0.4046, "mean_token_accuracy": 0.857556514441967, "num_tokens": 157640719.0, "step": 367 }, { "entropy": 0.409210205078125, "epoch": 1.4431372549019608, "grad_norm": 0.7222910236522575, "learning_rate": 1.1508758362486358e-05, "loss": 0.4266, "mean_token_accuracy": 0.8515659496188164, "num_tokens": 158086707.0, "step": 368 }, { "entropy": 0.410858154296875, "epoch": 1.4470588235294117, "grad_norm": 0.6823438746266127, "learning_rate": 1.1465967097852971e-05, "loss": 0.4092, "mean_token_accuracy": 0.8546876255422831, "num_tokens": 158534215.0, "step": 369 }, { "entropy": 0.4141845703125, "epoch": 1.4509803921568627, "grad_norm": 0.7292179149719259, "learning_rate": 1.1423148382732854e-05, "loss": 0.4237, "mean_token_accuracy": 0.8510736022144556, "num_tokens": 158964888.0, "step": 370 }, { "entropy": 0.410247802734375, "epoch": 1.4549019607843137, "grad_norm": 0.6971738873757594, "learning_rate": 1.1380303018913832e-05, "loss": 0.4172, "mean_token_accuracy": 0.8541428428143263, "num_tokens": 159399267.0, "step": 371 }, { "entropy": 0.415679931640625, "epoch": 1.4588235294117646, "grad_norm": 0.7383800792346643, "learning_rate": 1.133743180868273e-05, "loss": 0.4221, "mean_token_accuracy": 0.8528790548443794, "num_tokens": 159819677.0, "step": 372 }, { "entropy": 0.412841796875, "epoch": 1.4627450980392158, "grad_norm": 0.6805212338474808, "learning_rate": 1.1294535554810356e-05, "loss": 0.4161, "mean_token_accuracy": 0.8552266210317612, "num_tokens": 160254854.0, "step": 373 }, { "entropy": 0.412506103515625, "epoch": 1.4666666666666668, "grad_norm": 0.7480351284538674, "learning_rate": 1.125161506053646e-05, "loss": 0.4071, "mean_token_accuracy": 0.8572438461706042, "num_tokens": 160685537.0, "step": 374 }, { "entropy": 0.41534423828125, "epoch": 1.4705882352941178, "grad_norm": 0.6771163486521969, "learning_rate": 1.1208671129554703e-05, "loss": 0.4162, "mean_token_accuracy": 0.8534526033326983, "num_tokens": 161126542.0, "step": 375 }, { "entropy": 0.412841796875, "epoch": 1.4745098039215687, "grad_norm": 0.6397900219361803, "learning_rate": 1.1165704565997593e-05, "loss": 0.4009, "mean_token_accuracy": 0.8574716188013554, "num_tokens": 161551358.0, "step": 376 }, { "entropy": 0.41497802734375, "epoch": 1.4784313725490197, "grad_norm": 0.7413777447374873, "learning_rate": 1.1122716174421446e-05, "loss": 0.4097, "mean_token_accuracy": 0.8574340445920825, "num_tokens": 161967605.0, "step": 377 }, { "entropy": 0.41448974609375, "epoch": 1.4823529411764707, "grad_norm": 0.6851273431643456, "learning_rate": 1.1079706759791311e-05, "loss": 0.4105, "mean_token_accuracy": 0.8551990939304233, "num_tokens": 162376155.0, "step": 378 }, { "entropy": 0.40765380859375, "epoch": 1.4862745098039216, "grad_norm": 0.7262571621062189, "learning_rate": 1.103667712746589e-05, "loss": 0.4063, "mean_token_accuracy": 0.8556642541661859, "num_tokens": 162795464.0, "step": 379 }, { "entropy": 0.418060302734375, "epoch": 1.4901960784313726, "grad_norm": 0.709837787003394, "learning_rate": 1.0993628083182468e-05, "loss": 0.4101, "mean_token_accuracy": 0.8567473096773028, "num_tokens": 163211049.0, "step": 380 }, { "entropy": 0.412078857421875, "epoch": 1.4941176470588236, "grad_norm": 0.7160640024703722, "learning_rate": 1.0950560433041825e-05, "loss": 0.4204, "mean_token_accuracy": 0.853213481605053, "num_tokens": 163651169.0, "step": 381 }, { "entropy": 0.4249267578125, "epoch": 1.4980392156862745, "grad_norm": 0.6635137849439725, "learning_rate": 1.0907474983493144e-05, "loss": 0.408, "mean_token_accuracy": 0.8586263991892338, "num_tokens": 164052131.0, "step": 382 }, { "entropy": 0.41943359375, "epoch": 1.5019607843137255, "grad_norm": 0.7005678883006693, "learning_rate": 1.0864372541318891e-05, "loss": 0.3892, "mean_token_accuracy": 0.8622283479198813, "num_tokens": 164472628.0, "step": 383 }, { "entropy": 0.419921875, "epoch": 1.5058823529411764, "grad_norm": 0.7354010110687161, "learning_rate": 1.0821253913619727e-05, "loss": 0.4204, "mean_token_accuracy": 0.8538774671033025, "num_tokens": 164917451.0, "step": 384 }, { "entropy": 0.411956787109375, "epoch": 1.5098039215686274, "grad_norm": 0.7080530760627707, "learning_rate": 1.0778119907799399e-05, "loss": 0.4022, "mean_token_accuracy": 0.8572713797912002, "num_tokens": 165348112.0, "step": 385 }, { "entropy": 0.411041259765625, "epoch": 1.5137254901960784, "grad_norm": 0.6752314903003961, "learning_rate": 1.0734971331549604e-05, "loss": 0.4014, "mean_token_accuracy": 0.8585884692147374, "num_tokens": 165757610.0, "step": 386 }, { "entropy": 0.409271240234375, "epoch": 1.5176470588235293, "grad_norm": 0.859464788101391, "learning_rate": 1.0691808992834866e-05, "loss": 0.4138, "mean_token_accuracy": 0.8532675765454769, "num_tokens": 166191021.0, "step": 387 }, { "entropy": 0.40966796875, "epoch": 1.5215686274509803, "grad_norm": 0.6723968211994824, "learning_rate": 1.064863369987743e-05, "loss": 0.411, "mean_token_accuracy": 0.8572338540107012, "num_tokens": 166647810.0, "step": 388 }, { "entropy": 0.4100341796875, "epoch": 1.5254901960784313, "grad_norm": 0.6654450287296578, "learning_rate": 1.06054462611421e-05, "loss": 0.4053, "mean_token_accuracy": 0.8566803587600589, "num_tokens": 167091919.0, "step": 389 }, { "entropy": 0.411102294921875, "epoch": 1.5294117647058822, "grad_norm": 0.6743637396313977, "learning_rate": 1.0562247485321116e-05, "loss": 0.4102, "mean_token_accuracy": 0.8562160143628716, "num_tokens": 167524975.0, "step": 390 }, { "entropy": 0.412933349609375, "epoch": 1.5333333333333332, "grad_norm": 0.6979119955346013, "learning_rate": 1.0519038181319e-05, "loss": 0.3989, "mean_token_accuracy": 0.8592202458530664, "num_tokens": 167957081.0, "step": 391 }, { "entropy": 0.414581298828125, "epoch": 1.5372549019607842, "grad_norm": 0.7214480435006635, "learning_rate": 1.0475819158237426e-05, "loss": 0.4164, "mean_token_accuracy": 0.854176253080368, "num_tokens": 168382901.0, "step": 392 }, { "entropy": 0.410247802734375, "epoch": 1.5411764705882351, "grad_norm": 0.7240884794501748, "learning_rate": 1.0432591225360052e-05, "loss": 0.4269, "mean_token_accuracy": 0.851943246088922, "num_tokens": 168830532.0, "step": 393 }, { "entropy": 0.41070556640625, "epoch": 1.5450980392156861, "grad_norm": 0.6615798629050048, "learning_rate": 1.0389355192137379e-05, "loss": 0.396, "mean_token_accuracy": 0.8609147928655148, "num_tokens": 169260615.0, "step": 394 }, { "entropy": 0.403564453125, "epoch": 1.5490196078431373, "grad_norm": 0.6950370598617063, "learning_rate": 1.0346111868171584e-05, "loss": 0.3964, "mean_token_accuracy": 0.8605449888855219, "num_tokens": 169688457.0, "step": 395 }, { "entropy": 0.406524658203125, "epoch": 1.5529411764705883, "grad_norm": 0.6784651046053325, "learning_rate": 1.0302862063201367e-05, "loss": 0.4099, "mean_token_accuracy": 0.8544306671246886, "num_tokens": 170139299.0, "step": 396 }, { "entropy": 0.4044189453125, "epoch": 1.5568627450980392, "grad_norm": 0.7847918254095473, "learning_rate": 1.0259606587086783e-05, "loss": 0.4179, "mean_token_accuracy": 0.8526219138875604, "num_tokens": 170587531.0, "step": 397 }, { "entropy": 0.40472412109375, "epoch": 1.5607843137254902, "grad_norm": 0.6894614523455117, "learning_rate": 1.0216346249794087e-05, "loss": 0.4079, "mean_token_accuracy": 0.8570181773975492, "num_tokens": 171025712.0, "step": 398 }, { "entropy": 0.40240478515625, "epoch": 1.5647058823529412, "grad_norm": 0.6841419524220748, "learning_rate": 1.0173081861380551e-05, "loss": 0.391, "mean_token_accuracy": 0.8597865039482713, "num_tokens": 171479929.0, "step": 399 }, { "entropy": 0.412628173828125, "epoch": 1.5686274509803921, "grad_norm": 0.6912607545067652, "learning_rate": 1.012981423197931e-05, "loss": 0.3995, "mean_token_accuracy": 0.8574095563963056, "num_tokens": 171913547.0, "step": 400 }, { "entropy": 0.41455078125, "epoch": 1.572549019607843, "grad_norm": 0.6752207564966213, "learning_rate": 1.0086544171784187e-05, "loss": 0.4011, "mean_token_accuracy": 0.8586401976644993, "num_tokens": 172350197.0, "step": 401 }, { "entropy": 0.40863037109375, "epoch": 1.576470588235294, "grad_norm": 0.71746520315725, "learning_rate": 1.0043272491034523e-05, "loss": 0.3977, "mean_token_accuracy": 0.8575174137949944, "num_tokens": 172777991.0, "step": 402 }, { "entropy": 0.41156005859375, "epoch": 1.5803921568627453, "grad_norm": 0.675986123292717, "learning_rate": 1e-05, "loss": 0.3949, "mean_token_accuracy": 0.8585043726488948, "num_tokens": 173200136.0, "step": 403 }, { "entropy": 0.407012939453125, "epoch": 1.5843137254901962, "grad_norm": 0.6925221354784149, "learning_rate": 9.956727508965482e-06, "loss": 0.414, "mean_token_accuracy": 0.8540292549878359, "num_tokens": 173629631.0, "step": 404 }, { "entropy": 0.410736083984375, "epoch": 1.5882352941176472, "grad_norm": 0.6847506074125855, "learning_rate": 9.913455828215815e-06, "loss": 0.3959, "mean_token_accuracy": 0.8615149781107903, "num_tokens": 174054145.0, "step": 405 }, { "entropy": 0.410858154296875, "epoch": 1.5921568627450982, "grad_norm": 0.722859402637327, "learning_rate": 9.870185768020694e-06, "loss": 0.3992, "mean_token_accuracy": 0.8596215089783072, "num_tokens": 174473463.0, "step": 406 }, { "entropy": 0.406646728515625, "epoch": 1.5960784313725491, "grad_norm": 0.6946870646270319, "learning_rate": 9.826918138619454e-06, "loss": 0.4038, "mean_token_accuracy": 0.8573076035827398, "num_tokens": 174892440.0, "step": 407 }, { "entropy": 0.41070556640625, "epoch": 1.6, "grad_norm": 0.6890905076143856, "learning_rate": 9.783653750205916e-06, "loss": 0.3854, "mean_token_accuracy": 0.8630838803946972, "num_tokens": 175299479.0, "step": 408 }, { "entropy": 0.40509033203125, "epoch": 1.603921568627451, "grad_norm": 0.7026567580948715, "learning_rate": 9.740393412913219e-06, "loss": 0.3993, "mean_token_accuracy": 0.8585938615724444, "num_tokens": 175727422.0, "step": 409 }, { "entropy": 0.407501220703125, "epoch": 1.607843137254902, "grad_norm": 0.6499788718397761, "learning_rate": 9.697137936798635e-06, "loss": 0.4118, "mean_token_accuracy": 0.8579577170312405, "num_tokens": 176171135.0, "step": 410 }, { "entropy": 0.41387939453125, "epoch": 1.611764705882353, "grad_norm": 0.7288347926112773, "learning_rate": 9.65388813182842e-06, "loss": 0.4048, "mean_token_accuracy": 0.857942121103406, "num_tokens": 176605815.0, "step": 411 }, { "entropy": 0.406585693359375, "epoch": 1.615686274509804, "grad_norm": 0.6873130367941563, "learning_rate": 9.610644807862625e-06, "loss": 0.385, "mean_token_accuracy": 0.8628216292709112, "num_tokens": 177046849.0, "step": 412 }, { "entropy": 0.409515380859375, "epoch": 1.619607843137255, "grad_norm": 0.8159477025546342, "learning_rate": 9.567408774639951e-06, "loss": 0.4114, "mean_token_accuracy": 0.8583543403074145, "num_tokens": 177479814.0, "step": 413 }, { "entropy": 0.4022216796875, "epoch": 1.6235294117647059, "grad_norm": 0.7046825130274702, "learning_rate": 9.524180841762577e-06, "loss": 0.4079, "mean_token_accuracy": 0.8577337488532066, "num_tokens": 177942489.0, "step": 414 }, { "entropy": 0.406951904296875, "epoch": 1.6274509803921569, "grad_norm": 0.67812309257672, "learning_rate": 9.480961818681004e-06, "loss": 0.3981, "mean_token_accuracy": 0.8608548073098063, "num_tokens": 178372403.0, "step": 415 }, { "entropy": 0.408172607421875, "epoch": 1.6313725490196078, "grad_norm": 0.6501390152436636, "learning_rate": 9.437752514678888e-06, "loss": 0.3822, "mean_token_accuracy": 0.864359175786376, "num_tokens": 178802249.0, "step": 416 }, { "entropy": 0.411834716796875, "epoch": 1.6352941176470588, "grad_norm": 0.6919125667004101, "learning_rate": 9.394553738857902e-06, "loss": 0.3804, "mean_token_accuracy": 0.8647814923897386, "num_tokens": 179211558.0, "step": 417 }, { "entropy": 0.418670654296875, "epoch": 1.6392156862745098, "grad_norm": 0.745063777167442, "learning_rate": 9.351366300122569e-06, "loss": 0.4066, "mean_token_accuracy": 0.8586796801537275, "num_tokens": 179626922.0, "step": 418 }, { "entropy": 0.40985107421875, "epoch": 1.6431372549019607, "grad_norm": 0.7460150646397898, "learning_rate": 9.308191007165135e-06, "loss": 0.4107, "mean_token_accuracy": 0.8577667633071542, "num_tokens": 180060705.0, "step": 419 }, { "entropy": 0.4140625, "epoch": 1.6470588235294117, "grad_norm": 0.6550263847678918, "learning_rate": 9.265028668450403e-06, "loss": 0.3917, "mean_token_accuracy": 0.8609657865017653, "num_tokens": 180478026.0, "step": 420 }, { "entropy": 0.413665771484375, "epoch": 1.6509803921568627, "grad_norm": 0.7020773548739169, "learning_rate": 9.221880092200601e-06, "loss": 0.4072, "mean_token_accuracy": 0.8573073288425803, "num_tokens": 180923505.0, "step": 421 }, { "entropy": 0.40521240234375, "epoch": 1.6549019607843136, "grad_norm": 0.6966400451319761, "learning_rate": 9.178746086380274e-06, "loss": 0.4021, "mean_token_accuracy": 0.8583230208605528, "num_tokens": 181351214.0, "step": 422 }, { "entropy": 0.41058349609375, "epoch": 1.6588235294117646, "grad_norm": 0.6698243839063328, "learning_rate": 9.135627458681116e-06, "loss": 0.3967, "mean_token_accuracy": 0.8596462178975344, "num_tokens": 181785481.0, "step": 423 }, { "entropy": 0.40679931640625, "epoch": 1.6627450980392156, "grad_norm": 0.6475108302803604, "learning_rate": 9.092525016506858e-06, "loss": 0.3886, "mean_token_accuracy": 0.8637061798945069, "num_tokens": 182219067.0, "step": 424 }, { "entropy": 0.41473388671875, "epoch": 1.6666666666666665, "grad_norm": 0.6675953966752504, "learning_rate": 9.049439566958176e-06, "loss": 0.4046, "mean_token_accuracy": 0.8553493404760957, "num_tokens": 182636470.0, "step": 425 }, { "entropy": 0.40753173828125, "epoch": 1.6705882352941175, "grad_norm": 0.6619441579391007, "learning_rate": 9.006371916817533e-06, "loss": 0.3942, "mean_token_accuracy": 0.860662579536438, "num_tokens": 183060041.0, "step": 426 }, { "entropy": 0.408294677734375, "epoch": 1.6745098039215687, "grad_norm": 0.6445359135710907, "learning_rate": 8.963322872534115e-06, "loss": 0.4007, "mean_token_accuracy": 0.8577142441645265, "num_tokens": 183475870.0, "step": 427 }, { "entropy": 0.409027099609375, "epoch": 1.6784313725490196, "grad_norm": 0.6979215249121394, "learning_rate": 8.920293240208694e-06, "loss": 0.4005, "mean_token_accuracy": 0.8606764739379287, "num_tokens": 183910388.0, "step": 428 }, { "entropy": 0.415252685546875, "epoch": 1.6823529411764706, "grad_norm": 0.6899546001964146, "learning_rate": 8.877283825578554e-06, "loss": 0.3991, "mean_token_accuracy": 0.8583256499841809, "num_tokens": 184319219.0, "step": 429 }, { "entropy": 0.40948486328125, "epoch": 1.6862745098039216, "grad_norm": 0.6302580019373215, "learning_rate": 8.83429543400241e-06, "loss": 0.4009, "mean_token_accuracy": 0.8594886185601354, "num_tokens": 184751471.0, "step": 430 }, { "entropy": 0.412750244140625, "epoch": 1.6901960784313725, "grad_norm": 0.6911336842606688, "learning_rate": 8.791328870445302e-06, "loss": 0.399, "mean_token_accuracy": 0.8586211362853646, "num_tokens": 185166878.0, "step": 431 }, { "entropy": 0.407196044921875, "epoch": 1.6941176470588235, "grad_norm": 0.6623519559272918, "learning_rate": 8.748384939463543e-06, "loss": 0.401, "mean_token_accuracy": 0.8593526994809508, "num_tokens": 185606889.0, "step": 432 }, { "entropy": 0.401275634765625, "epoch": 1.6980392156862745, "grad_norm": 0.6115072966136919, "learning_rate": 8.705464445189646e-06, "loss": 0.3945, "mean_token_accuracy": 0.8617425132542849, "num_tokens": 186057141.0, "step": 433 }, { "entropy": 0.406341552734375, "epoch": 1.7019607843137254, "grad_norm": 0.6563129181567589, "learning_rate": 8.662568191317273e-06, "loss": 0.4008, "mean_token_accuracy": 0.8593833548948169, "num_tokens": 186496413.0, "step": 434 }, { "entropy": 0.4078369140625, "epoch": 1.7058823529411766, "grad_norm": 0.6384184762621999, "learning_rate": 8.619696981086173e-06, "loss": 0.3878, "mean_token_accuracy": 0.8609198983758688, "num_tokens": 186908586.0, "step": 435 }, { "entropy": 0.407470703125, "epoch": 1.7098039215686276, "grad_norm": 0.6456952863053911, "learning_rate": 8.576851617267151e-06, "loss": 0.4054, "mean_token_accuracy": 0.8583989115431905, "num_tokens": 187344594.0, "step": 436 }, { "entropy": 0.415130615234375, "epoch": 1.7137254901960786, "grad_norm": 0.6051085180740002, "learning_rate": 8.53403290214703e-06, "loss": 0.3887, "mean_token_accuracy": 0.8639670200645924, "num_tokens": 187761423.0, "step": 437 }, { "entropy": 0.407196044921875, "epoch": 1.7176470588235295, "grad_norm": 0.6231489555374776, "learning_rate": 8.491241637513644e-06, "loss": 0.3909, "mean_token_accuracy": 0.8602004619315267, "num_tokens": 188191066.0, "step": 438 }, { "entropy": 0.407196044921875, "epoch": 1.7215686274509805, "grad_norm": 0.6793096084424919, "learning_rate": 8.448478624640798e-06, "loss": 0.4076, "mean_token_accuracy": 0.8562532840296626, "num_tokens": 188644360.0, "step": 439 }, { "entropy": 0.40386962890625, "epoch": 1.7254901960784315, "grad_norm": 0.6460668298473338, "learning_rate": 8.405744664273278e-06, "loss": 0.3988, "mean_token_accuracy": 0.8608309207484126, "num_tokens": 189102349.0, "step": 440 }, { "entropy": 0.41302490234375, "epoch": 1.7294117647058824, "grad_norm": 0.648333243521565, "learning_rate": 8.363040556611872e-06, "loss": 0.3979, "mean_token_accuracy": 0.8595832930877805, "num_tokens": 189532865.0, "step": 441 }, { "entropy": 0.407989501953125, "epoch": 1.7333333333333334, "grad_norm": 0.6671747464688874, "learning_rate": 8.320367101298351e-06, "loss": 0.3986, "mean_token_accuracy": 0.8617238756269217, "num_tokens": 189973506.0, "step": 442 }, { "entropy": 0.404937744140625, "epoch": 1.7372549019607844, "grad_norm": 0.6590397435344738, "learning_rate": 8.277725097400536e-06, "loss": 0.397, "mean_token_accuracy": 0.8606142830103636, "num_tokens": 190421714.0, "step": 443 }, { "entropy": 0.406005859375, "epoch": 1.7411764705882353, "grad_norm": 0.6530611449981759, "learning_rate": 8.235115343397295e-06, "loss": 0.4075, "mean_token_accuracy": 0.85643027164042, "num_tokens": 190862770.0, "step": 444 }, { "entropy": 0.4041748046875, "epoch": 1.7450980392156863, "grad_norm": 0.7027101958624039, "learning_rate": 8.19253863716362e-06, "loss": 0.3914, "mean_token_accuracy": 0.8620561547577381, "num_tokens": 191304126.0, "step": 445 }, { "entropy": 0.41015625, "epoch": 1.7490196078431373, "grad_norm": 0.6786856809017515, "learning_rate": 8.149995775955686e-06, "loss": 0.3917, "mean_token_accuracy": 0.8625259781256318, "num_tokens": 191719523.0, "step": 446 }, { "entropy": 0.40704345703125, "epoch": 1.7529411764705882, "grad_norm": 0.6615376726535924, "learning_rate": 8.107487556395902e-06, "loss": 0.3913, "mean_token_accuracy": 0.8633209681138396, "num_tokens": 192142207.0, "step": 447 }, { "entropy": 0.409759521484375, "epoch": 1.7568627450980392, "grad_norm": 0.6878125438153613, "learning_rate": 8.065014774458004e-06, "loss": 0.4007, "mean_token_accuracy": 0.8581640059128404, "num_tokens": 192579162.0, "step": 448 }, { "entropy": 0.416351318359375, "epoch": 1.7607843137254902, "grad_norm": 0.6538402222716845, "learning_rate": 8.02257822545217e-06, "loss": 0.3905, "mean_token_accuracy": 0.8591163596138358, "num_tokens": 192987107.0, "step": 449 }, { "entropy": 0.41094970703125, "epoch": 1.7647058823529411, "grad_norm": 0.6971665818355606, "learning_rate": 7.980178704010089e-06, "loss": 0.4006, "mean_token_accuracy": 0.8593992488458753, "num_tokens": 193421020.0, "step": 450 }, { "entropy": 0.409881591796875, "epoch": 1.768627450980392, "grad_norm": 0.6409197670335629, "learning_rate": 7.93781700407012e-06, "loss": 0.3897, "mean_token_accuracy": 0.8616615459322929, "num_tokens": 193846467.0, "step": 451 }, { "entropy": 0.41131591796875, "epoch": 1.772549019607843, "grad_norm": 0.6931163174376977, "learning_rate": 7.895493918862395e-06, "loss": 0.4036, "mean_token_accuracy": 0.8571218065917492, "num_tokens": 194287774.0, "step": 452 }, { "entropy": 0.403717041015625, "epoch": 1.776470588235294, "grad_norm": 0.66666924829856, "learning_rate": 7.853210240893985e-06, "loss": 0.3905, "mean_token_accuracy": 0.8637211918830872, "num_tokens": 194720263.0, "step": 453 }, { "entropy": 0.41131591796875, "epoch": 1.780392156862745, "grad_norm": 0.6204235116772743, "learning_rate": 7.810966761934053e-06, "loss": 0.3921, "mean_token_accuracy": 0.8618059307336807, "num_tokens": 195146105.0, "step": 454 }, { "entropy": 0.4031982421875, "epoch": 1.784313725490196, "grad_norm": 0.6342484649096467, "learning_rate": 7.76876427299903e-06, "loss": 0.4121, "mean_token_accuracy": 0.8566878782585263, "num_tokens": 195584542.0, "step": 455 }, { "entropy": 0.406951904296875, "epoch": 1.788235294117647, "grad_norm": 0.6328100517917241, "learning_rate": 7.726603564337791e-06, "loss": 0.386, "mean_token_accuracy": 0.8630632497370243, "num_tokens": 195999765.0, "step": 456 }, { "entropy": 0.406463623046875, "epoch": 1.792156862745098, "grad_norm": 0.6652068952143365, "learning_rate": 7.684485425416888e-06, "loss": 0.3955, "mean_token_accuracy": 0.8625632170587778, "num_tokens": 196418621.0, "step": 457 }, { "entropy": 0.406951904296875, "epoch": 1.7960784313725489, "grad_norm": 0.6326591145779984, "learning_rate": 7.642410644905726e-06, "loss": 0.3961, "mean_token_accuracy": 0.860276403836906, "num_tokens": 196847355.0, "step": 458 }, { "entropy": 0.41046142578125, "epoch": 1.8, "grad_norm": 0.668308364754529, "learning_rate": 7.600380010661836e-06, "loss": 0.3885, "mean_token_accuracy": 0.8643168518319726, "num_tokens": 197280641.0, "step": 459 }, { "entropy": 0.40557861328125, "epoch": 1.803921568627451, "grad_norm": 0.6138313731892276, "learning_rate": 7.558394309716088e-06, "loss": 0.3821, "mean_token_accuracy": 0.86452910117805, "num_tokens": 197727391.0, "step": 460 }, { "entropy": 0.39898681640625, "epoch": 1.807843137254902, "grad_norm": 0.6364964632727911, "learning_rate": 7.516454328257969e-06, "loss": 0.3952, "mean_token_accuracy": 0.8590056737884879, "num_tokens": 198148365.0, "step": 461 }, { "entropy": 0.399169921875, "epoch": 1.811764705882353, "grad_norm": 0.681759781228122, "learning_rate": 7.474560851620873e-06, "loss": 0.4151, "mean_token_accuracy": 0.8547424823045731, "num_tokens": 198592222.0, "step": 462 }, { "entropy": 0.391326904296875, "epoch": 1.815686274509804, "grad_norm": 0.6596255218589232, "learning_rate": 7.432714664267373e-06, "loss": 0.3859, "mean_token_accuracy": 0.8629283830523491, "num_tokens": 199059847.0, "step": 463 }, { "entropy": 0.40460205078125, "epoch": 1.8196078431372549, "grad_norm": 0.6799616657323965, "learning_rate": 7.390916549774536e-06, "loss": 0.399, "mean_token_accuracy": 0.8610562225803733, "num_tokens": 199490320.0, "step": 464 }, { "entropy": 0.4024658203125, "epoch": 1.8235294117647058, "grad_norm": 0.6357942300751293, "learning_rate": 7.349167290819274e-06, "loss": 0.3884, "mean_token_accuracy": 0.8627661904320121, "num_tokens": 199926836.0, "step": 465 }, { "entropy": 0.40557861328125, "epoch": 1.8274509803921568, "grad_norm": 0.6212601820985758, "learning_rate": 7.307467669163655e-06, "loss": 0.3806, "mean_token_accuracy": 0.8631542297080159, "num_tokens": 200351704.0, "step": 466 }, { "entropy": 0.40411376953125, "epoch": 1.831372549019608, "grad_norm": 0.6644528066997022, "learning_rate": 7.265818465640292e-06, "loss": 0.3898, "mean_token_accuracy": 0.8605430433526635, "num_tokens": 200779543.0, "step": 467 }, { "entropy": 0.40142822265625, "epoch": 1.835294117647059, "grad_norm": 0.6192510699914716, "learning_rate": 7.224220460137701e-06, "loss": 0.383, "mean_token_accuracy": 0.8634469164535403, "num_tokens": 201219423.0, "step": 468 }, { "entropy": 0.411956787109375, "epoch": 1.83921568627451, "grad_norm": 0.6758910902237301, "learning_rate": 7.182674431585703e-06, "loss": 0.3851, "mean_token_accuracy": 0.8633188679814339, "num_tokens": 201629124.0, "step": 469 }, { "entropy": 0.3983154296875, "epoch": 1.843137254901961, "grad_norm": 0.6240002104477036, "learning_rate": 7.141181157940859e-06, "loss": 0.3849, "mean_token_accuracy": 0.8628418175503612, "num_tokens": 202050007.0, "step": 470 }, { "entropy": 0.39898681640625, "epoch": 1.8470588235294119, "grad_norm": 0.6329914255858695, "learning_rate": 7.099741416171866e-06, "loss": 0.3853, "mean_token_accuracy": 0.8619992816820741, "num_tokens": 202471383.0, "step": 471 }, { "entropy": 0.398651123046875, "epoch": 1.8509803921568628, "grad_norm": 0.6397271420327242, "learning_rate": 7.058355982245038e-06, "loss": 0.394, "mean_token_accuracy": 0.8587851086631417, "num_tokens": 202916687.0, "step": 472 }, { "entropy": 0.39678955078125, "epoch": 1.8549019607843138, "grad_norm": 0.6245793958337541, "learning_rate": 7.017025631109762e-06, "loss": 0.3806, "mean_token_accuracy": 0.8651279462501407, "num_tokens": 203367706.0, "step": 473 }, { "entropy": 0.402496337890625, "epoch": 1.8588235294117648, "grad_norm": 0.6480244793330756, "learning_rate": 6.97575113668399e-06, "loss": 0.3798, "mean_token_accuracy": 0.8655670257285237, "num_tokens": 203794061.0, "step": 474 }, { "entropy": 0.396575927734375, "epoch": 1.8627450980392157, "grad_norm": 0.668132844077502, "learning_rate": 6.934533271839751e-06, "loss": 0.3853, "mean_token_accuracy": 0.8610855452716351, "num_tokens": 204218730.0, "step": 475 }, { "entropy": 0.39825439453125, "epoch": 1.8666666666666667, "grad_norm": 0.6162152767479371, "learning_rate": 6.893372808388674e-06, "loss": 0.3959, "mean_token_accuracy": 0.8632792960852385, "num_tokens": 204665453.0, "step": 476 }, { "entropy": 0.404296875, "epoch": 1.8705882352941177, "grad_norm": 0.638832679338457, "learning_rate": 6.852270517067527e-06, "loss": 0.3942, "mean_token_accuracy": 0.8628870220854878, "num_tokens": 205090169.0, "step": 477 }, { "entropy": 0.396942138671875, "epoch": 1.8745098039215686, "grad_norm": 0.6191281527349773, "learning_rate": 6.8112271675238154e-06, "loss": 0.3947, "mean_token_accuracy": 0.8603170970454812, "num_tokens": 205566713.0, "step": 478 }, { "entropy": 0.398956298828125, "epoch": 1.8784313725490196, "grad_norm": 0.6341770253300271, "learning_rate": 6.7702435283013315e-06, "loss": 0.3799, "mean_token_accuracy": 0.8677375428378582, "num_tokens": 205992968.0, "step": 479 }, { "entropy": 0.400604248046875, "epoch": 1.8823529411764706, "grad_norm": 0.6146178497897123, "learning_rate": 6.729320366825785e-06, "loss": 0.3793, "mean_token_accuracy": 0.8669349849224091, "num_tokens": 206415484.0, "step": 480 }, { "entropy": 0.398651123046875, "epoch": 1.8862745098039215, "grad_norm": 0.5904407168244195, "learning_rate": 6.688458449390438e-06, "loss": 0.3772, "mean_token_accuracy": 0.8662282424047589, "num_tokens": 206838020.0, "step": 481 }, { "entropy": 0.403839111328125, "epoch": 1.8901960784313725, "grad_norm": 0.6138200970304336, "learning_rate": 6.647658541141735e-06, "loss": 0.3865, "mean_token_accuracy": 0.863483252003789, "num_tokens": 207262730.0, "step": 482 }, { "entropy": 0.39947509765625, "epoch": 1.8941176470588235, "grad_norm": 0.6411707572669829, "learning_rate": 6.606921406065003e-06, "loss": 0.393, "mean_token_accuracy": 0.8628878751769662, "num_tokens": 207681374.0, "step": 483 }, { "entropy": 0.399810791015625, "epoch": 1.8980392156862744, "grad_norm": 0.6242158740652315, "learning_rate": 6.566247806970119e-06, "loss": 0.3815, "mean_token_accuracy": 0.8648646343499422, "num_tokens": 208111486.0, "step": 484 }, { "entropy": 0.4017333984375, "epoch": 1.9019607843137254, "grad_norm": 0.6214585655042043, "learning_rate": 6.525638505477232e-06, "loss": 0.3961, "mean_token_accuracy": 0.8618203224614263, "num_tokens": 208524640.0, "step": 485 }, { "entropy": 0.406768798828125, "epoch": 1.9058823529411764, "grad_norm": 0.675756751207708, "learning_rate": 6.485094262002529e-06, "loss": 0.3834, "mean_token_accuracy": 0.8650505719706416, "num_tokens": 208943322.0, "step": 486 }, { "entropy": 0.40185546875, "epoch": 1.9098039215686273, "grad_norm": 0.6339598234809348, "learning_rate": 6.444615835743955e-06, "loss": 0.378, "mean_token_accuracy": 0.8643421633169055, "num_tokens": 209372138.0, "step": 487 }, { "entropy": 0.399261474609375, "epoch": 1.9137254901960783, "grad_norm": 0.6669096574204565, "learning_rate": 6.404203984667019e-06, "loss": 0.3937, "mean_token_accuracy": 0.8616707855835557, "num_tokens": 209800419.0, "step": 488 }, { "entropy": 0.400360107421875, "epoch": 1.9176470588235293, "grad_norm": 0.6298220613709057, "learning_rate": 6.363859465490609e-06, "loss": 0.3677, "mean_token_accuracy": 0.8683587471023202, "num_tokens": 210215161.0, "step": 489 }, { "entropy": 0.404510498046875, "epoch": 1.9215686274509802, "grad_norm": 0.6584599935418873, "learning_rate": 6.323583033672799e-06, "loss": 0.393, "mean_token_accuracy": 0.8635195046663284, "num_tokens": 210635827.0, "step": 490 }, { "entropy": 0.405609130859375, "epoch": 1.9254901960784314, "grad_norm": 0.647939059725834, "learning_rate": 6.283375443396726e-06, "loss": 0.3833, "mean_token_accuracy": 0.8655750313773751, "num_tokens": 211041992.0, "step": 491 }, { "entropy": 0.397003173828125, "epoch": 1.9294117647058824, "grad_norm": 0.6137155057965655, "learning_rate": 6.24323744755645e-06, "loss": 0.3811, "mean_token_accuracy": 0.8649553088471293, "num_tokens": 211477066.0, "step": 492 }, { "entropy": 0.399658203125, "epoch": 1.9333333333333333, "grad_norm": 0.651612311993538, "learning_rate": 6.203169797742862e-06, "loss": 0.3901, "mean_token_accuracy": 0.8612717455253005, "num_tokens": 211902746.0, "step": 493 }, { "entropy": 0.4049072265625, "epoch": 1.9372549019607843, "grad_norm": 0.6550364986232016, "learning_rate": 6.163173244229618e-06, "loss": 0.3784, "mean_token_accuracy": 0.8649454573169351, "num_tokens": 212330551.0, "step": 494 }, { "entropy": 0.4041748046875, "epoch": 1.9411764705882353, "grad_norm": 0.6257234356087099, "learning_rate": 6.123248535959083e-06, "loss": 0.3883, "mean_token_accuracy": 0.862628510221839, "num_tokens": 212750893.0, "step": 495 }, { "entropy": 0.396514892578125, "epoch": 1.9450980392156862, "grad_norm": 0.6207760034425814, "learning_rate": 6.083396420528298e-06, "loss": 0.3915, "mean_token_accuracy": 0.8619965445250273, "num_tokens": 213191351.0, "step": 496 }, { "entropy": 0.4000244140625, "epoch": 1.9490196078431372, "grad_norm": 0.6435317991184669, "learning_rate": 6.043617644175005e-06, "loss": 0.3843, "mean_token_accuracy": 0.8652150267735124, "num_tokens": 213609485.0, "step": 497 }, { "entropy": 0.4049072265625, "epoch": 1.9529411764705882, "grad_norm": 0.6445614380022352, "learning_rate": 6.003912951763644e-06, "loss": 0.3818, "mean_token_accuracy": 0.8633128497749567, "num_tokens": 214021808.0, "step": 498 }, { "entropy": 0.40032958984375, "epoch": 1.9568627450980394, "grad_norm": 0.6188612791246716, "learning_rate": 5.964283086771435e-06, "loss": 0.3763, "mean_token_accuracy": 0.8673922391608357, "num_tokens": 214467620.0, "step": 499 }, { "entropy": 0.404510498046875, "epoch": 1.9607843137254903, "grad_norm": 0.6210830962778174, "learning_rate": 5.924728791274432e-06, "loss": 0.3928, "mean_token_accuracy": 0.8633805690333247, "num_tokens": 214896426.0, "step": 500 }, { "entropy": 0.401397705078125, "epoch": 1.9647058823529413, "grad_norm": 0.6310710096571402, "learning_rate": 5.885250805933636e-06, "loss": 0.3728, "mean_token_accuracy": 0.8687439002096653, "num_tokens": 215328235.0, "step": 501 }, { "entropy": 0.399017333984375, "epoch": 1.9686274509803923, "grad_norm": 0.6300326675580238, "learning_rate": 5.845849869981137e-06, "loss": 0.3742, "mean_token_accuracy": 0.8672775719314814, "num_tokens": 215762622.0, "step": 502 }, { "entropy": 0.393951416015625, "epoch": 1.9725490196078432, "grad_norm": 0.6216931663392556, "learning_rate": 5.806526721206252e-06, "loss": 0.3705, "mean_token_accuracy": 0.8691313751041889, "num_tokens": 216192091.0, "step": 503 }, { "entropy": 0.401214599609375, "epoch": 1.9764705882352942, "grad_norm": 0.6530296855077998, "learning_rate": 5.767282095941725e-06, "loss": 0.4023, "mean_token_accuracy": 0.8586861994117498, "num_tokens": 216628887.0, "step": 504 }, { "entropy": 0.403411865234375, "epoch": 1.9803921568627452, "grad_norm": 0.6208031961076891, "learning_rate": 5.728116729049929e-06, "loss": 0.3675, "mean_token_accuracy": 0.8702347576618195, "num_tokens": 217039678.0, "step": 505 }, { "entropy": 0.397613525390625, "epoch": 1.9843137254901961, "grad_norm": 0.626565179565779, "learning_rate": 5.68903135390912e-06, "loss": 0.3779, "mean_token_accuracy": 0.8659409172832966, "num_tokens": 217486630.0, "step": 506 }, { "entropy": 0.402008056640625, "epoch": 1.988235294117647, "grad_norm": 0.6381555753272591, "learning_rate": 5.65002670239968e-06, "loss": 0.3852, "mean_token_accuracy": 0.8616180000826716, "num_tokens": 217923066.0, "step": 507 }, { "entropy": 0.3978271484375, "epoch": 1.992156862745098, "grad_norm": 0.6111355854752799, "learning_rate": 5.611103504890444e-06, "loss": 0.3864, "mean_token_accuracy": 0.8644295651465654, "num_tokens": 218363543.0, "step": 508 }, { "entropy": 0.401824951171875, "epoch": 1.996078431372549, "grad_norm": 0.6279421120839183, "learning_rate": 5.57226249022499e-06, "loss": 0.3733, "mean_token_accuracy": 0.8667671736329794, "num_tokens": 218782677.0, "step": 509 }, { "entropy": 0.401641845703125, "epoch": 2.0, "grad_norm": 0.6235365780243987, "learning_rate": 5.533504385708024e-06, "loss": 0.3707, "mean_token_accuracy": 0.8663612883538008, "num_tokens": 219186566.0, "step": 510 }, { "entropy": 0.396575927734375, "epoch": 2.003921568627451, "grad_norm": 0.6850387994534668, "learning_rate": 5.494829917091733e-06, "loss": 0.3429, "mean_token_accuracy": 0.8770228121429682, "num_tokens": 219628415.0, "step": 511 }, { "entropy": 0.401519775390625, "epoch": 2.007843137254902, "grad_norm": 0.6382337497649052, "learning_rate": 5.45623980856221e-06, "loss": 0.332, "mean_token_accuracy": 0.8810373740270734, "num_tokens": 220042043.0, "step": 512 }, { "entropy": 0.393768310546875, "epoch": 2.011764705882353, "grad_norm": 0.671827878794182, "learning_rate": 5.417734782725896e-06, "loss": 0.3253, "mean_token_accuracy": 0.8824055539444089, "num_tokens": 220458080.0, "step": 513 }, { "entropy": 0.384246826171875, "epoch": 2.015686274509804, "grad_norm": 0.7263272169698258, "learning_rate": 5.379315560596038e-06, "loss": 0.3357, "mean_token_accuracy": 0.879870074801147, "num_tokens": 220905634.0, "step": 514 }, { "entropy": 0.385040283203125, "epoch": 2.019607843137255, "grad_norm": 0.7920113629895514, "learning_rate": 5.340982861579199e-06, "loss": 0.3454, "mean_token_accuracy": 0.8794268425554037, "num_tokens": 221352570.0, "step": 515 }, { "entropy": 0.39019775390625, "epoch": 2.023529411764706, "grad_norm": 0.641834875539887, "learning_rate": 5.302737403461778e-06, "loss": 0.3259, "mean_token_accuracy": 0.8824308048933744, "num_tokens": 221787156.0, "step": 516 }, { "entropy": 0.401153564453125, "epoch": 2.0274509803921568, "grad_norm": 0.6849334728948456, "learning_rate": 5.26457990239657e-06, "loss": 0.3342, "mean_token_accuracy": 0.8797427834942937, "num_tokens": 222201542.0, "step": 517 }, { "entropy": 0.39581298828125, "epoch": 2.0313725490196077, "grad_norm": 0.7111898891704417, "learning_rate": 5.226511072889371e-06, "loss": 0.3313, "mean_token_accuracy": 0.8803043775260448, "num_tokens": 222623123.0, "step": 518 }, { "entropy": 0.3948974609375, "epoch": 2.0352941176470587, "grad_norm": 0.6505100364503669, "learning_rate": 5.188531627785573e-06, "loss": 0.3475, "mean_token_accuracy": 0.8759458484128118, "num_tokens": 223044097.0, "step": 519 }, { "entropy": 0.388336181640625, "epoch": 2.0392156862745097, "grad_norm": 0.6256506799827058, "learning_rate": 5.1506422782568345e-06, "loss": 0.3429, "mean_token_accuracy": 0.8781215418130159, "num_tokens": 223501353.0, "step": 520 }, { "entropy": 0.387054443359375, "epoch": 2.0431372549019606, "grad_norm": 0.6572960727423126, "learning_rate": 5.112843733787765e-06, "loss": 0.3399, "mean_token_accuracy": 0.8787580663338304, "num_tokens": 223935868.0, "step": 521 }, { "entropy": 0.389312744140625, "epoch": 2.0470588235294116, "grad_norm": 0.6372996031263783, "learning_rate": 5.075136702162622e-06, "loss": 0.3322, "mean_token_accuracy": 0.8786961110308766, "num_tokens": 224355787.0, "step": 522 }, { "entropy": 0.380645751953125, "epoch": 2.0509803921568626, "grad_norm": 0.6985340105840396, "learning_rate": 5.037521889452084e-06, "loss": 0.3299, "mean_token_accuracy": 0.8814136106520891, "num_tokens": 224806688.0, "step": 523 }, { "entropy": 0.387451171875, "epoch": 2.0549019607843135, "grad_norm": 0.6713865414340499, "learning_rate": 5.000000000000003e-06, "loss": 0.3529, "mean_token_accuracy": 0.8738525109365582, "num_tokens": 225233035.0, "step": 524 }, { "entropy": 0.39373779296875, "epoch": 2.0588235294117645, "grad_norm": 0.6027577042393886, "learning_rate": 4.962571736410224e-06, "loss": 0.3208, "mean_token_accuracy": 0.8810843704268336, "num_tokens": 225650532.0, "step": 525 }, { "entropy": 0.390838623046875, "epoch": 2.0627450980392155, "grad_norm": 0.6529329845320468, "learning_rate": 4.925237799533445e-06, "loss": 0.3371, "mean_token_accuracy": 0.8787398906424642, "num_tokens": 226080538.0, "step": 526 }, { "entropy": 0.388580322265625, "epoch": 2.066666666666667, "grad_norm": 0.6777393301419915, "learning_rate": 4.8879988884540705e-06, "loss": 0.322, "mean_token_accuracy": 0.8831636533141136, "num_tokens": 226508253.0, "step": 527 }, { "entropy": 0.390777587890625, "epoch": 2.070588235294118, "grad_norm": 0.6437515132444687, "learning_rate": 4.85085570047713e-06, "loss": 0.3352, "mean_token_accuracy": 0.8790671909227967, "num_tokens": 226944784.0, "step": 528 }, { "entropy": 0.385345458984375, "epoch": 2.074509803921569, "grad_norm": 0.6602168660987684, "learning_rate": 4.813808931115228e-06, "loss": 0.3319, "mean_token_accuracy": 0.8783455807715654, "num_tokens": 227377942.0, "step": 529 }, { "entropy": 0.382965087890625, "epoch": 2.0784313725490198, "grad_norm": 0.6089184508280152, "learning_rate": 4.776859274075506e-06, "loss": 0.3182, "mean_token_accuracy": 0.8840960031375289, "num_tokens": 227811996.0, "step": 530 }, { "entropy": 0.385040283203125, "epoch": 2.0823529411764707, "grad_norm": 0.8297306434177946, "learning_rate": 4.7400074212466705e-06, "loss": 0.3404, "mean_token_accuracy": 0.8791711116209626, "num_tokens": 228240221.0, "step": 531 }, { "entropy": 0.38531494140625, "epoch": 2.0862745098039217, "grad_norm": 0.6227790902239935, "learning_rate": 4.703254062686017e-06, "loss": 0.3299, "mean_token_accuracy": 0.8817263282835484, "num_tokens": 228694396.0, "step": 532 }, { "entropy": 0.3951416015625, "epoch": 2.0901960784313727, "grad_norm": 0.6726552929084186, "learning_rate": 4.666599886606521e-06, "loss": 0.321, "mean_token_accuracy": 0.8835817389190197, "num_tokens": 229089982.0, "step": 533 }, { "entropy": 0.3868408203125, "epoch": 2.0941176470588236, "grad_norm": 0.6460262137645605, "learning_rate": 4.6300455793639565e-06, "loss": 0.3343, "mean_token_accuracy": 0.8793362881988287, "num_tokens": 229517812.0, "step": 534 }, { "entropy": 0.386138916015625, "epoch": 2.0980392156862746, "grad_norm": 0.6713226114014762, "learning_rate": 4.593591825444028e-06, "loss": 0.3238, "mean_token_accuracy": 0.8822187837213278, "num_tokens": 229966675.0, "step": 535 }, { "entropy": 0.394989013671875, "epoch": 2.1019607843137256, "grad_norm": 0.6444115484817178, "learning_rate": 4.557239307449562e-06, "loss": 0.3176, "mean_token_accuracy": 0.883355819620192, "num_tokens": 230370130.0, "step": 536 }, { "entropy": 0.383544921875, "epoch": 2.1058823529411765, "grad_norm": 0.623684910415606, "learning_rate": 4.520988706087731e-06, "loss": 0.3188, "mean_token_accuracy": 0.8842975506559014, "num_tokens": 230798042.0, "step": 537 }, { "entropy": 0.3836669921875, "epoch": 2.1098039215686275, "grad_norm": 0.6733188037497384, "learning_rate": 4.4848407001572945e-06, "loss": 0.3383, "mean_token_accuracy": 0.8791207261383533, "num_tokens": 231227119.0, "step": 538 }, { "entropy": 0.3836669921875, "epoch": 2.1137254901960785, "grad_norm": 0.6527809232837684, "learning_rate": 4.448795966535903e-06, "loss": 0.3307, "mean_token_accuracy": 0.8810558579862118, "num_tokens": 231653005.0, "step": 539 }, { "entropy": 0.38861083984375, "epoch": 2.1176470588235294, "grad_norm": 0.6241280769218844, "learning_rate": 4.412855180167406e-06, "loss": 0.3401, "mean_token_accuracy": 0.8785578245297074, "num_tokens": 232083947.0, "step": 540 }, { "entropy": 0.38385009765625, "epoch": 2.1215686274509804, "grad_norm": 0.7053422092034839, "learning_rate": 4.377019014049223e-06, "loss": 0.3368, "mean_token_accuracy": 0.8770233364775777, "num_tokens": 232534306.0, "step": 541 }, { "entropy": 0.3924560546875, "epoch": 2.1254901960784314, "grad_norm": 0.646525042345649, "learning_rate": 4.341288139219752e-06, "loss": 0.3259, "mean_token_accuracy": 0.8811231376603246, "num_tokens": 232956999.0, "step": 542 }, { "entropy": 0.38153076171875, "epoch": 2.1294117647058823, "grad_norm": 0.6353328108356256, "learning_rate": 4.30566322474578e-06, "loss": 0.3261, "mean_token_accuracy": 0.8826108202338219, "num_tokens": 233389852.0, "step": 543 }, { "entropy": 0.385589599609375, "epoch": 2.1333333333333333, "grad_norm": 0.6564549054251345, "learning_rate": 4.270144937709981e-06, "loss": 0.3289, "mean_token_accuracy": 0.883696929551661, "num_tokens": 233811670.0, "step": 544 }, { "entropy": 0.385406494140625, "epoch": 2.1372549019607843, "grad_norm": 0.6841239113938077, "learning_rate": 4.234733943198399e-06, "loss": 0.3443, "mean_token_accuracy": 0.8776548197492957, "num_tokens": 234250912.0, "step": 545 }, { "entropy": 0.390472412109375, "epoch": 2.1411764705882352, "grad_norm": 0.6429128731108159, "learning_rate": 4.19943090428802e-06, "loss": 0.3263, "mean_token_accuracy": 0.8829544661566615, "num_tokens": 234669494.0, "step": 546 }, { "entropy": 0.395050048828125, "epoch": 2.145098039215686, "grad_norm": 0.6627579431444133, "learning_rate": 4.1642364820343276e-06, "loss": 0.3309, "mean_token_accuracy": 0.8791402000933886, "num_tokens": 235089829.0, "step": 547 }, { "entropy": 0.39361572265625, "epoch": 2.149019607843137, "grad_norm": 0.7016968609489542, "learning_rate": 4.1291513354589576e-06, "loss": 0.3213, "mean_token_accuracy": 0.8828855343163013, "num_tokens": 235511064.0, "step": 548 }, { "entropy": 0.389129638671875, "epoch": 2.152941176470588, "grad_norm": 0.6331735006523904, "learning_rate": 4.094176121537321e-06, "loss": 0.3352, "mean_token_accuracy": 0.8795699439942837, "num_tokens": 235949743.0, "step": 549 }, { "entropy": 0.38787841796875, "epoch": 2.156862745098039, "grad_norm": 0.6437682670590288, "learning_rate": 4.059311495186338e-06, "loss": 0.3148, "mean_token_accuracy": 0.8851141845807433, "num_tokens": 236387462.0, "step": 550 }, { "entropy": 0.3834228515625, "epoch": 2.16078431372549, "grad_norm": 0.6391407058805494, "learning_rate": 4.024558109252148e-06, "loss": 0.3305, "mean_token_accuracy": 0.8804508438333869, "num_tokens": 236829823.0, "step": 551 }, { "entropy": 0.38372802734375, "epoch": 2.164705882352941, "grad_norm": 0.6724194226514026, "learning_rate": 3.989916614497891e-06, "loss": 0.3391, "mean_token_accuracy": 0.8799557471647859, "num_tokens": 237264397.0, "step": 552 }, { "entropy": 0.380615234375, "epoch": 2.168627450980392, "grad_norm": 0.6683704841557025, "learning_rate": 3.955387659591538e-06, "loss": 0.337, "mean_token_accuracy": 0.8779965220019221, "num_tokens": 237706025.0, "step": 553 }, { "entropy": 0.381622314453125, "epoch": 2.172549019607843, "grad_norm": 0.5976323682888441, "learning_rate": 3.9209718910937174e-06, "loss": 0.3346, "mean_token_accuracy": 0.8799093374982476, "num_tokens": 238132565.0, "step": 554 }, { "entropy": 0.38897705078125, "epoch": 2.176470588235294, "grad_norm": 0.7021643775294942, "learning_rate": 3.886669953445638e-06, "loss": 0.3314, "mean_token_accuracy": 0.880199084058404, "num_tokens": 238552169.0, "step": 555 }, { "entropy": 0.38323974609375, "epoch": 2.180392156862745, "grad_norm": 0.6201242758831783, "learning_rate": 3.852482488956992e-06, "loss": 0.3434, "mean_token_accuracy": 0.8788248943164945, "num_tokens": 238990692.0, "step": 556 }, { "entropy": 0.382843017578125, "epoch": 2.1843137254901963, "grad_norm": 0.677214260217929, "learning_rate": 3.818410137793947e-06, "loss": 0.3269, "mean_token_accuracy": 0.8832383183762431, "num_tokens": 239425631.0, "step": 557 }, { "entropy": 0.38092041015625, "epoch": 2.1882352941176473, "grad_norm": 0.6209875432176569, "learning_rate": 3.784453537967161e-06, "loss": 0.3352, "mean_token_accuracy": 0.8799733864143491, "num_tokens": 239882913.0, "step": 558 }, { "entropy": 0.38623046875, "epoch": 2.1921568627450982, "grad_norm": 0.6614671874297028, "learning_rate": 3.7506133253198173e-06, "loss": 0.3296, "mean_token_accuracy": 0.8816101523116231, "num_tokens": 240305968.0, "step": 559 }, { "entropy": 0.377227783203125, "epoch": 2.196078431372549, "grad_norm": 0.6899542910895241, "learning_rate": 3.7168901335157313e-06, "loss": 0.3332, "mean_token_accuracy": 0.8795541236177087, "num_tokens": 240745419.0, "step": 560 }, { "entropy": 0.375732421875, "epoch": 2.2, "grad_norm": 0.7037636770557544, "learning_rate": 3.683284594027492e-06, "loss": 0.3283, "mean_token_accuracy": 0.8814961658790708, "num_tokens": 241182851.0, "step": 561 }, { "entropy": 0.380126953125, "epoch": 2.203921568627451, "grad_norm": 0.6910588607446845, "learning_rate": 3.6497973361246153e-06, "loss": 0.3417, "mean_token_accuracy": 0.8777891835197806, "num_tokens": 241625327.0, "step": 562 }, { "entropy": 0.379852294921875, "epoch": 2.207843137254902, "grad_norm": 0.6505651822651323, "learning_rate": 3.6164289868617884e-06, "loss": 0.3221, "mean_token_accuracy": 0.8816747777163982, "num_tokens": 242060360.0, "step": 563 }, { "entropy": 0.3958740234375, "epoch": 2.211764705882353, "grad_norm": 0.650666952366499, "learning_rate": 3.583180171067101e-06, "loss": 0.3314, "mean_token_accuracy": 0.8786816103383899, "num_tokens": 242470897.0, "step": 564 }, { "entropy": 0.38525390625, "epoch": 2.215686274509804, "grad_norm": 0.6591628517737783, "learning_rate": 3.550051511330361e-06, "loss": 0.3229, "mean_token_accuracy": 0.8851354466751218, "num_tokens": 242894132.0, "step": 565 }, { "entropy": 0.389678955078125, "epoch": 2.219607843137255, "grad_norm": 0.6501659301424558, "learning_rate": 3.517043627991441e-06, "loss": 0.3296, "mean_token_accuracy": 0.8841333854943514, "num_tokens": 243315484.0, "step": 566 }, { "entropy": 0.381011962890625, "epoch": 2.223529411764706, "grad_norm": 0.6736372583136302, "learning_rate": 3.4841571391286466e-06, "loss": 0.3331, "mean_token_accuracy": 0.8803358906880021, "num_tokens": 243761682.0, "step": 567 }, { "entropy": 0.384857177734375, "epoch": 2.227450980392157, "grad_norm": 0.6385641107317546, "learning_rate": 3.4513926605471504e-06, "loss": 0.3232, "mean_token_accuracy": 0.8852359773591161, "num_tokens": 244217985.0, "step": 568 }, { "entropy": 0.38055419921875, "epoch": 2.231372549019608, "grad_norm": 0.6957847971556891, "learning_rate": 3.418750805767469e-06, "loss": 0.3213, "mean_token_accuracy": 0.8834536019712687, "num_tokens": 244651979.0, "step": 569 }, { "entropy": 0.37164306640625, "epoch": 2.235294117647059, "grad_norm": 0.6877384491865183, "learning_rate": 3.3862321860139578e-06, "loss": 0.3394, "mean_token_accuracy": 0.8793074004352093, "num_tokens": 245111895.0, "step": 570 }, { "entropy": 0.38189697265625, "epoch": 2.23921568627451, "grad_norm": 0.70793552801571, "learning_rate": 3.3538374102033865e-06, "loss": 0.3282, "mean_token_accuracy": 0.883271967060864, "num_tokens": 245523688.0, "step": 571 }, { "entropy": 0.378173828125, "epoch": 2.243137254901961, "grad_norm": 0.6522649260770007, "learning_rate": 3.3215670849335156e-06, "loss": 0.326, "mean_token_accuracy": 0.8824318377301097, "num_tokens": 245976983.0, "step": 572 }, { "entropy": 0.38177490234375, "epoch": 2.2470588235294118, "grad_norm": 0.6371798326781967, "learning_rate": 3.2894218144717473e-06, "loss": 0.336, "mean_token_accuracy": 0.87877048086375, "num_tokens": 246399132.0, "step": 573 }, { "entropy": 0.379364013671875, "epoch": 2.2509803921568627, "grad_norm": 0.6281985651614281, "learning_rate": 3.257402200743821e-06, "loss": 0.3314, "mean_token_accuracy": 0.8802892221137881, "num_tokens": 246829053.0, "step": 574 }, { "entropy": 0.3812255859375, "epoch": 2.2549019607843137, "grad_norm": 0.6429952031987357, "learning_rate": 3.2255088433225246e-06, "loss": 0.3348, "mean_token_accuracy": 0.8793577533215284, "num_tokens": 247284964.0, "step": 575 }, { "entropy": 0.38433837890625, "epoch": 2.2588235294117647, "grad_norm": 0.6591416460917164, "learning_rate": 3.19374233941647e-06, "loss": 0.3315, "mean_token_accuracy": 0.8827766692265868, "num_tokens": 247726518.0, "step": 576 }, { "entropy": 0.385955810546875, "epoch": 2.2627450980392156, "grad_norm": 0.6509477090622025, "learning_rate": 3.1621032838589307e-06, "loss": 0.3286, "mean_token_accuracy": 0.8831933671608567, "num_tokens": 248131065.0, "step": 577 }, { "entropy": 0.387359619140625, "epoch": 2.2666666666666666, "grad_norm": 0.6536748739080436, "learning_rate": 3.1305922690966705e-06, "loss": 0.3282, "mean_token_accuracy": 0.8814155226573348, "num_tokens": 248542637.0, "step": 578 }, { "entropy": 0.3798828125, "epoch": 2.2705882352941176, "grad_norm": 0.6326768923288004, "learning_rate": 3.099209885178882e-06, "loss": 0.3142, "mean_token_accuracy": 0.8848884990438819, "num_tokens": 248952209.0, "step": 579 }, { "entropy": 0.38568115234375, "epoch": 2.2745098039215685, "grad_norm": 0.6988942589342891, "learning_rate": 3.0679567197461135e-06, "loss": 0.3245, "mean_token_accuracy": 0.8818568410351872, "num_tokens": 249350069.0, "step": 580 }, { "entropy": 0.380950927734375, "epoch": 2.2784313725490195, "grad_norm": 0.6437532388899305, "learning_rate": 3.0368333580192734e-06, "loss": 0.3362, "mean_token_accuracy": 0.8800135338678956, "num_tokens": 249790943.0, "step": 581 }, { "entropy": 0.38519287109375, "epoch": 2.2823529411764705, "grad_norm": 0.6802253424414118, "learning_rate": 3.005840382788685e-06, "loss": 0.3285, "mean_token_accuracy": 0.8817092962563038, "num_tokens": 250212691.0, "step": 582 }, { "entropy": 0.385223388671875, "epoch": 2.2862745098039214, "grad_norm": 0.649252155054827, "learning_rate": 2.974978374403147e-06, "loss": 0.3395, "mean_token_accuracy": 0.8796671917662024, "num_tokens": 250645778.0, "step": 583 }, { "entropy": 0.385650634765625, "epoch": 2.2901960784313724, "grad_norm": 0.6266536100784984, "learning_rate": 2.944247910759097e-06, "loss": 0.3369, "mean_token_accuracy": 0.8803050145506859, "num_tokens": 251091472.0, "step": 584 }, { "entropy": 0.383941650390625, "epoch": 2.2941176470588234, "grad_norm": 0.6384710127584259, "learning_rate": 2.9136495672897592e-06, "loss": 0.3253, "mean_token_accuracy": 0.8817528560757637, "num_tokens": 251508403.0, "step": 585 }, { "entropy": 0.389373779296875, "epoch": 2.2980392156862743, "grad_norm": 0.6573963751809176, "learning_rate": 2.8831839169543998e-06, "loss": 0.3233, "mean_token_accuracy": 0.882289039902389, "num_tokens": 251930049.0, "step": 586 }, { "entropy": 0.384124755859375, "epoch": 2.3019607843137253, "grad_norm": 0.6252660748375973, "learning_rate": 2.852851530227566e-06, "loss": 0.3294, "mean_token_accuracy": 0.8839052841067314, "num_tokens": 252346931.0, "step": 587 }, { "entropy": 0.382720947265625, "epoch": 2.3058823529411763, "grad_norm": 0.8966787930059511, "learning_rate": 2.8226529750884403e-06, "loss": 0.3183, "mean_token_accuracy": 0.8840129124000669, "num_tokens": 252782839.0, "step": 588 }, { "entropy": 0.38275146484375, "epoch": 2.3098039215686272, "grad_norm": 0.625666212633788, "learning_rate": 2.7925888170101667e-06, "loss": 0.3276, "mean_token_accuracy": 0.8817493235692382, "num_tokens": 253210845.0, "step": 589 }, { "entropy": 0.384735107421875, "epoch": 2.313725490196078, "grad_norm": 0.6415168482799497, "learning_rate": 2.7626596189492983e-06, "loss": 0.3315, "mean_token_accuracy": 0.8816859601065516, "num_tokens": 253626459.0, "step": 590 }, { "entropy": 0.388214111328125, "epoch": 2.317647058823529, "grad_norm": 0.7267922599495401, "learning_rate": 2.7328659413352266e-06, "loss": 0.3155, "mean_token_accuracy": 0.8850083062425256, "num_tokens": 254051003.0, "step": 591 }, { "entropy": 0.3841552734375, "epoch": 2.3215686274509806, "grad_norm": 0.6591517111208269, "learning_rate": 2.7032083420597e-06, "loss": 0.3154, "mean_token_accuracy": 0.8871585316956043, "num_tokens": 254456077.0, "step": 592 }, { "entropy": 0.377532958984375, "epoch": 2.3254901960784315, "grad_norm": 0.6441326100639279, "learning_rate": 2.673687376466385e-06, "loss": 0.3077, "mean_token_accuracy": 0.8879004623740911, "num_tokens": 254877575.0, "step": 593 }, { "entropy": 0.385467529296875, "epoch": 2.3294117647058825, "grad_norm": 0.7976063543611124, "learning_rate": 2.6443035973404497e-06, "loss": 0.3349, "mean_token_accuracy": 0.880553713068366, "num_tokens": 255303970.0, "step": 594 }, { "entropy": 0.38482666015625, "epoch": 2.3333333333333335, "grad_norm": 0.6519593817176016, "learning_rate": 2.6150575548982295e-06, "loss": 0.3286, "mean_token_accuracy": 0.883098304271698, "num_tokens": 255742340.0, "step": 595 }, { "entropy": 0.380828857421875, "epoch": 2.3372549019607844, "grad_norm": 0.6652407912562928, "learning_rate": 2.585949796776912e-06, "loss": 0.3393, "mean_token_accuracy": 0.8775012623518705, "num_tokens": 256171483.0, "step": 596 }, { "entropy": 0.3831787109375, "epoch": 2.3411764705882354, "grad_norm": 0.6268195088150889, "learning_rate": 2.5569808680242826e-06, "loss": 0.3444, "mean_token_accuracy": 0.878986076451838, "num_tokens": 256605019.0, "step": 597 }, { "entropy": 0.381011962890625, "epoch": 2.3450980392156864, "grad_norm": 0.7284967186382716, "learning_rate": 2.528151311088537e-06, "loss": 0.3217, "mean_token_accuracy": 0.8857952449470758, "num_tokens": 257047852.0, "step": 598 }, { "entropy": 0.3873291015625, "epoch": 2.3490196078431373, "grad_norm": 0.6808759200253363, "learning_rate": 2.499461665808095e-06, "loss": 0.3361, "mean_token_accuracy": 0.8788221376016736, "num_tokens": 257484717.0, "step": 599 }, { "entropy": 0.39202880859375, "epoch": 2.3529411764705883, "grad_norm": 0.7302214681581166, "learning_rate": 2.470912469401512e-06, "loss": 0.3091, "mean_token_accuracy": 0.8890265924856067, "num_tokens": 257896029.0, "step": 600 }, { "entropy": 0.386993408203125, "epoch": 2.3568627450980393, "grad_norm": 0.6447515967872068, "learning_rate": 2.4425042564574186e-06, "loss": 0.3182, "mean_token_accuracy": 0.8860885631293058, "num_tokens": 258313253.0, "step": 601 }, { "entropy": 0.37738037109375, "epoch": 2.3607843137254902, "grad_norm": 0.6796476546531216, "learning_rate": 2.414237558924496e-06, "loss": 0.3139, "mean_token_accuracy": 0.8867853293195367, "num_tokens": 258739041.0, "step": 602 }, { "entropy": 0.375518798828125, "epoch": 2.364705882352941, "grad_norm": 0.6800822428528028, "learning_rate": 2.3861129061015355e-06, "loss": 0.3122, "mean_token_accuracy": 0.8855758523568511, "num_tokens": 259174100.0, "step": 603 }, { "entropy": 0.378173828125, "epoch": 2.368627450980392, "grad_norm": 0.6695828945285421, "learning_rate": 2.3581308246275103e-06, "loss": 0.314, "mean_token_accuracy": 0.8833903381600976, "num_tokens": 259597011.0, "step": 604 }, { "entropy": 0.372467041015625, "epoch": 2.372549019607843, "grad_norm": 0.6595833290855022, "learning_rate": 2.3302918384717175e-06, "loss": 0.3213, "mean_token_accuracy": 0.8831760762259364, "num_tokens": 260072716.0, "step": 605 }, { "entropy": 0.379364013671875, "epoch": 2.376470588235294, "grad_norm": 0.6213706069462764, "learning_rate": 2.302596468923981e-06, "loss": 0.313, "mean_token_accuracy": 0.8855204563587904, "num_tokens": 260504261.0, "step": 606 }, { "entropy": 0.37933349609375, "epoch": 2.380392156862745, "grad_norm": 0.7209555926267776, "learning_rate": 2.2750452345848684e-06, "loss": 0.3235, "mean_token_accuracy": 0.8838987145572901, "num_tokens": 260929791.0, "step": 607 }, { "entropy": 0.380828857421875, "epoch": 2.384313725490196, "grad_norm": 0.6636808498029044, "learning_rate": 2.247638651355991e-06, "loss": 0.3211, "mean_token_accuracy": 0.8834717646241188, "num_tokens": 261373621.0, "step": 608 }, { "entropy": 0.377593994140625, "epoch": 2.388235294117647, "grad_norm": 0.6570217727767104, "learning_rate": 2.220377232430353e-06, "loss": 0.3257, "mean_token_accuracy": 0.8826047889888287, "num_tokens": 261814225.0, "step": 609 }, { "entropy": 0.388275146484375, "epoch": 2.392156862745098, "grad_norm": 0.6547880967559162, "learning_rate": 2.1932614882827196e-06, "loss": 0.3324, "mean_token_accuracy": 0.8816813975572586, "num_tokens": 262224785.0, "step": 610 }, { "entropy": 0.378448486328125, "epoch": 2.396078431372549, "grad_norm": 0.6467394539814979, "learning_rate": 2.1662919266600814e-06, "loss": 0.3146, "mean_token_accuracy": 0.8854999002069235, "num_tokens": 262644460.0, "step": 611 }, { "entropy": 0.37994384765625, "epoch": 2.4, "grad_norm": 0.6657441544890862, "learning_rate": 2.1394690525721275e-06, "loss": 0.314, "mean_token_accuracy": 0.8874372495338321, "num_tokens": 263073097.0, "step": 612 }, { "entropy": 0.3804931640625, "epoch": 2.403921568627451, "grad_norm": 0.674015693139673, "learning_rate": 2.112793368281799e-06, "loss": 0.3149, "mean_token_accuracy": 0.8849289892241359, "num_tokens": 263502290.0, "step": 613 }, { "entropy": 0.380401611328125, "epoch": 2.407843137254902, "grad_norm": 0.6232086802467347, "learning_rate": 2.0862653732958914e-06, "loss": 0.3199, "mean_token_accuracy": 0.8842472266405821, "num_tokens": 263945936.0, "step": 614 }, { "entropy": 0.375823974609375, "epoch": 2.411764705882353, "grad_norm": 0.671370115531208, "learning_rate": 2.0598855643556824e-06, "loss": 0.3132, "mean_token_accuracy": 0.8874692656099796, "num_tokens": 264373906.0, "step": 615 }, { "entropy": 0.378387451171875, "epoch": 2.4156862745098038, "grad_norm": 0.7287126820698647, "learning_rate": 2.03365443542764e-06, "loss": 0.3275, "mean_token_accuracy": 0.8789404109120369, "num_tokens": 264796869.0, "step": 616 }, { "entropy": 0.3797607421875, "epoch": 2.4196078431372547, "grad_norm": 0.6769800485762081, "learning_rate": 2.0075724776941842e-06, "loss": 0.3175, "mean_token_accuracy": 0.8853352731093764, "num_tokens": 265211333.0, "step": 617 }, { "entropy": 0.37451171875, "epoch": 2.4235294117647057, "grad_norm": 0.6535066383233836, "learning_rate": 1.9816401795444664e-06, "loss": 0.3043, "mean_token_accuracy": 0.8888634694740176, "num_tokens": 265634712.0, "step": 618 }, { "entropy": 0.379425048828125, "epoch": 2.4274509803921567, "grad_norm": 0.6238556414580247, "learning_rate": 1.9558580265652448e-06, "loss": 0.3214, "mean_token_accuracy": 0.8860293151810765, "num_tokens": 266062848.0, "step": 619 }, { "entropy": 0.38507080078125, "epoch": 2.431372549019608, "grad_norm": 0.6581497114358473, "learning_rate": 1.93022650153178e-06, "loss": 0.3211, "mean_token_accuracy": 0.8834013622254133, "num_tokens": 266472687.0, "step": 620 }, { "entropy": 0.375823974609375, "epoch": 2.435294117647059, "grad_norm": 0.6183363507599968, "learning_rate": 1.9047460843987963e-06, "loss": 0.3102, "mean_token_accuracy": 0.8872299622744322, "num_tokens": 266906318.0, "step": 621 }, { "entropy": 0.379852294921875, "epoch": 2.43921568627451, "grad_norm": 0.6261417698058828, "learning_rate": 1.8794172522915022e-06, "loss": 0.3141, "mean_token_accuracy": 0.8863415522500873, "num_tokens": 267329307.0, "step": 622 }, { "entropy": 0.3753662109375, "epoch": 2.443137254901961, "grad_norm": 0.6350012536405131, "learning_rate": 1.854240479496643e-06, "loss": 0.3192, "mean_token_accuracy": 0.8854794921353459, "num_tokens": 267762405.0, "step": 623 }, { "entropy": 0.3779296875, "epoch": 2.447058823529412, "grad_norm": 0.6121205166878543, "learning_rate": 1.829216237453637e-06, "loss": 0.3162, "mean_token_accuracy": 0.884573670104146, "num_tokens": 268207007.0, "step": 624 }, { "entropy": 0.38092041015625, "epoch": 2.450980392156863, "grad_norm": 0.6158676556614557, "learning_rate": 1.804344994745727e-06, "loss": 0.3261, "mean_token_accuracy": 0.8815991748124361, "num_tokens": 268640841.0, "step": 625 }, { "entropy": 0.3822021484375, "epoch": 2.454901960784314, "grad_norm": 0.6844627149146101, "learning_rate": 1.7796272170912255e-06, "loss": 0.3305, "mean_token_accuracy": 0.882242445833981, "num_tokens": 269058281.0, "step": 626 }, { "entropy": 0.3802490234375, "epoch": 2.458823529411765, "grad_norm": 0.6351667381198283, "learning_rate": 1.7550633673347783e-06, "loss": 0.3182, "mean_token_accuracy": 0.8831239556893706, "num_tokens": 269500856.0, "step": 627 }, { "entropy": 0.37982177734375, "epoch": 2.462745098039216, "grad_norm": 0.6454334947707677, "learning_rate": 1.730653905438714e-06, "loss": 0.3115, "mean_token_accuracy": 0.8864633357152343, "num_tokens": 269916404.0, "step": 628 }, { "entropy": 0.3763427734375, "epoch": 2.466666666666667, "grad_norm": 0.6184143204773618, "learning_rate": 1.7063992884744096e-06, "loss": 0.3093, "mean_token_accuracy": 0.8881733799353242, "num_tokens": 270355413.0, "step": 629 }, { "entropy": 0.381805419921875, "epoch": 2.4705882352941178, "grad_norm": 0.631196154024867, "learning_rate": 1.6822999706137565e-06, "loss": 0.3189, "mean_token_accuracy": 0.8858399009332061, "num_tokens": 270795892.0, "step": 630 }, { "entropy": 0.37823486328125, "epoch": 2.4745098039215687, "grad_norm": 0.6231692839077786, "learning_rate": 1.6583564031206357e-06, "loss": 0.3246, "mean_token_accuracy": 0.8817324228584766, "num_tokens": 271230718.0, "step": 631 }, { "entropy": 0.384552001953125, "epoch": 2.4784313725490197, "grad_norm": 0.6458519012457633, "learning_rate": 1.6345690343424758e-06, "loss": 0.3266, "mean_token_accuracy": 0.8848798228427768, "num_tokens": 271656053.0, "step": 632 }, { "entropy": 0.38848876953125, "epoch": 2.4823529411764707, "grad_norm": 0.6081484349151737, "learning_rate": 1.6109383097018628e-06, "loss": 0.3043, "mean_token_accuracy": 0.887602380476892, "num_tokens": 272080970.0, "step": 633 }, { "entropy": 0.387176513671875, "epoch": 2.4862745098039216, "grad_norm": 0.6151092693902757, "learning_rate": 1.587464671688187e-06, "loss": 0.3147, "mean_token_accuracy": 0.8866362348198891, "num_tokens": 272507901.0, "step": 634 }, { "entropy": 0.3912353515625, "epoch": 2.4901960784313726, "grad_norm": 0.6543393306086803, "learning_rate": 1.5641485598493744e-06, "loss": 0.3174, "mean_token_accuracy": 0.8848397238180041, "num_tokens": 272903491.0, "step": 635 }, { "entropy": 0.378570556640625, "epoch": 2.4941176470588236, "grad_norm": 0.6289329862500893, "learning_rate": 1.540990410783636e-06, "loss": 0.3145, "mean_token_accuracy": 0.8862317893654108, "num_tokens": 273341473.0, "step": 636 }, { "entropy": 0.376922607421875, "epoch": 2.4980392156862745, "grad_norm": 0.6091966079924904, "learning_rate": 1.5179906581313063e-06, "loss": 0.318, "mean_token_accuracy": 0.8845315454527736, "num_tokens": 273777511.0, "step": 637 }, { "entropy": 0.383026123046875, "epoch": 2.5019607843137255, "grad_norm": 0.6690355442288055, "learning_rate": 1.495149732566723e-06, "loss": 0.3082, "mean_token_accuracy": 0.8887853538617492, "num_tokens": 274187994.0, "step": 638 }, { "entropy": 0.3792724609375, "epoch": 2.5058823529411764, "grad_norm": 0.5924866327426076, "learning_rate": 1.4724680617901565e-06, "loss": 0.3105, "mean_token_accuracy": 0.8864017892628908, "num_tokens": 274640249.0, "step": 639 }, { "entropy": 0.381591796875, "epoch": 2.5098039215686274, "grad_norm": 0.6192344124742932, "learning_rate": 1.4499460705198e-06, "loss": 0.3067, "mean_token_accuracy": 0.8865052172914147, "num_tokens": 275057297.0, "step": 640 }, { "entropy": 0.3802490234375, "epoch": 2.5137254901960784, "grad_norm": 0.659901776871247, "learning_rate": 1.4275841804838298e-06, "loss": 0.3159, "mean_token_accuracy": 0.8849191283807158, "num_tokens": 275471254.0, "step": 641 }, { "entropy": 0.37738037109375, "epoch": 2.5176470588235293, "grad_norm": 0.6638824013938602, "learning_rate": 1.4053828104124867e-06, "loss": 0.3253, "mean_token_accuracy": 0.8840452143922448, "num_tokens": 275891640.0, "step": 642 }, { "entropy": 0.382904052734375, "epoch": 2.5215686274509803, "grad_norm": 0.6641332193014144, "learning_rate": 1.383342376030261e-06, "loss": 0.3241, "mean_token_accuracy": 0.8852192750200629, "num_tokens": 276307124.0, "step": 643 }, { "entropy": 0.3870849609375, "epoch": 2.5254901960784313, "grad_norm": 0.6160087628480071, "learning_rate": 1.361463290048085e-06, "loss": 0.309, "mean_token_accuracy": 0.886401055380702, "num_tokens": 276704241.0, "step": 644 }, { "entropy": 0.376007080078125, "epoch": 2.5294117647058822, "grad_norm": 0.6571155139010876, "learning_rate": 1.339745962155613e-06, "loss": 0.3261, "mean_token_accuracy": 0.8838352803140879, "num_tokens": 277154479.0, "step": 645 }, { "entropy": 0.381927490234375, "epoch": 2.533333333333333, "grad_norm": 0.6297597292306508, "learning_rate": 1.3181907990135624e-06, "loss": 0.3073, "mean_token_accuracy": 0.8882267083972692, "num_tokens": 277567661.0, "step": 646 }, { "entropy": 0.37628173828125, "epoch": 2.537254901960784, "grad_norm": 0.6329324728088497, "learning_rate": 1.2967982042460758e-06, "loss": 0.3171, "mean_token_accuracy": 0.8866855762898922, "num_tokens": 278003814.0, "step": 647 }, { "entropy": 0.385406494140625, "epoch": 2.541176470588235, "grad_norm": 0.6162701360426496, "learning_rate": 1.2755685784331784e-06, "loss": 0.3204, "mean_token_accuracy": 0.884353194385767, "num_tokens": 278430116.0, "step": 648 }, { "entropy": 0.381744384765625, "epoch": 2.545098039215686, "grad_norm": 0.6126382608490024, "learning_rate": 1.25450231910328e-06, "loss": 0.3237, "mean_token_accuracy": 0.8824541233479977, "num_tokens": 278877688.0, "step": 649 }, { "entropy": 0.379852294921875, "epoch": 2.549019607843137, "grad_norm": 0.6326607766200493, "learning_rate": 1.2335998207257138e-06, "loss": 0.317, "mean_token_accuracy": 0.8837267532944679, "num_tokens": 279288243.0, "step": 650 }, { "entropy": 0.380035400390625, "epoch": 2.552941176470588, "grad_norm": 0.680830740300124, "learning_rate": 1.2128614747033728e-06, "loss": 0.3306, "mean_token_accuracy": 0.8820574702695012, "num_tokens": 279710264.0, "step": 651 }, { "entropy": 0.378814697265625, "epoch": 2.556862745098039, "grad_norm": 0.6099946083878783, "learning_rate": 1.1922876693653584e-06, "loss": 0.3114, "mean_token_accuracy": 0.8873716210946441, "num_tokens": 280131883.0, "step": 652 }, { "entropy": 0.377716064453125, "epoch": 2.56078431372549, "grad_norm": 0.6295409472221051, "learning_rate": 1.1718787899597239e-06, "loss": 0.3002, "mean_token_accuracy": 0.8915431387722492, "num_tokens": 280576360.0, "step": 653 }, { "entropy": 0.375396728515625, "epoch": 2.564705882352941, "grad_norm": 0.6209039798446115, "learning_rate": 1.1516352186462588e-06, "loss": 0.3198, "mean_token_accuracy": 0.8849526178091764, "num_tokens": 281018212.0, "step": 654 }, { "entropy": 0.37811279296875, "epoch": 2.568627450980392, "grad_norm": 0.6320912091179962, "learning_rate": 1.131557334489326e-06, "loss": 0.3049, "mean_token_accuracy": 0.8900288715958595, "num_tokens": 281432844.0, "step": 655 }, { "entropy": 0.38299560546875, "epoch": 2.572549019607843, "grad_norm": 0.6821011372034809, "learning_rate": 1.1116455134507665e-06, "loss": 0.3163, "mean_token_accuracy": 0.8853369234129786, "num_tokens": 281847459.0, "step": 656 }, { "entropy": 0.37677001953125, "epoch": 2.576470588235294, "grad_norm": 0.6235261380861172, "learning_rate": 1.0919001283828666e-06, "loss": 0.3133, "mean_token_accuracy": 0.8864665804430842, "num_tokens": 282286944.0, "step": 657 }, { "entropy": 0.38006591796875, "epoch": 2.5803921568627453, "grad_norm": 0.6221303988087854, "learning_rate": 1.0723215490213635e-06, "loss": 0.311, "mean_token_accuracy": 0.8873193822801113, "num_tokens": 282710819.0, "step": 658 }, { "entropy": 0.37469482421875, "epoch": 2.5843137254901962, "grad_norm": 0.6574754213052622, "learning_rate": 1.052910141978537e-06, "loss": 0.3125, "mean_token_accuracy": 0.886294306255877, "num_tokens": 283156052.0, "step": 659 }, { "entropy": 0.3814697265625, "epoch": 2.588235294117647, "grad_norm": 0.6320334063566375, "learning_rate": 1.0336662707363287e-06, "loss": 0.3173, "mean_token_accuracy": 0.8854138003662229, "num_tokens": 283595201.0, "step": 660 }, { "entropy": 0.376983642578125, "epoch": 2.592156862745098, "grad_norm": 0.619569861836383, "learning_rate": 1.0145902956395449e-06, "loss": 0.3154, "mean_token_accuracy": 0.8877551760524511, "num_tokens": 284021326.0, "step": 661 }, { "entropy": 0.378021240234375, "epoch": 2.596078431372549, "grad_norm": 0.6178131877623513, "learning_rate": 9.95682573889114e-07, "loss": 0.3101, "mean_token_accuracy": 0.8884122371673584, "num_tokens": 284444911.0, "step": 662 }, { "entropy": 0.374053955078125, "epoch": 2.6, "grad_norm": 0.6257874518096788, "learning_rate": 9.76943459535381e-07, "loss": 0.3054, "mean_token_accuracy": 0.8889162968844175, "num_tokens": 284883556.0, "step": 663 }, { "entropy": 0.381072998046875, "epoch": 2.603921568627451, "grad_norm": 0.608051038563699, "learning_rate": 9.583733034714982e-07, "loss": 0.3196, "mean_token_accuracy": 0.886277524754405, "num_tokens": 285330782.0, "step": 664 }, { "entropy": 0.3798828125, "epoch": 2.607843137254902, "grad_norm": 0.5909285116426348, "learning_rate": 9.399724534268385e-07, "loss": 0.3042, "mean_token_accuracy": 0.8899399247020483, "num_tokens": 285763663.0, "step": 665 }, { "entropy": 0.380279541015625, "epoch": 2.611764705882353, "grad_norm": 0.6431264626693568, "learning_rate": 9.217412539604942e-07, "loss": 0.2959, "mean_token_accuracy": 0.8899163901805878, "num_tokens": 286199562.0, "step": 666 }, { "entropy": 0.376007080078125, "epoch": 2.615686274509804, "grad_norm": 0.5982503195100611, "learning_rate": 9.036800464548157e-07, "loss": 0.3192, "mean_token_accuracy": 0.8865080736577511, "num_tokens": 286621995.0, "step": 667 }, { "entropy": 0.379364013671875, "epoch": 2.619607843137255, "grad_norm": 0.6270853297070971, "learning_rate": 8.857891691090336e-07, "loss": 0.3134, "mean_token_accuracy": 0.8862617230042815, "num_tokens": 287057155.0, "step": 668 }, { "entropy": 0.3865966796875, "epoch": 2.623529411764706, "grad_norm": 0.6300575883109812, "learning_rate": 8.680689569329071e-07, "loss": 0.3233, "mean_token_accuracy": 0.8842667685821652, "num_tokens": 287460969.0, "step": 669 }, { "entropy": 0.377655029296875, "epoch": 2.627450980392157, "grad_norm": 0.6453389395674596, "learning_rate": 8.505197417404687e-07, "loss": 0.3187, "mean_token_accuracy": 0.8853940945118666, "num_tokens": 287895047.0, "step": 670 }, { "entropy": 0.382720947265625, "epoch": 2.631372549019608, "grad_norm": 0.6238765655563359, "learning_rate": 8.331418521437973e-07, "loss": 0.3069, "mean_token_accuracy": 0.8877802221104503, "num_tokens": 288310065.0, "step": 671 }, { "entropy": 0.374603271484375, "epoch": 2.635294117647059, "grad_norm": 0.6047173519053458, "learning_rate": 8.159356135468721e-07, "loss": 0.3219, "mean_token_accuracy": 0.8853782610967755, "num_tokens": 288751087.0, "step": 672 }, { "entropy": 0.3780517578125, "epoch": 2.6392156862745098, "grad_norm": 0.622327693826135, "learning_rate": 7.989013481394813e-07, "loss": 0.304, "mean_token_accuracy": 0.8917795773595572, "num_tokens": 289180726.0, "step": 673 }, { "entropy": 0.380523681640625, "epoch": 2.6431372549019607, "grad_norm": 0.6222821808342242, "learning_rate": 7.820393748911792e-07, "loss": 0.3074, "mean_token_accuracy": 0.8889235118404031, "num_tokens": 289591818.0, "step": 674 }, { "entropy": 0.3770751953125, "epoch": 2.6470588235294117, "grad_norm": 0.6125349893982243, "learning_rate": 7.653500095453248e-07, "loss": 0.3086, "mean_token_accuracy": 0.8879644311964512, "num_tokens": 290041282.0, "step": 675 }, { "entropy": 0.3822021484375, "epoch": 2.6509803921568627, "grad_norm": 0.6282527306601516, "learning_rate": 7.488335646131628e-07, "loss": 0.3051, "mean_token_accuracy": 0.8894842406734824, "num_tokens": 290458439.0, "step": 676 }, { "entropy": 0.379425048828125, "epoch": 2.6549019607843136, "grad_norm": 0.6299084942404871, "learning_rate": 7.324903493679703e-07, "loss": 0.3097, "mean_token_accuracy": 0.8882144782692194, "num_tokens": 290895655.0, "step": 677 }, { "entropy": 0.37530517578125, "epoch": 2.6588235294117646, "grad_norm": 0.6069024822726714, "learning_rate": 7.163206698392744e-07, "loss": 0.3099, "mean_token_accuracy": 0.8873086804524064, "num_tokens": 291353643.0, "step": 678 }, { "entropy": 0.382171630859375, "epoch": 2.6627450980392156, "grad_norm": 0.6334982938868678, "learning_rate": 7.003248288071118e-07, "loss": 0.3144, "mean_token_accuracy": 0.887144822627306, "num_tokens": 291762961.0, "step": 679 }, { "entropy": 0.38092041015625, "epoch": 2.6666666666666665, "grad_norm": 0.6488389669112627, "learning_rate": 6.845031257963619e-07, "loss": 0.316, "mean_token_accuracy": 0.8869073716923594, "num_tokens": 292193427.0, "step": 680 }, { "entropy": 0.38385009765625, "epoch": 2.6705882352941175, "grad_norm": 0.6227550674311402, "learning_rate": 6.688558570711468e-07, "loss": 0.3166, "mean_token_accuracy": 0.8865640126168728, "num_tokens": 292607942.0, "step": 681 }, { "entropy": 0.38037109375, "epoch": 2.674509803921569, "grad_norm": 0.6169942122797281, "learning_rate": 6.53383315629268e-07, "loss": 0.3048, "mean_token_accuracy": 0.8906151866540313, "num_tokens": 293040009.0, "step": 682 }, { "entropy": 0.375091552734375, "epoch": 2.67843137254902, "grad_norm": 0.615932545560767, "learning_rate": 6.380857911967364e-07, "loss": 0.3065, "mean_token_accuracy": 0.888974635861814, "num_tokens": 293489302.0, "step": 683 }, { "entropy": 0.3773193359375, "epoch": 2.682352941176471, "grad_norm": 0.6224114881397005, "learning_rate": 6.229635702223325e-07, "loss": 0.3114, "mean_token_accuracy": 0.8888672534376383, "num_tokens": 293918626.0, "step": 684 }, { "entropy": 0.381591796875, "epoch": 2.686274509803922, "grad_norm": 0.6075798679022492, "learning_rate": 6.08016935872251e-07, "loss": 0.3071, "mean_token_accuracy": 0.8886892907321453, "num_tokens": 294344050.0, "step": 685 }, { "entropy": 0.375885009765625, "epoch": 2.6901960784313728, "grad_norm": 0.6348423219100143, "learning_rate": 5.932461680248014e-07, "loss": 0.3022, "mean_token_accuracy": 0.8917614500969648, "num_tokens": 294782851.0, "step": 686 }, { "entropy": 0.384307861328125, "epoch": 2.6941176470588237, "grad_norm": 0.627257222117316, "learning_rate": 5.786515432651563e-07, "loss": 0.3033, "mean_token_accuracy": 0.8891853494569659, "num_tokens": 295179548.0, "step": 687 }, { "entropy": 0.37713623046875, "epoch": 2.6980392156862747, "grad_norm": 0.6233347416359626, "learning_rate": 5.64233334880181e-07, "loss": 0.3184, "mean_token_accuracy": 0.8849873188883066, "num_tokens": 295611827.0, "step": 688 }, { "entropy": 0.3787841796875, "epoch": 2.7019607843137257, "grad_norm": 0.6121392174435754, "learning_rate": 5.499918128533155e-07, "loss": 0.3126, "mean_token_accuracy": 0.8860092610120773, "num_tokens": 296030475.0, "step": 689 }, { "entropy": 0.37786865234375, "epoch": 2.7058823529411766, "grad_norm": 0.6016793737256022, "learning_rate": 5.359272438595153e-07, "loss": 0.3069, "mean_token_accuracy": 0.8904592543840408, "num_tokens": 296453853.0, "step": 690 }, { "entropy": 0.377716064453125, "epoch": 2.7098039215686276, "grad_norm": 0.6620926577713818, "learning_rate": 5.22039891260262e-07, "loss": 0.3257, "mean_token_accuracy": 0.8827581582590938, "num_tokens": 296901649.0, "step": 691 }, { "entropy": 0.38031005859375, "epoch": 2.7137254901960786, "grad_norm": 0.6209236794788132, "learning_rate": 5.083300150986259e-07, "loss": 0.3133, "mean_token_accuracy": 0.8865346414968371, "num_tokens": 297319515.0, "step": 692 }, { "entropy": 0.38470458984375, "epoch": 2.7176470588235295, "grad_norm": 0.6169264345057406, "learning_rate": 4.947978720944025e-07, "loss": 0.3281, "mean_token_accuracy": 0.883204753510654, "num_tokens": 297749112.0, "step": 693 }, { "entropy": 0.3839111328125, "epoch": 2.7215686274509805, "grad_norm": 0.7143983827383057, "learning_rate": 4.814437156393048e-07, "loss": 0.3114, "mean_token_accuracy": 0.8880084175616503, "num_tokens": 298166481.0, "step": 694 }, { "entropy": 0.375640869140625, "epoch": 2.7254901960784315, "grad_norm": 0.6088784931751031, "learning_rate": 4.682677957922155e-07, "loss": 0.3069, "mean_token_accuracy": 0.8904805909842253, "num_tokens": 298606282.0, "step": 695 }, { "entropy": 0.378326416015625, "epoch": 2.7294117647058824, "grad_norm": 0.6317368332658094, "learning_rate": 4.5527035927450337e-07, "loss": 0.31, "mean_token_accuracy": 0.887597025372088, "num_tokens": 299050219.0, "step": 696 }, { "entropy": 0.379425048828125, "epoch": 2.7333333333333334, "grad_norm": 0.6095714519363978, "learning_rate": 4.424516494654119e-07, "loss": 0.3155, "mean_token_accuracy": 0.8861374296247959, "num_tokens": 299491767.0, "step": 697 }, { "entropy": 0.376434326171875, "epoch": 2.7372549019607844, "grad_norm": 0.5957575584979452, "learning_rate": 4.298119063974915e-07, "loss": 0.3075, "mean_token_accuracy": 0.8884414276108146, "num_tokens": 299937012.0, "step": 698 }, { "entropy": 0.38043212890625, "epoch": 2.7411764705882353, "grad_norm": 0.6145754054259317, "learning_rate": 4.173513667521123e-07, "loss": 0.3231, "mean_token_accuracy": 0.8824452431872487, "num_tokens": 300366708.0, "step": 699 }, { "entropy": 0.377410888671875, "epoch": 2.7450980392156863, "grad_norm": 0.596226526796468, "learning_rate": 4.0507026385502747e-07, "loss": 0.3185, "mean_token_accuracy": 0.8844813704490662, "num_tokens": 300808950.0, "step": 700 }, { "entropy": 0.387542724609375, "epoch": 2.7490196078431373, "grad_norm": 0.6125484910454695, "learning_rate": 3.929688276720045e-07, "loss": 0.3061, "mean_token_accuracy": 0.8885686565190554, "num_tokens": 301222241.0, "step": 701 }, { "entropy": 0.3780517578125, "epoch": 2.7529411764705882, "grad_norm": 0.624916010504038, "learning_rate": 3.810472848045266e-07, "loss": 0.3177, "mean_token_accuracy": 0.8858394585549831, "num_tokens": 301663064.0, "step": 702 }, { "entropy": 0.376312255859375, "epoch": 2.756862745098039, "grad_norm": 0.6058817794906675, "learning_rate": 3.693058584855369e-07, "loss": 0.3151, "mean_token_accuracy": 0.8896542508155107, "num_tokens": 302117732.0, "step": 703 }, { "entropy": 0.379486083984375, "epoch": 2.76078431372549, "grad_norm": 0.6464785466948662, "learning_rate": 3.5774476857527107e-07, "loss": 0.3153, "mean_token_accuracy": 0.8852464202791452, "num_tokens": 302565320.0, "step": 704 }, { "entropy": 0.3858642578125, "epoch": 2.764705882352941, "grad_norm": 0.6403154212673284, "learning_rate": 3.463642315571292e-07, "loss": 0.3057, "mean_token_accuracy": 0.8876703213900328, "num_tokens": 302985307.0, "step": 705 }, { "entropy": 0.3782958984375, "epoch": 2.768627450980392, "grad_norm": 0.6095940050206947, "learning_rate": 3.3516446053363015e-07, "loss": 0.3082, "mean_token_accuracy": 0.8882761783897877, "num_tokens": 303424607.0, "step": 706 }, { "entropy": 0.379974365234375, "epoch": 2.772549019607843, "grad_norm": 0.6161476280093703, "learning_rate": 3.241456652224184e-07, "loss": 0.3138, "mean_token_accuracy": 0.8859195495024323, "num_tokens": 303860684.0, "step": 707 }, { "entropy": 0.379730224609375, "epoch": 2.776470588235294, "grad_norm": 0.5960959951045524, "learning_rate": 3.1330805195233684e-07, "loss": 0.3058, "mean_token_accuracy": 0.8890456901863217, "num_tokens": 304293142.0, "step": 708 }, { "entropy": 0.38690185546875, "epoch": 2.780392156862745, "grad_norm": 0.6160682077130808, "learning_rate": 3.0265182365956213e-07, "loss": 0.309, "mean_token_accuracy": 0.8865752117708325, "num_tokens": 304706003.0, "step": 709 }, { "entropy": 0.38427734375, "epoch": 2.784313725490196, "grad_norm": 0.6111529979519884, "learning_rate": 2.921771798838069e-07, "loss": 0.3237, "mean_token_accuracy": 0.8845190508291125, "num_tokens": 305131254.0, "step": 710 }, { "entropy": 0.377349853515625, "epoch": 2.788235294117647, "grad_norm": 0.6099627522064227, "learning_rate": 2.818843167645835e-07, "loss": 0.3129, "mean_token_accuracy": 0.8868130994960666, "num_tokens": 305567833.0, "step": 711 }, { "entropy": 0.385406494140625, "epoch": 2.792156862745098, "grad_norm": 0.6105851628336647, "learning_rate": 2.717734270375272e-07, "loss": 0.3197, "mean_token_accuracy": 0.8870526794344187, "num_tokens": 305990304.0, "step": 712 }, { "entropy": 0.386505126953125, "epoch": 2.796078431372549, "grad_norm": 0.6247273295141441, "learning_rate": 2.618447000307922e-07, "loss": 0.3137, "mean_token_accuracy": 0.8874122854322195, "num_tokens": 306395161.0, "step": 713 }, { "entropy": 0.381256103515625, "epoch": 2.8, "grad_norm": 0.635299701134664, "learning_rate": 2.520983216615047e-07, "loss": 0.298, "mean_token_accuracy": 0.8914454290643334, "num_tokens": 306825012.0, "step": 714 }, { "entropy": 0.38519287109375, "epoch": 2.803921568627451, "grad_norm": 0.595748483597398, "learning_rate": 2.4253447443228106e-07, "loss": 0.3045, "mean_token_accuracy": 0.8908979864791036, "num_tokens": 307253222.0, "step": 715 }, { "entropy": 0.381011962890625, "epoch": 2.8078431372549018, "grad_norm": 0.6134283304279732, "learning_rate": 2.3315333742780942e-07, "loss": 0.3175, "mean_token_accuracy": 0.8856520559638739, "num_tokens": 307689012.0, "step": 716 }, { "entropy": 0.379150390625, "epoch": 2.8117647058823527, "grad_norm": 0.6057712094651984, "learning_rate": 2.23955086311497e-07, "loss": 0.3126, "mean_token_accuracy": 0.8885315740481019, "num_tokens": 308125986.0, "step": 717 }, { "entropy": 0.3779296875, "epoch": 2.8156862745098037, "grad_norm": 0.635688295650433, "learning_rate": 2.1493989332218468e-07, "loss": 0.306, "mean_token_accuracy": 0.8884778618812561, "num_tokens": 308545495.0, "step": 718 }, { "entropy": 0.380035400390625, "epoch": 2.8196078431372547, "grad_norm": 0.6261639298694902, "learning_rate": 2.0610792727091434e-07, "loss": 0.3156, "mean_token_accuracy": 0.886909999884665, "num_tokens": 308975486.0, "step": 719 }, { "entropy": 0.38824462890625, "epoch": 2.8235294117647056, "grad_norm": 0.6134392371302576, "learning_rate": 1.9745935353777222e-07, "loss": 0.3067, "mean_token_accuracy": 0.8894705669954419, "num_tokens": 309390246.0, "step": 720 }, { "entropy": 0.376953125, "epoch": 2.8274509803921566, "grad_norm": 0.6215029371259475, "learning_rate": 1.889943340687961e-07, "loss": 0.3069, "mean_token_accuracy": 0.8888574000447989, "num_tokens": 309818332.0, "step": 721 }, { "entropy": 0.376678466796875, "epoch": 2.831372549019608, "grad_norm": 0.6070304102792409, "learning_rate": 1.8071302737293294e-07, "loss": 0.3059, "mean_token_accuracy": 0.8882985869422555, "num_tokens": 310258005.0, "step": 722 }, { "entropy": 0.38250732421875, "epoch": 2.835294117647059, "grad_norm": 0.611400797022777, "learning_rate": 1.7261558851908056e-07, "loss": 0.3103, "mean_token_accuracy": 0.8894185777753592, "num_tokens": 310676709.0, "step": 723 }, { "entropy": 0.377838134765625, "epoch": 2.83921568627451, "grad_norm": 0.6356223613609566, "learning_rate": 1.6470216913317628e-07, "loss": 0.3143, "mean_token_accuracy": 0.8873595865443349, "num_tokens": 311108400.0, "step": 724 }, { "entropy": 0.383544921875, "epoch": 2.843137254901961, "grad_norm": 0.6229792377890674, "learning_rate": 1.569729173953638e-07, "loss": 0.3033, "mean_token_accuracy": 0.8884387910366058, "num_tokens": 311510481.0, "step": 725 }, { "entropy": 0.37615966796875, "epoch": 2.847058823529412, "grad_norm": 0.6431143365231876, "learning_rate": 1.4942797803721543e-07, "loss": 0.3218, "mean_token_accuracy": 0.8845628844574094, "num_tokens": 311942786.0, "step": 726 }, { "entropy": 0.38525390625, "epoch": 2.850980392156863, "grad_norm": 0.6206033497042358, "learning_rate": 1.4206749233902084e-07, "loss": 0.3054, "mean_token_accuracy": 0.8902052519842982, "num_tokens": 312370010.0, "step": 727 }, { "entropy": 0.3773193359375, "epoch": 2.854901960784314, "grad_norm": 0.6498627060531488, "learning_rate": 1.348915981271437e-07, "loss": 0.3177, "mean_token_accuracy": 0.885548148304224, "num_tokens": 312798692.0, "step": 728 }, { "entropy": 0.3800048828125, "epoch": 2.8588235294117648, "grad_norm": 0.6411848862187207, "learning_rate": 1.2790042977144256e-07, "loss": 0.3082, "mean_token_accuracy": 0.8862299472093582, "num_tokens": 313206172.0, "step": 729 }, { "entropy": 0.379302978515625, "epoch": 2.8627450980392157, "grad_norm": 0.626796859270999, "learning_rate": 1.2109411818274851e-07, "loss": 0.3128, "mean_token_accuracy": 0.8862822419032454, "num_tokens": 313626626.0, "step": 730 }, { "entropy": 0.376983642578125, "epoch": 2.8666666666666667, "grad_norm": 0.6173793755783579, "learning_rate": 1.1447279081042262e-07, "loss": 0.3091, "mean_token_accuracy": 0.8877417473122478, "num_tokens": 314049995.0, "step": 731 }, { "entropy": 0.3756103515625, "epoch": 2.8705882352941177, "grad_norm": 0.6153282901073235, "learning_rate": 1.0803657163995896e-07, "loss": 0.3135, "mean_token_accuracy": 0.8866222072392702, "num_tokens": 314479750.0, "step": 732 }, { "entropy": 0.380462646484375, "epoch": 2.8745098039215686, "grad_norm": 0.6837019195802836, "learning_rate": 1.0178558119067316e-07, "loss": 0.3243, "mean_token_accuracy": 0.886231679469347, "num_tokens": 314909938.0, "step": 733 }, { "entropy": 0.377532958984375, "epoch": 2.8784313725490196, "grad_norm": 0.599153352101819, "learning_rate": 9.571993651343869e-08, "loss": 0.3156, "mean_token_accuracy": 0.8857722133398056, "num_tokens": 315360267.0, "step": 734 }, { "entropy": 0.387359619140625, "epoch": 2.8823529411764706, "grad_norm": 0.6417439052424432, "learning_rate": 8.983975118849853e-08, "loss": 0.32, "mean_token_accuracy": 0.8861173130571842, "num_tokens": 315778855.0, "step": 735 }, { "entropy": 0.3782958984375, "epoch": 2.8862745098039215, "grad_norm": 0.6249131544678768, "learning_rate": 8.41451353233369e-08, "loss": 0.3139, "mean_token_accuracy": 0.8885722318664193, "num_tokens": 316222091.0, "step": 736 }, { "entropy": 0.378143310546875, "epoch": 2.8901960784313725, "grad_norm": 0.6129592636225704, "learning_rate": 7.863619555061874e-08, "loss": 0.2987, "mean_token_accuracy": 0.8897266024723649, "num_tokens": 316647632.0, "step": 737 }, { "entropy": 0.378021240234375, "epoch": 2.8941176470588235, "grad_norm": 0.62004735541224, "learning_rate": 7.331303502618903e-08, "loss": 0.3088, "mean_token_accuracy": 0.8891623057425022, "num_tokens": 317083092.0, "step": 738 }, { "entropy": 0.379150390625, "epoch": 2.8980392156862744, "grad_norm": 0.6220730786367578, "learning_rate": 6.817575342714988e-08, "loss": 0.3165, "mean_token_accuracy": 0.8854466071352363, "num_tokens": 317521244.0, "step": 739 }, { "entropy": 0.378814697265625, "epoch": 2.9019607843137254, "grad_norm": 0.6132017832098181, "learning_rate": 6.32244469499832e-08, "loss": 0.3166, "mean_token_accuracy": 0.8851985028013587, "num_tokens": 317962973.0, "step": 740 }, { "entropy": 0.381866455078125, "epoch": 2.9058823529411764, "grad_norm": 0.602046192463936, "learning_rate": 5.845920830875651e-08, "loss": 0.3097, "mean_token_accuracy": 0.8887276640161872, "num_tokens": 318402805.0, "step": 741 }, { "entropy": 0.38330078125, "epoch": 2.9098039215686273, "grad_norm": 0.6594166630680239, "learning_rate": 5.388012673338661e-08, "loss": 0.3263, "mean_token_accuracy": 0.8835462033748627, "num_tokens": 318832764.0, "step": 742 }, { "entropy": 0.3841552734375, "epoch": 2.9137254901960783, "grad_norm": 0.6003852014770814, "learning_rate": 4.9487287967964206e-08, "loss": 0.3139, "mean_token_accuracy": 0.8860819693654776, "num_tokens": 319258968.0, "step": 743 }, { "entropy": 0.372955322265625, "epoch": 2.9176470588235293, "grad_norm": 0.6042410386322835, "learning_rate": 4.528077426915412e-08, "loss": 0.309, "mean_token_accuracy": 0.8899551248177886, "num_tokens": 319697667.0, "step": 744 }, { "entropy": 0.37725830078125, "epoch": 2.9215686274509802, "grad_norm": 0.6075840713659405, "learning_rate": 4.126066440464982e-08, "loss": 0.309, "mean_token_accuracy": 0.8887394778430462, "num_tokens": 320116941.0, "step": 745 }, { "entropy": 0.385833740234375, "epoch": 2.9254901960784316, "grad_norm": 0.6025272498394463, "learning_rate": 3.7427033651702414e-08, "loss": 0.3113, "mean_token_accuracy": 0.8870739573612809, "num_tokens": 320545709.0, "step": 746 }, { "entropy": 0.38299560546875, "epoch": 2.9294117647058826, "grad_norm": 0.6081689478979662, "learning_rate": 3.377995379570731e-08, "loss": 0.3274, "mean_token_accuracy": 0.8839912796393037, "num_tokens": 320960157.0, "step": 747 }, { "entropy": 0.3834228515625, "epoch": 2.9333333333333336, "grad_norm": 0.5859210580796687, "learning_rate": 3.03194931288664e-08, "loss": 0.3094, "mean_token_accuracy": 0.8882154319435358, "num_tokens": 321386558.0, "step": 748 }, { "entropy": 0.3800048828125, "epoch": 2.9372549019607845, "grad_norm": 0.6840561170761486, "learning_rate": 2.7045716448901305e-08, "loss": 0.3073, "mean_token_accuracy": 0.8912063157185912, "num_tokens": 321808424.0, "step": 749 }, { "entropy": 0.38714599609375, "epoch": 2.9411764705882355, "grad_norm": 0.6504399582680688, "learning_rate": 2.3958685057844378e-08, "loss": 0.3176, "mean_token_accuracy": 0.8845123695209622, "num_tokens": 322199235.0, "step": 750 }, { "entropy": 0.379180908203125, "epoch": 2.9450980392156865, "grad_norm": 0.6102222267932175, "learning_rate": 2.10584567608918e-08, "loss": 0.3143, "mean_token_accuracy": 0.8865124061703682, "num_tokens": 322626166.0, "step": 751 }, { "entropy": 0.38189697265625, "epoch": 2.9490196078431374, "grad_norm": 0.5963276311619458, "learning_rate": 1.83450858653178e-08, "loss": 0.3147, "mean_token_accuracy": 0.8874166300520301, "num_tokens": 323061501.0, "step": 752 }, { "entropy": 0.384246826171875, "epoch": 2.9529411764705884, "grad_norm": 0.6117631243948043, "learning_rate": 1.5818623179459924e-08, "loss": 0.308, "mean_token_accuracy": 0.8874607440084219, "num_tokens": 323481673.0, "step": 753 }, { "entropy": 0.38275146484375, "epoch": 2.9568627450980394, "grad_norm": 0.6167324543611288, "learning_rate": 1.3479116011769766e-08, "loss": 0.3286, "mean_token_accuracy": 0.8801386319100857, "num_tokens": 323906728.0, "step": 754 }, { "entropy": 0.3828125, "epoch": 2.9607843137254903, "grad_norm": 0.5960943457045179, "learning_rate": 1.1326608169920373e-08, "loss": 0.3375, "mean_token_accuracy": 0.8788588084280491, "num_tokens": 324319349.0, "step": 755 }, { "entropy": 0.383697509765625, "epoch": 2.9647058823529413, "grad_norm": 0.5965957357845432, "learning_rate": 9.361139959993549e-09, "loss": 0.3146, "mean_token_accuracy": 0.8867331743240356, "num_tokens": 324746953.0, "step": 756 }, { "entropy": 0.379486083984375, "epoch": 2.9686274509803923, "grad_norm": 0.636712886908032, "learning_rate": 7.582748185719357e-09, "loss": 0.3261, "mean_token_accuracy": 0.8830597475171089, "num_tokens": 325186860.0, "step": 757 }, { "entropy": 0.388946533203125, "epoch": 2.9725490196078432, "grad_norm": 0.6118740609991475, "learning_rate": 5.991466147791114e-09, "loss": 0.312, "mean_token_accuracy": 0.8873140625655651, "num_tokens": 325595646.0, "step": 758 }, { "entropy": 0.38189697265625, "epoch": 2.976470588235294, "grad_norm": 0.6001174174999216, "learning_rate": 4.587323643240327e-09, "loss": 0.3126, "mean_token_accuracy": 0.885263648815453, "num_tokens": 326034848.0, "step": 759 }, { "entropy": 0.37396240234375, "epoch": 2.980392156862745, "grad_norm": 0.6464429544279401, "learning_rate": 3.3703469648760367e-09, "loss": 0.3078, "mean_token_accuracy": 0.8882996095344424, "num_tokens": 326467480.0, "step": 760 }, { "entropy": 0.375457763671875, "epoch": 2.984313725490196, "grad_norm": 0.6289692164207805, "learning_rate": 2.340558900796319e-09, "loss": 0.3076, "mean_token_accuracy": 0.8890138473361731, "num_tokens": 326902334.0, "step": 761 }, { "entropy": 0.384246826171875, "epoch": 2.988235294117647, "grad_norm": 0.6949597130048252, "learning_rate": 1.497978733961958e-09, "loss": 0.3198, "mean_token_accuracy": 0.8849008204415441, "num_tokens": 327332137.0, "step": 762 }, { "entropy": 0.38165283203125, "epoch": 2.992156862745098, "grad_norm": 0.630348147185592, "learning_rate": 8.426222418311814e-10, "loss": 0.3024, "mean_token_accuracy": 0.8867057710886002, "num_tokens": 327746883.0, "step": 763 }, { "entropy": 0.383331298828125, "epoch": 2.996078431372549, "grad_norm": 0.6041240098580235, "learning_rate": 3.745016960665648e-10, "loss": 0.3177, "mean_token_accuracy": 0.8889250578358769, "num_tokens": 328189791.0, "step": 764 }, { "entropy": 0.379364013671875, "epoch": 3.0, "grad_norm": 0.5940158938944181, "learning_rate": 9.362586230632353e-11, "loss": 0.3202, "mean_token_accuracy": 0.8860361827537417, "num_tokens": 328632895.0, "step": 765 }, { "epoch": 3.0, "step": 765, "total_flos": 607742654087168.0, "train_loss": 0.4391140220601574, "train_runtime": 58765.7127, "train_samples_per_second": 1.258, "train_steps_per_second": 0.013 } ], "logging_steps": 1, "max_steps": 765, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 64, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 607742654087168.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }