{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 756, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.5677337646484375, "epoch": 0.007936507936507936, "grad_norm": 5.825740150213408, "learning_rate": 0.0, "loss": 1.3956, "mean_token_accuracy": 0.6547382255084813, "num_tokens": 849869.0, "step": 1 }, { "entropy": 0.569549560546875, "epoch": 0.015873015873015872, "grad_norm": 5.801156934108041, "learning_rate": 2.6315789473684213e-07, "loss": 1.4001, "mean_token_accuracy": 0.6515501267276704, "num_tokens": 1710146.0, "step": 2 }, { "entropy": 0.5733184814453125, "epoch": 0.023809523809523808, "grad_norm": 5.697225721311094, "learning_rate": 5.263157894736843e-07, "loss": 1.3825, "mean_token_accuracy": 0.6571523365564644, "num_tokens": 2560005.0, "step": 3 }, { "entropy": 0.5648651123046875, "epoch": 0.031746031746031744, "grad_norm": 5.692098743617845, "learning_rate": 7.894736842105263e-07, "loss": 1.3997, "mean_token_accuracy": 0.65298081189394, "num_tokens": 3457966.0, "step": 4 }, { "entropy": 0.57421875, "epoch": 0.03968253968253968, "grad_norm": 5.78106605094393, "learning_rate": 1.0526315789473685e-06, "loss": 1.4008, "mean_token_accuracy": 0.6524212104268372, "num_tokens": 4321827.0, "step": 5 }, { "entropy": 0.5650482177734375, "epoch": 0.047619047619047616, "grad_norm": 5.616712544244806, "learning_rate": 1.3157894736842106e-06, "loss": 1.3776, "mean_token_accuracy": 0.6610458297654986, "num_tokens": 5188122.0, "step": 6 }, { "entropy": 0.5651702880859375, "epoch": 0.05555555555555555, "grad_norm": 5.431878258315051, "learning_rate": 1.5789473684210526e-06, "loss": 1.3756, "mean_token_accuracy": 0.6562704290263355, "num_tokens": 6042413.0, "step": 7 }, { "entropy": 0.5759429931640625, "epoch": 0.06349206349206349, "grad_norm": 5.337410424762961, "learning_rate": 1.8421052631578948e-06, "loss": 1.3735, "mean_token_accuracy": 0.6550004091113806, "num_tokens": 6898441.0, "step": 8 }, { "entropy": 0.562835693359375, "epoch": 0.07142857142857142, "grad_norm": 5.254246336716741, "learning_rate": 2.105263157894737e-06, "loss": 1.3621, "mean_token_accuracy": 0.6594638815149665, "num_tokens": 7794638.0, "step": 9 }, { "entropy": 0.5644378662109375, "epoch": 0.07936507936507936, "grad_norm": 4.504374040638264, "learning_rate": 2.368421052631579e-06, "loss": 1.3207, "mean_token_accuracy": 0.6636323751881719, "num_tokens": 8673402.0, "step": 10 }, { "entropy": 0.5662384033203125, "epoch": 0.0873015873015873, "grad_norm": 4.290465182305912, "learning_rate": 2.631578947368421e-06, "loss": 1.2869, "mean_token_accuracy": 0.6718250620178878, "num_tokens": 9525436.0, "step": 11 }, { "entropy": 0.573974609375, "epoch": 0.09523809523809523, "grad_norm": 4.112783175310539, "learning_rate": 2.8947368421052634e-06, "loss": 1.2744, "mean_token_accuracy": 0.6722556869499385, "num_tokens": 10358777.0, "step": 12 }, { "entropy": 0.5698089599609375, "epoch": 0.10317460317460317, "grad_norm": 3.231368850972016, "learning_rate": 3.157894736842105e-06, "loss": 1.1762, "mean_token_accuracy": 0.6900928025133908, "num_tokens": 11211677.0, "step": 13 }, { "entropy": 0.568206787109375, "epoch": 0.1111111111111111, "grad_norm": 3.2472932637870775, "learning_rate": 3.421052631578948e-06, "loss": 1.1595, "mean_token_accuracy": 0.6925145331770182, "num_tokens": 12067363.0, "step": 14 }, { "entropy": 0.5522003173828125, "epoch": 0.11904761904761904, "grad_norm": 2.955470813245027, "learning_rate": 3.6842105263157896e-06, "loss": 1.1484, "mean_token_accuracy": 0.6949247056618333, "num_tokens": 12945458.0, "step": 15 }, { "entropy": 0.5524139404296875, "epoch": 0.12698412698412698, "grad_norm": 2.953534618001751, "learning_rate": 3.947368421052632e-06, "loss": 1.1231, "mean_token_accuracy": 0.6996096298098564, "num_tokens": 13815066.0, "step": 16 }, { "entropy": 0.5273590087890625, "epoch": 0.1349206349206349, "grad_norm": 3.6738847590405457, "learning_rate": 4.210526315789474e-06, "loss": 1.0456, "mean_token_accuracy": 0.715466492343694, "num_tokens": 14685173.0, "step": 17 }, { "entropy": 0.5328826904296875, "epoch": 0.14285714285714285, "grad_norm": 3.975231208187267, "learning_rate": 4.473684210526316e-06, "loss": 1.0218, "mean_token_accuracy": 0.7184609142132103, "num_tokens": 15522062.0, "step": 18 }, { "entropy": 0.5339813232421875, "epoch": 0.15079365079365079, "grad_norm": 3.7568291660582056, "learning_rate": 4.736842105263158e-06, "loss": 0.9846, "mean_token_accuracy": 0.7254857295192778, "num_tokens": 16388252.0, "step": 19 }, { "entropy": 0.539520263671875, "epoch": 0.15873015873015872, "grad_norm": 3.256772081860255, "learning_rate": 5e-06, "loss": 0.9572, "mean_token_accuracy": 0.7309625665657222, "num_tokens": 17235205.0, "step": 20 }, { "entropy": 0.5419464111328125, "epoch": 0.16666666666666666, "grad_norm": 2.7287809182707607, "learning_rate": 5.263157894736842e-06, "loss": 0.9449, "mean_token_accuracy": 0.734284377656877, "num_tokens": 18090069.0, "step": 21 }, { "entropy": 0.541290283203125, "epoch": 0.1746031746031746, "grad_norm": 2.661465266751511, "learning_rate": 5.526315789473685e-06, "loss": 0.9245, "mean_token_accuracy": 0.7380136135034263, "num_tokens": 18955962.0, "step": 22 }, { "entropy": 0.5420989990234375, "epoch": 0.18253968253968253, "grad_norm": 2.1871180143795454, "learning_rate": 5.789473684210527e-06, "loss": 0.8806, "mean_token_accuracy": 0.7492561861872673, "num_tokens": 19812261.0, "step": 23 }, { "entropy": 0.5357818603515625, "epoch": 0.19047619047619047, "grad_norm": 2.9467428670698266, "learning_rate": 6.0526315789473685e-06, "loss": 0.8643, "mean_token_accuracy": 0.7490303930826485, "num_tokens": 20647709.0, "step": 24 }, { "entropy": 0.52435302734375, "epoch": 0.1984126984126984, "grad_norm": 2.907660231508903, "learning_rate": 6.31578947368421e-06, "loss": 0.861, "mean_token_accuracy": 0.7502002012915909, "num_tokens": 21486371.0, "step": 25 }, { "entropy": 0.5368194580078125, "epoch": 0.20634920634920634, "grad_norm": 2.490760554791867, "learning_rate": 6.578947368421054e-06, "loss": 0.8441, "mean_token_accuracy": 0.7527890643104911, "num_tokens": 22318414.0, "step": 26 }, { "entropy": 0.5297393798828125, "epoch": 0.21428571428571427, "grad_norm": 2.1289542070292082, "learning_rate": 6.842105263157896e-06, "loss": 0.8129, "mean_token_accuracy": 0.7607404845766723, "num_tokens": 23134604.0, "step": 27 }, { "entropy": 0.51568603515625, "epoch": 0.2222222222222222, "grad_norm": 1.8284519133414685, "learning_rate": 7.1052631578947375e-06, "loss": 0.8081, "mean_token_accuracy": 0.7623137319460511, "num_tokens": 24009770.0, "step": 28 }, { "entropy": 0.52362060546875, "epoch": 0.23015873015873015, "grad_norm": 1.9471502088574602, "learning_rate": 7.368421052631579e-06, "loss": 0.8048, "mean_token_accuracy": 0.762959006242454, "num_tokens": 24869806.0, "step": 29 }, { "entropy": 0.512420654296875, "epoch": 0.23809523809523808, "grad_norm": 1.745367819572919, "learning_rate": 7.631578947368423e-06, "loss": 0.7896, "mean_token_accuracy": 0.7669625347480178, "num_tokens": 25762999.0, "step": 30 }, { "entropy": 0.5128631591796875, "epoch": 0.24603174603174602, "grad_norm": 2.0285955824692774, "learning_rate": 7.894736842105265e-06, "loss": 0.7605, "mean_token_accuracy": 0.77361392788589, "num_tokens": 26651412.0, "step": 31 }, { "entropy": 0.51251220703125, "epoch": 0.25396825396825395, "grad_norm": 1.8504240732599995, "learning_rate": 8.157894736842106e-06, "loss": 0.7568, "mean_token_accuracy": 0.7740332204848528, "num_tokens": 27520069.0, "step": 32 }, { "entropy": 0.5124359130859375, "epoch": 0.2619047619047619, "grad_norm": 1.5796846536434936, "learning_rate": 8.421052631578948e-06, "loss": 0.7509, "mean_token_accuracy": 0.7743185707367957, "num_tokens": 28367541.0, "step": 33 }, { "entropy": 0.506439208984375, "epoch": 0.2698412698412698, "grad_norm": 1.3378034052723207, "learning_rate": 8.68421052631579e-06, "loss": 0.7161, "mean_token_accuracy": 0.7827055719681084, "num_tokens": 29217570.0, "step": 34 }, { "entropy": 0.4996490478515625, "epoch": 0.2777777777777778, "grad_norm": 1.679208635550248, "learning_rate": 8.947368421052632e-06, "loss": 0.7187, "mean_token_accuracy": 0.7833654824644327, "num_tokens": 30088869.0, "step": 35 }, { "entropy": 0.4967041015625, "epoch": 0.2857142857142857, "grad_norm": 1.6387552610606482, "learning_rate": 9.210526315789474e-06, "loss": 0.7145, "mean_token_accuracy": 0.7830667225643992, "num_tokens": 30959821.0, "step": 36 }, { "entropy": 0.500274658203125, "epoch": 0.29365079365079366, "grad_norm": 1.371112117801989, "learning_rate": 9.473684210526315e-06, "loss": 0.7097, "mean_token_accuracy": 0.7837298880331218, "num_tokens": 31830767.0, "step": 37 }, { "entropy": 0.5020751953125, "epoch": 0.30158730158730157, "grad_norm": 1.6810348517072142, "learning_rate": 9.736842105263159e-06, "loss": 0.6954, "mean_token_accuracy": 0.7868552934378386, "num_tokens": 32692726.0, "step": 38 }, { "entropy": 0.5, "epoch": 0.30952380952380953, "grad_norm": 1.3054950683288973, "learning_rate": 1e-05, "loss": 0.669, "mean_token_accuracy": 0.7914999099448323, "num_tokens": 33543076.0, "step": 39 }, { "entropy": 0.4976806640625, "epoch": 0.31746031746031744, "grad_norm": 1.5523188226474438, "learning_rate": 9.99995213807381e-06, "loss": 0.677, "mean_token_accuracy": 0.7901066686026752, "num_tokens": 34404352.0, "step": 40 }, { "entropy": 0.5042724609375, "epoch": 0.3253968253968254, "grad_norm": 1.3329229488550147, "learning_rate": 9.99980855321154e-06, "loss": 0.6721, "mean_token_accuracy": 0.7911685910075903, "num_tokens": 35236933.0, "step": 41 }, { "entropy": 0.491058349609375, "epoch": 0.3333333333333333, "grad_norm": 1.2836381282552631, "learning_rate": 9.999569248162095e-06, "loss": 0.662, "mean_token_accuracy": 0.7938097110018134, "num_tokens": 36088050.0, "step": 42 }, { "entropy": 0.491973876953125, "epoch": 0.3412698412698413, "grad_norm": 1.5436048820238097, "learning_rate": 9.999234227506912e-06, "loss": 0.659, "mean_token_accuracy": 0.7951631429605186, "num_tokens": 36942407.0, "step": 43 }, { "entropy": 0.488128662109375, "epoch": 0.3492063492063492, "grad_norm": 1.5701418827048832, "learning_rate": 9.998803497659885e-06, "loss": 0.6567, "mean_token_accuracy": 0.7952465042471886, "num_tokens": 37803882.0, "step": 44 }, { "entropy": 0.4854278564453125, "epoch": 0.35714285714285715, "grad_norm": 1.453745925853672, "learning_rate": 9.998277066867236e-06, "loss": 0.6557, "mean_token_accuracy": 0.7963202544488013, "num_tokens": 38667329.0, "step": 45 }, { "entropy": 0.4832763671875, "epoch": 0.36507936507936506, "grad_norm": 1.4543393351762242, "learning_rate": 9.997654945207368e-06, "loss": 0.6415, "mean_token_accuracy": 0.7979013016447425, "num_tokens": 39524166.0, "step": 46 }, { "entropy": 0.47686767578125, "epoch": 0.373015873015873, "grad_norm": 1.2844075952499214, "learning_rate": 9.99693714459065e-06, "loss": 0.6314, "mean_token_accuracy": 0.8014799957163632, "num_tokens": 40389693.0, "step": 47 }, { "entropy": 0.485137939453125, "epoch": 0.38095238095238093, "grad_norm": 1.2055402617655024, "learning_rate": 9.996123678759214e-06, "loss": 0.6304, "mean_token_accuracy": 0.8023288743570447, "num_tokens": 41231841.0, "step": 48 }, { "entropy": 0.4888763427734375, "epoch": 0.3888888888888889, "grad_norm": 1.562622920851612, "learning_rate": 9.995214563286677e-06, "loss": 0.6315, "mean_token_accuracy": 0.8009699960239232, "num_tokens": 42082897.0, "step": 49 }, { "entropy": 0.4813079833984375, "epoch": 0.3968253968253968, "grad_norm": 1.1138521900837535, "learning_rate": 9.994209815577843e-06, "loss": 0.6138, "mean_token_accuracy": 0.8051781668327749, "num_tokens": 42941458.0, "step": 50 }, { "entropy": 0.4780731201171875, "epoch": 0.40476190476190477, "grad_norm": 1.4023404249187725, "learning_rate": 9.993109454868379e-06, "loss": 0.6123, "mean_token_accuracy": 0.805035431869328, "num_tokens": 43801380.0, "step": 51 }, { "entropy": 0.4847564697265625, "epoch": 0.4126984126984127, "grad_norm": 1.1362760383931592, "learning_rate": 9.991913502224438e-06, "loss": 0.6252, "mean_token_accuracy": 0.8026665896177292, "num_tokens": 44671335.0, "step": 52 }, { "entropy": 0.4954071044921875, "epoch": 0.42063492063492064, "grad_norm": 1.2459663164920876, "learning_rate": 9.990621980542258e-06, "loss": 0.6116, "mean_token_accuracy": 0.8040637490339577, "num_tokens": 45508166.0, "step": 53 }, { "entropy": 0.479949951171875, "epoch": 0.42857142857142855, "grad_norm": 1.3916959345611162, "learning_rate": 9.989234914547725e-06, "loss": 0.6209, "mean_token_accuracy": 0.803072199691087, "num_tokens": 46398974.0, "step": 54 }, { "entropy": 0.472991943359375, "epoch": 0.4365079365079365, "grad_norm": 1.2293838348325725, "learning_rate": 9.9877523307959e-06, "loss": 0.619, "mean_token_accuracy": 0.8017513235099614, "num_tokens": 47311746.0, "step": 55 }, { "entropy": 0.477813720703125, "epoch": 0.4444444444444444, "grad_norm": 1.3910584256010747, "learning_rate": 9.986174257670509e-06, "loss": 0.5928, "mean_token_accuracy": 0.8104538763873279, "num_tokens": 48177761.0, "step": 56 }, { "entropy": 0.4864501953125, "epoch": 0.4523809523809524, "grad_norm": 1.398508282793997, "learning_rate": 9.984500725383397e-06, "loss": 0.5977, "mean_token_accuracy": 0.8092257836833596, "num_tokens": 49026375.0, "step": 57 }, { "entropy": 0.4769744873046875, "epoch": 0.4603174603174603, "grad_norm": 1.5292432492685464, "learning_rate": 9.98273176597396e-06, "loss": 0.6023, "mean_token_accuracy": 0.8064105985686183, "num_tokens": 49889163.0, "step": 58 }, { "entropy": 0.484161376953125, "epoch": 0.46825396825396826, "grad_norm": 1.2391735146187919, "learning_rate": 9.980867413308516e-06, "loss": 0.5945, "mean_token_accuracy": 0.8099995800293982, "num_tokens": 50735361.0, "step": 59 }, { "entropy": 0.4808349609375, "epoch": 0.47619047619047616, "grad_norm": 1.1506873672230502, "learning_rate": 9.978907703079672e-06, "loss": 0.5839, "mean_token_accuracy": 0.8109453665092587, "num_tokens": 51597577.0, "step": 60 }, { "entropy": 0.4707794189453125, "epoch": 0.48412698412698413, "grad_norm": 1.1563025667287559, "learning_rate": 9.976852672805625e-06, "loss": 0.5933, "mean_token_accuracy": 0.8103347043506801, "num_tokens": 52482382.0, "step": 61 }, { "entropy": 0.47381591796875, "epoch": 0.49206349206349204, "grad_norm": 1.2114630802835358, "learning_rate": 9.974702361829465e-06, "loss": 0.587, "mean_token_accuracy": 0.8098243419080973, "num_tokens": 53370750.0, "step": 62 }, { "entropy": 0.4756927490234375, "epoch": 0.5, "grad_norm": 1.1306748677183436, "learning_rate": 9.972456811318399e-06, "loss": 0.5792, "mean_token_accuracy": 0.8143061874434352, "num_tokens": 54235189.0, "step": 63 }, { "entropy": 0.47760009765625, "epoch": 0.5079365079365079, "grad_norm": 1.1104384579683657, "learning_rate": 9.970116064262975e-06, "loss": 0.5711, "mean_token_accuracy": 0.814652734901756, "num_tokens": 55066936.0, "step": 64 }, { "entropy": 0.475128173828125, "epoch": 0.5158730158730159, "grad_norm": 1.130873737994674, "learning_rate": 9.96768016547626e-06, "loss": 0.5749, "mean_token_accuracy": 0.8144003404304385, "num_tokens": 55922405.0, "step": 65 }, { "entropy": 0.4756317138671875, "epoch": 0.5238095238095238, "grad_norm": 1.2543626844594944, "learning_rate": 9.965149161592973e-06, "loss": 0.5717, "mean_token_accuracy": 0.8129943162202835, "num_tokens": 56770275.0, "step": 66 }, { "entropy": 0.47332763671875, "epoch": 0.5317460317460317, "grad_norm": 1.164483305334561, "learning_rate": 9.962523101068608e-06, "loss": 0.5781, "mean_token_accuracy": 0.8122508767992258, "num_tokens": 57627870.0, "step": 67 }, { "entropy": 0.475616455078125, "epoch": 0.5396825396825397, "grad_norm": 1.0960487524124647, "learning_rate": 9.959802034178489e-06, "loss": 0.5661, "mean_token_accuracy": 0.817779887933284, "num_tokens": 58469884.0, "step": 68 }, { "entropy": 0.4727325439453125, "epoch": 0.5476190476190477, "grad_norm": 1.0760531755759473, "learning_rate": 9.956986013016816e-06, "loss": 0.5655, "mean_token_accuracy": 0.815218704752624, "num_tokens": 59323796.0, "step": 69 }, { "entropy": 0.472900390625, "epoch": 0.5555555555555556, "grad_norm": 1.150431448189494, "learning_rate": 9.954075091495669e-06, "loss": 0.5564, "mean_token_accuracy": 0.8179643992334604, "num_tokens": 60183841.0, "step": 70 }, { "entropy": 0.4703521728515625, "epoch": 0.5634920634920635, "grad_norm": 1.0833066927711814, "learning_rate": 9.951069325343972e-06, "loss": 0.5668, "mean_token_accuracy": 0.8150419541634619, "num_tokens": 61048601.0, "step": 71 }, { "entropy": 0.472991943359375, "epoch": 0.5714285714285714, "grad_norm": 1.1267753740162227, "learning_rate": 9.947968772106428e-06, "loss": 0.5683, "mean_token_accuracy": 0.8151015248149633, "num_tokens": 61887912.0, "step": 72 }, { "entropy": 0.468231201171875, "epoch": 0.5793650793650794, "grad_norm": 1.0041998917193997, "learning_rate": 9.944773491142416e-06, "loss": 0.561, "mean_token_accuracy": 0.8171937335282564, "num_tokens": 62741342.0, "step": 73 }, { "entropy": 0.464263916015625, "epoch": 0.5873015873015873, "grad_norm": 1.0641654456452165, "learning_rate": 9.94148354362486e-06, "loss": 0.5591, "mean_token_accuracy": 0.8203174020163715, "num_tokens": 63594617.0, "step": 74 }, { "entropy": 0.4626922607421875, "epoch": 0.5952380952380952, "grad_norm": 1.0150387789211928, "learning_rate": 9.938098992539045e-06, "loss": 0.5534, "mean_token_accuracy": 0.8201709003187716, "num_tokens": 64467695.0, "step": 75 }, { "entropy": 0.4632568359375, "epoch": 0.6031746031746031, "grad_norm": 1.1279871018458427, "learning_rate": 9.93461990268143e-06, "loss": 0.5589, "mean_token_accuracy": 0.8168505723588169, "num_tokens": 65333788.0, "step": 76 }, { "entropy": 0.4673309326171875, "epoch": 0.6111111111111112, "grad_norm": 1.1373063081267694, "learning_rate": 9.931046340658387e-06, "loss": 0.5494, "mean_token_accuracy": 0.8186001246795058, "num_tokens": 66178918.0, "step": 77 }, { "entropy": 0.4654693603515625, "epoch": 0.6190476190476191, "grad_norm": 0.987490904917509, "learning_rate": 9.927378374884947e-06, "loss": 0.5617, "mean_token_accuracy": 0.8173182448372245, "num_tokens": 67052342.0, "step": 78 }, { "entropy": 0.4614715576171875, "epoch": 0.626984126984127, "grad_norm": 1.158480909536903, "learning_rate": 9.923616075583465e-06, "loss": 0.5521, "mean_token_accuracy": 0.8191495719365776, "num_tokens": 67913391.0, "step": 79 }, { "entropy": 0.46258544921875, "epoch": 0.6349206349206349, "grad_norm": 1.0694507889690312, "learning_rate": 9.919759514782304e-06, "loss": 0.5518, "mean_token_accuracy": 0.8191684056073427, "num_tokens": 68772115.0, "step": 80 }, { "entropy": 0.4591064453125, "epoch": 0.6428571428571429, "grad_norm": 1.031285333594921, "learning_rate": 9.91580876631443e-06, "loss": 0.5395, "mean_token_accuracy": 0.8231914453208447, "num_tokens": 69611653.0, "step": 81 }, { "entropy": 0.4618072509765625, "epoch": 0.6507936507936508, "grad_norm": 1.2191568231885717, "learning_rate": 9.91176390581602e-06, "loss": 0.5609, "mean_token_accuracy": 0.8178833266720176, "num_tokens": 70496952.0, "step": 82 }, { "entropy": 0.4582672119140625, "epoch": 0.6587301587301587, "grad_norm": 1.098798237502869, "learning_rate": 9.907625010724999e-06, "loss": 0.5426, "mean_token_accuracy": 0.8210734003223479, "num_tokens": 71343921.0, "step": 83 }, { "entropy": 0.45758056640625, "epoch": 0.6666666666666666, "grad_norm": 1.0293307451137137, "learning_rate": 9.903392160279564e-06, "loss": 0.5547, "mean_token_accuracy": 0.8177660717628896, "num_tokens": 72240608.0, "step": 84 }, { "entropy": 0.4568939208984375, "epoch": 0.6746031746031746, "grad_norm": 1.1591213276198025, "learning_rate": 9.899065435516661e-06, "loss": 0.5419, "mean_token_accuracy": 0.8213843265548348, "num_tokens": 73118452.0, "step": 85 }, { "entropy": 0.45391845703125, "epoch": 0.6825396825396826, "grad_norm": 1.0203407509108033, "learning_rate": 9.894644919270448e-06, "loss": 0.5482, "mean_token_accuracy": 0.8173369145952165, "num_tokens": 73991069.0, "step": 86 }, { "entropy": 0.4616546630859375, "epoch": 0.6904761904761905, "grad_norm": 1.2761061587844562, "learning_rate": 9.890130696170691e-06, "loss": 0.5398, "mean_token_accuracy": 0.8226454192772508, "num_tokens": 74839901.0, "step": 87 }, { "entropy": 0.4576416015625, "epoch": 0.6984126984126984, "grad_norm": 1.0577284033847283, "learning_rate": 9.885522852641156e-06, "loss": 0.5527, "mean_token_accuracy": 0.8187054474838078, "num_tokens": 75749725.0, "step": 88 }, { "entropy": 0.465087890625, "epoch": 0.7063492063492064, "grad_norm": 1.1084468078084884, "learning_rate": 9.880821476897948e-06, "loss": 0.5456, "mean_token_accuracy": 0.8205522131174803, "num_tokens": 76593690.0, "step": 89 }, { "entropy": 0.466217041015625, "epoch": 0.7142857142857143, "grad_norm": 1.1432698793337261, "learning_rate": 9.87602665894783e-06, "loss": 0.5352, "mean_token_accuracy": 0.8226723484694958, "num_tokens": 77431030.0, "step": 90 }, { "entropy": 0.4627227783203125, "epoch": 0.7222222222222222, "grad_norm": 1.2237349934344866, "learning_rate": 9.871138490586489e-06, "loss": 0.54, "mean_token_accuracy": 0.8220892632380128, "num_tokens": 78278605.0, "step": 91 }, { "entropy": 0.457794189453125, "epoch": 0.7301587301587301, "grad_norm": 0.9735987604661573, "learning_rate": 9.866157065396784e-06, "loss": 0.5336, "mean_token_accuracy": 0.8227689885534346, "num_tokens": 79154216.0, "step": 92 }, { "entropy": 0.4553680419921875, "epoch": 0.7380952380952381, "grad_norm": 1.2060097781830723, "learning_rate": 9.861082478746962e-06, "loss": 0.5453, "mean_token_accuracy": 0.8193097808398306, "num_tokens": 80039204.0, "step": 93 }, { "entropy": 0.452606201171875, "epoch": 0.746031746031746, "grad_norm": 1.219116442820296, "learning_rate": 9.855914827788814e-06, "loss": 0.537, "mean_token_accuracy": 0.8233561674132943, "num_tokens": 80910348.0, "step": 94 }, { "entropy": 0.454315185546875, "epoch": 0.753968253968254, "grad_norm": 1.1176715433890458, "learning_rate": 9.850654211455837e-06, "loss": 0.5371, "mean_token_accuracy": 0.8221336985006928, "num_tokens": 81765325.0, "step": 95 }, { "entropy": 0.4631195068359375, "epoch": 0.7619047619047619, "grad_norm": 0.9620936989799753, "learning_rate": 9.84530073046132e-06, "loss": 0.5288, "mean_token_accuracy": 0.8240236868150532, "num_tokens": 82611125.0, "step": 96 }, { "entropy": 0.454254150390625, "epoch": 0.7698412698412699, "grad_norm": 1.0990268102179221, "learning_rate": 9.83985448729643e-06, "loss": 0.5376, "mean_token_accuracy": 0.8205642709508538, "num_tokens": 83488021.0, "step": 97 }, { "entropy": 0.45806884765625, "epoch": 0.7777777777777778, "grad_norm": 1.1036853861455271, "learning_rate": 9.83431558622824e-06, "loss": 0.533, "mean_token_accuracy": 0.8217868432402611, "num_tokens": 84321775.0, "step": 98 }, { "entropy": 0.4533843994140625, "epoch": 0.7857142857142857, "grad_norm": 1.1621319158175323, "learning_rate": 9.828684133297738e-06, "loss": 0.5223, "mean_token_accuracy": 0.8267802041955292, "num_tokens": 85167087.0, "step": 99 }, { "entropy": 0.453704833984375, "epoch": 0.7936507936507936, "grad_norm": 1.01710948689366, "learning_rate": 9.822960236317804e-06, "loss": 0.5324, "mean_token_accuracy": 0.824018832296133, "num_tokens": 86009383.0, "step": 100 }, { "entropy": 0.4491729736328125, "epoch": 0.8015873015873016, "grad_norm": 1.136196189273348, "learning_rate": 9.817144004871127e-06, "loss": 0.5249, "mean_token_accuracy": 0.8261763895861804, "num_tokens": 86862628.0, "step": 101 }, { "entropy": 0.447357177734375, "epoch": 0.8095238095238095, "grad_norm": 1.1653574778770674, "learning_rate": 9.811235550308127e-06, "loss": 0.5298, "mean_token_accuracy": 0.8248972818255424, "num_tokens": 87713395.0, "step": 102 }, { "entropy": 0.4496612548828125, "epoch": 0.8174603174603174, "grad_norm": 1.0197520810298888, "learning_rate": 9.805234985744804e-06, "loss": 0.5374, "mean_token_accuracy": 0.8215886438265443, "num_tokens": 88602545.0, "step": 103 }, { "entropy": 0.45355224609375, "epoch": 0.8253968253968254, "grad_norm": 1.0746283719190985, "learning_rate": 9.799142426060595e-06, "loss": 0.5211, "mean_token_accuracy": 0.8281928705982864, "num_tokens": 89444255.0, "step": 104 }, { "entropy": 0.4513702392578125, "epoch": 0.8333333333333334, "grad_norm": 0.9405502354816935, "learning_rate": 9.792957987896154e-06, "loss": 0.5164, "mean_token_accuracy": 0.8303409847430885, "num_tokens": 90312774.0, "step": 105 }, { "entropy": 0.4513397216796875, "epoch": 0.8412698412698413, "grad_norm": 0.9828002484074289, "learning_rate": 9.786681789651134e-06, "loss": 0.5167, "mean_token_accuracy": 0.8268536222167313, "num_tokens": 91177447.0, "step": 106 }, { "entropy": 0.452178955078125, "epoch": 0.8492063492063492, "grad_norm": 1.0029391226175481, "learning_rate": 9.780313951481904e-06, "loss": 0.5159, "mean_token_accuracy": 0.8277767463587224, "num_tokens": 92026164.0, "step": 107 }, { "entropy": 0.448455810546875, "epoch": 0.8571428571428571, "grad_norm": 1.0111683546148944, "learning_rate": 9.773854595299269e-06, "loss": 0.5158, "mean_token_accuracy": 0.8292862558737397, "num_tokens": 92891631.0, "step": 108 }, { "entropy": 0.4544219970703125, "epoch": 0.8650793650793651, "grad_norm": 1.0693593999923847, "learning_rate": 9.767303844766118e-06, "loss": 0.5228, "mean_token_accuracy": 0.8271551127545536, "num_tokens": 93760535.0, "step": 109 }, { "entropy": 0.4507293701171875, "epoch": 0.873015873015873, "grad_norm": 1.078077119226931, "learning_rate": 9.760661825295068e-06, "loss": 0.5188, "mean_token_accuracy": 0.8268710542470217, "num_tokens": 94599260.0, "step": 110 }, { "entropy": 0.45849609375, "epoch": 0.8809523809523809, "grad_norm": 1.1500541245910931, "learning_rate": 9.753928664046055e-06, "loss": 0.5188, "mean_token_accuracy": 0.8258566916920245, "num_tokens": 95425799.0, "step": 111 }, { "entropy": 0.455718994140625, "epoch": 0.8888888888888888, "grad_norm": 0.9411323026364743, "learning_rate": 9.747104489923907e-06, "loss": 0.5223, "mean_token_accuracy": 0.8278644122183323, "num_tokens": 96260271.0, "step": 112 }, { "entropy": 0.44830322265625, "epoch": 0.8968253968253969, "grad_norm": 1.0496976356343377, "learning_rate": 9.740189433575873e-06, "loss": 0.5209, "mean_token_accuracy": 0.8264745082706213, "num_tokens": 97135925.0, "step": 113 }, { "entropy": 0.4494781494140625, "epoch": 0.9047619047619048, "grad_norm": 1.1407666858849659, "learning_rate": 9.733183627389117e-06, "loss": 0.5223, "mean_token_accuracy": 0.8253828585147858, "num_tokens": 98011108.0, "step": 114 }, { "entropy": 0.4512481689453125, "epoch": 0.9126984126984127, "grad_norm": 1.042415494174418, "learning_rate": 9.726087205488192e-06, "loss": 0.5122, "mean_token_accuracy": 0.827279772143811, "num_tokens": 98873029.0, "step": 115 }, { "entropy": 0.4514007568359375, "epoch": 0.9206349206349206, "grad_norm": 1.2092755396739658, "learning_rate": 9.718900303732465e-06, "loss": 0.5205, "mean_token_accuracy": 0.8262831498868763, "num_tokens": 99734864.0, "step": 116 }, { "entropy": 0.4513702392578125, "epoch": 0.9285714285714286, "grad_norm": 1.1637681555356034, "learning_rate": 9.711623059713522e-06, "loss": 0.525, "mean_token_accuracy": 0.826652648858726, "num_tokens": 100606926.0, "step": 117 }, { "entropy": 0.45599365234375, "epoch": 0.9365079365079365, "grad_norm": 1.0209942008459238, "learning_rate": 9.70425561275253e-06, "loss": 0.5082, "mean_token_accuracy": 0.8288828083314002, "num_tokens": 101447599.0, "step": 118 }, { "entropy": 0.4554595947265625, "epoch": 0.9444444444444444, "grad_norm": 1.0671983733664188, "learning_rate": 9.696798103897567e-06, "loss": 0.5105, "mean_token_accuracy": 0.8294458598829806, "num_tokens": 102275358.0, "step": 119 }, { "entropy": 0.459686279296875, "epoch": 0.9523809523809523, "grad_norm": 1.043670325725254, "learning_rate": 9.689250675920932e-06, "loss": 0.5091, "mean_token_accuracy": 0.8285032915882766, "num_tokens": 103113293.0, "step": 120 }, { "entropy": 0.4480743408203125, "epoch": 0.9603174603174603, "grad_norm": 0.9316811204172604, "learning_rate": 9.6816134733164e-06, "loss": 0.5139, "mean_token_accuracy": 0.8276259712874889, "num_tokens": 104000550.0, "step": 121 }, { "entropy": 0.448577880859375, "epoch": 0.9682539682539683, "grad_norm": 1.0919546140808916, "learning_rate": 9.67388664229646e-06, "loss": 0.5134, "mean_token_accuracy": 0.8282599356025457, "num_tokens": 104859286.0, "step": 122 }, { "entropy": 0.448150634765625, "epoch": 0.9761904761904762, "grad_norm": 0.999620140999962, "learning_rate": 9.66607033078952e-06, "loss": 0.5131, "mean_token_accuracy": 0.827931288164109, "num_tokens": 105729675.0, "step": 123 }, { "entropy": 0.4438629150390625, "epoch": 0.9841269841269841, "grad_norm": 0.9685899298092844, "learning_rate": 9.658164688437073e-06, "loss": 0.5098, "mean_token_accuracy": 0.8288385523483157, "num_tokens": 106603968.0, "step": 124 }, { "entropy": 0.44073486328125, "epoch": 0.9920634920634921, "grad_norm": 0.9358281617577068, "learning_rate": 9.65016986659082e-06, "loss": 0.5053, "mean_token_accuracy": 0.8307707700878382, "num_tokens": 107495007.0, "step": 125 }, { "entropy": 0.4453887939453125, "epoch": 1.0, "grad_norm": 1.0968390964445407, "learning_rate": 9.642086018309798e-06, "loss": 0.5189, "mean_token_accuracy": 0.8269711588509381, "num_tokens": 108364335.0, "step": 126 }, { "entropy": 0.4580535888671875, "epoch": 1.007936507936508, "grad_norm": 1.0332257753415273, "learning_rate": 9.63391329835742e-06, "loss": 0.4924, "mean_token_accuracy": 0.8341724565252662, "num_tokens": 109202665.0, "step": 127 }, { "entropy": 0.443359375, "epoch": 1.0158730158730158, "grad_norm": 0.9791939186386783, "learning_rate": 9.625651863198538e-06, "loss": 0.4937, "mean_token_accuracy": 0.8334903731010854, "num_tokens": 110061605.0, "step": 128 }, { "entropy": 0.4462127685546875, "epoch": 1.0238095238095237, "grad_norm": 0.9767524502632213, "learning_rate": 9.617301870996432e-06, "loss": 0.4907, "mean_token_accuracy": 0.8331266730092466, "num_tokens": 110924491.0, "step": 129 }, { "entropy": 0.4455108642578125, "epoch": 1.0317460317460316, "grad_norm": 1.0445136711209406, "learning_rate": 9.608863481609784e-06, "loss": 0.4924, "mean_token_accuracy": 0.8337863022461534, "num_tokens": 111773832.0, "step": 130 }, { "entropy": 0.4438018798828125, "epoch": 1.0396825396825398, "grad_norm": 0.9414725400905224, "learning_rate": 9.600336856589622e-06, "loss": 0.4856, "mean_token_accuracy": 0.835617205593735, "num_tokens": 112637470.0, "step": 131 }, { "entropy": 0.44439697265625, "epoch": 1.0476190476190477, "grad_norm": 0.9341197433359306, "learning_rate": 9.591722159176229e-06, "loss": 0.4942, "mean_token_accuracy": 0.8329057167284191, "num_tokens": 113501590.0, "step": 132 }, { "entropy": 0.4480133056640625, "epoch": 1.0555555555555556, "grad_norm": 0.9687991138519816, "learning_rate": 9.583019554296004e-06, "loss": 0.4921, "mean_token_accuracy": 0.8349612141028047, "num_tokens": 114360533.0, "step": 133 }, { "entropy": 0.44232177734375, "epoch": 1.0634920634920635, "grad_norm": 0.9629260310473902, "learning_rate": 9.574229208558322e-06, "loss": 0.4899, "mean_token_accuracy": 0.8340120441280305, "num_tokens": 115231953.0, "step": 134 }, { "entropy": 0.443511962890625, "epoch": 1.0714285714285714, "grad_norm": 0.904231182313968, "learning_rate": 9.565351290252339e-06, "loss": 0.4865, "mean_token_accuracy": 0.8350210129283369, "num_tokens": 116092221.0, "step": 135 }, { "entropy": 0.443817138671875, "epoch": 1.0793650793650793, "grad_norm": 1.0016105467556813, "learning_rate": 9.556385969343756e-06, "loss": 0.493, "mean_token_accuracy": 0.8340729284100235, "num_tokens": 116984739.0, "step": 136 }, { "entropy": 0.441619873046875, "epoch": 1.0873015873015872, "grad_norm": 1.0079401042157452, "learning_rate": 9.547333417471589e-06, "loss": 0.4921, "mean_token_accuracy": 0.8338921638205647, "num_tokens": 117852991.0, "step": 137 }, { "entropy": 0.449127197265625, "epoch": 1.0952380952380953, "grad_norm": 1.0727711874297776, "learning_rate": 9.538193807944864e-06, "loss": 0.49, "mean_token_accuracy": 0.8347254949621856, "num_tokens": 118696927.0, "step": 138 }, { "entropy": 0.4441070556640625, "epoch": 1.1031746031746033, "grad_norm": 1.1054420578884305, "learning_rate": 9.528967315739308e-06, "loss": 0.4899, "mean_token_accuracy": 0.8341114274226129, "num_tokens": 119569613.0, "step": 139 }, { "entropy": 0.4412689208984375, "epoch": 1.1111111111111112, "grad_norm": 1.0649123433307148, "learning_rate": 9.519654117493996e-06, "loss": 0.4942, "mean_token_accuracy": 0.8335490431636572, "num_tokens": 120447089.0, "step": 140 }, { "entropy": 0.441131591796875, "epoch": 1.119047619047619, "grad_norm": 0.9658018391507793, "learning_rate": 9.510254391507971e-06, "loss": 0.4839, "mean_token_accuracy": 0.8368041082285345, "num_tokens": 121314886.0, "step": 141 }, { "entropy": 0.44622802734375, "epoch": 1.126984126984127, "grad_norm": 0.9540717616355562, "learning_rate": 9.500768317736832e-06, "loss": 0.4797, "mean_token_accuracy": 0.837146339006722, "num_tokens": 122167617.0, "step": 142 }, { "entropy": 0.44293212890625, "epoch": 1.1349206349206349, "grad_norm": 1.0354981305656152, "learning_rate": 9.49119607778928e-06, "loss": 0.4849, "mean_token_accuracy": 0.8341447049751878, "num_tokens": 123013840.0, "step": 143 }, { "entropy": 0.4395904541015625, "epoch": 1.1428571428571428, "grad_norm": 0.9923916024668512, "learning_rate": 9.481537854923654e-06, "loss": 0.477, "mean_token_accuracy": 0.8377352114766836, "num_tokens": 123876490.0, "step": 144 }, { "entropy": 0.4472503662109375, "epoch": 1.1507936507936507, "grad_norm": 1.129639029346026, "learning_rate": 9.471793834044416e-06, "loss": 0.4853, "mean_token_accuracy": 0.8350173779763281, "num_tokens": 124713314.0, "step": 145 }, { "entropy": 0.44598388671875, "epoch": 1.1587301587301586, "grad_norm": 1.186594461732162, "learning_rate": 9.461964201698604e-06, "loss": 0.4939, "mean_token_accuracy": 0.8313662535510957, "num_tokens": 125564746.0, "step": 146 }, { "entropy": 0.43914794921875, "epoch": 1.1666666666666667, "grad_norm": 0.9201175898857195, "learning_rate": 9.452049146072278e-06, "loss": 0.4803, "mean_token_accuracy": 0.8369712042622268, "num_tokens": 126415997.0, "step": 147 }, { "entropy": 0.445343017578125, "epoch": 1.1746031746031746, "grad_norm": 0.9688073354920573, "learning_rate": 9.442048856986899e-06, "loss": 0.4914, "mean_token_accuracy": 0.8343577086925507, "num_tokens": 127285760.0, "step": 148 }, { "entropy": 0.4484710693359375, "epoch": 1.1825396825396826, "grad_norm": 1.0757588730703025, "learning_rate": 9.431963525895709e-06, "loss": 0.4946, "mean_token_accuracy": 0.8332444536499679, "num_tokens": 128153685.0, "step": 149 }, { "entropy": 0.45037841796875, "epoch": 1.1904761904761905, "grad_norm": 1.0256311878794404, "learning_rate": 9.421793345880055e-06, "loss": 0.4789, "mean_token_accuracy": 0.8380363639444113, "num_tokens": 128990738.0, "step": 150 }, { "entropy": 0.44927978515625, "epoch": 1.1984126984126984, "grad_norm": 1.0956116916076417, "learning_rate": 9.4115385116457e-06, "loss": 0.4919, "mean_token_accuracy": 0.8342617130838335, "num_tokens": 129848900.0, "step": 151 }, { "entropy": 0.448333740234375, "epoch": 1.2063492063492063, "grad_norm": 1.0166065436311602, "learning_rate": 9.401199219519088e-06, "loss": 0.4878, "mean_token_accuracy": 0.8344792602583766, "num_tokens": 130724709.0, "step": 152 }, { "entropy": 0.44781494140625, "epoch": 1.2142857142857142, "grad_norm": 0.9085771039074888, "learning_rate": 9.390775667443602e-06, "loss": 0.4761, "mean_token_accuracy": 0.8378241760656238, "num_tokens": 131582811.0, "step": 153 }, { "entropy": 0.4468536376953125, "epoch": 1.2222222222222223, "grad_norm": 1.0868732902444567, "learning_rate": 9.380268054975745e-06, "loss": 0.4835, "mean_token_accuracy": 0.8363062706775963, "num_tokens": 132429743.0, "step": 154 }, { "entropy": 0.4462738037109375, "epoch": 1.2301587301587302, "grad_norm": 1.0628171497183283, "learning_rate": 9.36967658328135e-06, "loss": 0.4854, "mean_token_accuracy": 0.8348783804103732, "num_tokens": 133291943.0, "step": 155 }, { "entropy": 0.44970703125, "epoch": 1.2380952380952381, "grad_norm": 0.9804992859024957, "learning_rate": 9.359001455131713e-06, "loss": 0.4815, "mean_token_accuracy": 0.8365443642251194, "num_tokens": 134149814.0, "step": 156 }, { "entropy": 0.44989013671875, "epoch": 1.246031746031746, "grad_norm": 1.1446452708449568, "learning_rate": 9.34824287489971e-06, "loss": 0.4728, "mean_token_accuracy": 0.839170094113797, "num_tokens": 134989406.0, "step": 157 }, { "entropy": 0.4441986083984375, "epoch": 1.253968253968254, "grad_norm": 1.003349637480706, "learning_rate": 9.337401048555892e-06, "loss": 0.4688, "mean_token_accuracy": 0.8404755499213934, "num_tokens": 135832642.0, "step": 158 }, { "entropy": 0.4422149658203125, "epoch": 1.2619047619047619, "grad_norm": 1.0839125563857472, "learning_rate": 9.326476183664535e-06, "loss": 0.4797, "mean_token_accuracy": 0.837718007620424, "num_tokens": 136724748.0, "step": 159 }, { "entropy": 0.449066162109375, "epoch": 1.2698412698412698, "grad_norm": 1.0019921797525027, "learning_rate": 9.315468489379668e-06, "loss": 0.4788, "mean_token_accuracy": 0.8362822770141065, "num_tokens": 137570382.0, "step": 160 }, { "entropy": 0.4457855224609375, "epoch": 1.2777777777777777, "grad_norm": 0.9885822793795929, "learning_rate": 9.304378176441076e-06, "loss": 0.4779, "mean_token_accuracy": 0.8382827825844288, "num_tokens": 138431194.0, "step": 161 }, { "entropy": 0.4445953369140625, "epoch": 1.2857142857142856, "grad_norm": 1.040992724515245, "learning_rate": 9.29320545717025e-06, "loss": 0.4673, "mean_token_accuracy": 0.840416397433728, "num_tokens": 139287890.0, "step": 162 }, { "entropy": 0.44342041015625, "epoch": 1.2936507936507937, "grad_norm": 0.9829545106372576, "learning_rate": 9.281950545466336e-06, "loss": 0.4814, "mean_token_accuracy": 0.8361613317392766, "num_tokens": 140160179.0, "step": 163 }, { "entropy": 0.44000244140625, "epoch": 1.3015873015873016, "grad_norm": 1.0385970457713267, "learning_rate": 9.27061365680204e-06, "loss": 0.4803, "mean_token_accuracy": 0.8369025052525103, "num_tokens": 141002875.0, "step": 164 }, { "entropy": 0.4385528564453125, "epoch": 1.3095238095238095, "grad_norm": 0.9812301520220873, "learning_rate": 9.25919500821949e-06, "loss": 0.471, "mean_token_accuracy": 0.838864213321358, "num_tokens": 141858286.0, "step": 165 }, { "entropy": 0.4442901611328125, "epoch": 1.3174603174603174, "grad_norm": 1.0236518502998646, "learning_rate": 9.247694818326092e-06, "loss": 0.4711, "mean_token_accuracy": 0.8398910835385323, "num_tokens": 142698339.0, "step": 166 }, { "entropy": 0.4426116943359375, "epoch": 1.3253968253968254, "grad_norm": 1.1382281143976174, "learning_rate": 9.236113307290345e-06, "loss": 0.4742, "mean_token_accuracy": 0.837109467945993, "num_tokens": 143561112.0, "step": 167 }, { "entropy": 0.442718505859375, "epoch": 1.3333333333333333, "grad_norm": 0.9746191934401784, "learning_rate": 9.224450696837617e-06, "loss": 0.4756, "mean_token_accuracy": 0.8384752809070051, "num_tokens": 144390223.0, "step": 168 }, { "entropy": 0.4402008056640625, "epoch": 1.3412698412698414, "grad_norm": 0.9749642677850219, "learning_rate": 9.212707210245908e-06, "loss": 0.4881, "mean_token_accuracy": 0.8348734346218407, "num_tokens": 145276276.0, "step": 169 }, { "entropy": 0.4441070556640625, "epoch": 1.3492063492063493, "grad_norm": 1.0438747532350088, "learning_rate": 9.200883072341573e-06, "loss": 0.4761, "mean_token_accuracy": 0.8384365830570459, "num_tokens": 146148957.0, "step": 170 }, { "entropy": 0.4490203857421875, "epoch": 1.3571428571428572, "grad_norm": 0.9016207370694161, "learning_rate": 9.188978509495022e-06, "loss": 0.475, "mean_token_accuracy": 0.8379638059996068, "num_tokens": 146999892.0, "step": 171 }, { "entropy": 0.4430694580078125, "epoch": 1.3650793650793651, "grad_norm": 0.8815407085280926, "learning_rate": 9.176993749616374e-06, "loss": 0.4768, "mean_token_accuracy": 0.8367542624473572, "num_tokens": 147888947.0, "step": 172 }, { "entropy": 0.451995849609375, "epoch": 1.373015873015873, "grad_norm": 0.9246528792063293, "learning_rate": 9.164929022151106e-06, "loss": 0.4871, "mean_token_accuracy": 0.8344444935210049, "num_tokens": 148771994.0, "step": 173 }, { "entropy": 0.444549560546875, "epoch": 1.380952380952381, "grad_norm": 0.9018527258286749, "learning_rate": 9.15278455807566e-06, "loss": 0.4715, "mean_token_accuracy": 0.8390994230285287, "num_tokens": 149626353.0, "step": 174 }, { "entropy": 0.4414825439453125, "epoch": 1.3888888888888888, "grad_norm": 0.8544852116993605, "learning_rate": 9.140560589893012e-06, "loss": 0.4697, "mean_token_accuracy": 0.8393202098086476, "num_tokens": 150484433.0, "step": 175 }, { "entropy": 0.4454803466796875, "epoch": 1.3968253968253967, "grad_norm": 1.0071897404357584, "learning_rate": 9.128257351628224e-06, "loss": 0.473, "mean_token_accuracy": 0.8398340521380305, "num_tokens": 151351171.0, "step": 176 }, { "entropy": 0.440704345703125, "epoch": 1.4047619047619047, "grad_norm": 1.0906938840190845, "learning_rate": 9.115875078823975e-06, "loss": 0.4829, "mean_token_accuracy": 0.8349933759309351, "num_tokens": 152198704.0, "step": 177 }, { "entropy": 0.4438629150390625, "epoch": 1.4126984126984126, "grad_norm": 0.9630152565863848, "learning_rate": 9.103414008536029e-06, "loss": 0.4762, "mean_token_accuracy": 0.8377989139407873, "num_tokens": 153027562.0, "step": 178 }, { "entropy": 0.4464569091796875, "epoch": 1.4206349206349207, "grad_norm": 0.968332662888831, "learning_rate": 9.09087437932872e-06, "loss": 0.47, "mean_token_accuracy": 0.8383941231295466, "num_tokens": 153863890.0, "step": 179 }, { "entropy": 0.4459228515625, "epoch": 1.4285714285714286, "grad_norm": 0.8682147767187481, "learning_rate": 9.07825643127037e-06, "loss": 0.477, "mean_token_accuracy": 0.8366480157710612, "num_tokens": 154707913.0, "step": 180 }, { "entropy": 0.4376068115234375, "epoch": 1.4365079365079365, "grad_norm": 0.897993109890026, "learning_rate": 9.065560405928699e-06, "loss": 0.4756, "mean_token_accuracy": 0.8380651730112731, "num_tokens": 155590050.0, "step": 181 }, { "entropy": 0.4394073486328125, "epoch": 1.4444444444444444, "grad_norm": 0.9388047872340883, "learning_rate": 9.0527865463662e-06, "loss": 0.4709, "mean_token_accuracy": 0.8392384983599186, "num_tokens": 156449879.0, "step": 182 }, { "entropy": 0.440673828125, "epoch": 1.4523809523809523, "grad_norm": 0.8193746423443552, "learning_rate": 9.039935097135479e-06, "loss": 0.4584, "mean_token_accuracy": 0.8437643311917782, "num_tokens": 157304143.0, "step": 183 }, { "entropy": 0.43817138671875, "epoch": 1.4603174603174602, "grad_norm": 0.9374511059068703, "learning_rate": 9.027006304274584e-06, "loss": 0.4748, "mean_token_accuracy": 0.8367259805090725, "num_tokens": 158177988.0, "step": 184 }, { "entropy": 0.4360198974609375, "epoch": 1.4682539682539684, "grad_norm": 0.8212121210922411, "learning_rate": 9.014000415302286e-06, "loss": 0.4783, "mean_token_accuracy": 0.8371384800411761, "num_tokens": 159060066.0, "step": 185 }, { "entropy": 0.441680908203125, "epoch": 1.4761904761904763, "grad_norm": 0.8806995545843207, "learning_rate": 9.000917679213344e-06, "loss": 0.474, "mean_token_accuracy": 0.8378347246907651, "num_tokens": 159942986.0, "step": 186 }, { "entropy": 0.4442596435546875, "epoch": 1.4841269841269842, "grad_norm": 0.8463270672966281, "learning_rate": 8.987758346473739e-06, "loss": 0.4649, "mean_token_accuracy": 0.8411796907894313, "num_tokens": 160782816.0, "step": 187 }, { "entropy": 0.4425201416015625, "epoch": 1.492063492063492, "grad_norm": 0.839482572154392, "learning_rate": 8.974522669015872e-06, "loss": 0.4672, "mean_token_accuracy": 0.8414199482649565, "num_tokens": 161643512.0, "step": 188 }, { "entropy": 0.436920166015625, "epoch": 1.5, "grad_norm": 0.9422232012137579, "learning_rate": 8.961210900233757e-06, "loss": 0.4593, "mean_token_accuracy": 0.8416055347770452, "num_tokens": 162503001.0, "step": 189 }, { "entropy": 0.4347076416015625, "epoch": 1.507936507936508, "grad_norm": 0.9312174445116989, "learning_rate": 8.947823294978147e-06, "loss": 0.4741, "mean_token_accuracy": 0.8390083778649569, "num_tokens": 163388010.0, "step": 190 }, { "entropy": 0.4320220947265625, "epoch": 1.5158730158730158, "grad_norm": 0.8634591037958418, "learning_rate": 8.934360109551671e-06, "loss": 0.4694, "mean_token_accuracy": 0.8393782819621265, "num_tokens": 164270399.0, "step": 191 }, { "entropy": 0.4329681396484375, "epoch": 1.5238095238095237, "grad_norm": 1.0268418396952028, "learning_rate": 8.920821601703927e-06, "loss": 0.4657, "mean_token_accuracy": 0.8410811661742628, "num_tokens": 165155523.0, "step": 192 }, { "entropy": 0.4373931884765625, "epoch": 1.5317460317460316, "grad_norm": 0.911296138695116, "learning_rate": 8.907208030626538e-06, "loss": 0.4647, "mean_token_accuracy": 0.8417128617875278, "num_tokens": 166004219.0, "step": 193 }, { "entropy": 0.436920166015625, "epoch": 1.5396825396825395, "grad_norm": 0.8615585964723216, "learning_rate": 8.8935196569482e-06, "loss": 0.4659, "mean_token_accuracy": 0.841303990688175, "num_tokens": 166887486.0, "step": 194 }, { "entropy": 0.4330291748046875, "epoch": 1.5476190476190477, "grad_norm": 0.9022520563994237, "learning_rate": 8.879756742729683e-06, "loss": 0.4642, "mean_token_accuracy": 0.842128555290401, "num_tokens": 167743608.0, "step": 195 }, { "entropy": 0.4404449462890625, "epoch": 1.5555555555555556, "grad_norm": 0.8427585056849268, "learning_rate": 8.865919551458823e-06, "loss": 0.4638, "mean_token_accuracy": 0.8412150857038796, "num_tokens": 168602032.0, "step": 196 }, { "entropy": 0.43634033203125, "epoch": 1.5634920634920635, "grad_norm": 0.9473332832713499, "learning_rate": 8.852008348045468e-06, "loss": 0.4713, "mean_token_accuracy": 0.8384702135808766, "num_tokens": 169469975.0, "step": 197 }, { "entropy": 0.4295654296875, "epoch": 1.5714285714285714, "grad_norm": 0.8265269594435529, "learning_rate": 8.838023398816417e-06, "loss": 0.471, "mean_token_accuracy": 0.8378414455801249, "num_tokens": 170343282.0, "step": 198 }, { "entropy": 0.4375, "epoch": 1.5793650793650795, "grad_norm": 0.9350879056767756, "learning_rate": 8.823964971510313e-06, "loss": 0.4701, "mean_token_accuracy": 0.8392301532439888, "num_tokens": 171227432.0, "step": 199 }, { "entropy": 0.4355621337890625, "epoch": 1.5873015873015874, "grad_norm": 0.8262956094897539, "learning_rate": 8.809833335272517e-06, "loss": 0.4531, "mean_token_accuracy": 0.8436351302079856, "num_tokens": 172096305.0, "step": 200 }, { "entropy": 0.4359893798828125, "epoch": 1.5952380952380953, "grad_norm": 0.8393643465691598, "learning_rate": 8.795628760649965e-06, "loss": 0.4552, "mean_token_accuracy": 0.8432473209686577, "num_tokens": 172910673.0, "step": 201 }, { "entropy": 0.4252777099609375, "epoch": 1.6031746031746033, "grad_norm": 0.930458721360079, "learning_rate": 8.781351519585978e-06, "loss": 0.4602, "mean_token_accuracy": 0.8418687861412764, "num_tokens": 173775762.0, "step": 202 }, { "entropy": 0.4301910400390625, "epoch": 1.6111111111111112, "grad_norm": 0.9255152673550228, "learning_rate": 8.767001885415055e-06, "loss": 0.4658, "mean_token_accuracy": 0.8412896669469774, "num_tokens": 174651858.0, "step": 203 }, { "entropy": 0.4319915771484375, "epoch": 1.619047619047619, "grad_norm": 0.8156780804217264, "learning_rate": 8.752580132857652e-06, "loss": 0.4576, "mean_token_accuracy": 0.8430444840341806, "num_tokens": 175519282.0, "step": 204 }, { "entropy": 0.4349212646484375, "epoch": 1.626984126984127, "grad_norm": 0.8770655718885645, "learning_rate": 8.73808653801491e-06, "loss": 0.4714, "mean_token_accuracy": 0.8400326487608254, "num_tokens": 176387199.0, "step": 205 }, { "entropy": 0.430511474609375, "epoch": 1.6349206349206349, "grad_norm": 0.8757592238026767, "learning_rate": 8.723521378363378e-06, "loss": 0.4681, "mean_token_accuracy": 0.8415880398824811, "num_tokens": 177264131.0, "step": 206 }, { "entropy": 0.43365478515625, "epoch": 1.6428571428571428, "grad_norm": 0.89706430686004, "learning_rate": 8.70888493274969e-06, "loss": 0.4581, "mean_token_accuracy": 0.8423688313923776, "num_tokens": 178114003.0, "step": 207 }, { "entropy": 0.4362945556640625, "epoch": 1.6507936507936507, "grad_norm": 1.245121139741542, "learning_rate": 8.694177481385244e-06, "loss": 0.4681, "mean_token_accuracy": 0.8389896345324814, "num_tokens": 178950487.0, "step": 208 }, { "entropy": 0.4297637939453125, "epoch": 1.6587301587301586, "grad_norm": 0.9621053522750438, "learning_rate": 8.679399305840815e-06, "loss": 0.4694, "mean_token_accuracy": 0.83825440146029, "num_tokens": 179833212.0, "step": 209 }, { "entropy": 0.4279327392578125, "epoch": 1.6666666666666665, "grad_norm": 0.8737950990378116, "learning_rate": 8.664550689041187e-06, "loss": 0.461, "mean_token_accuracy": 0.8423066223040223, "num_tokens": 180712234.0, "step": 210 }, { "entropy": 0.4318084716796875, "epoch": 1.6746031746031746, "grad_norm": 1.000096359581219, "learning_rate": 8.649631915259716e-06, "loss": 0.4741, "mean_token_accuracy": 0.8375975685194135, "num_tokens": 181566490.0, "step": 211 }, { "entropy": 0.4336090087890625, "epoch": 1.6825396825396826, "grad_norm": 0.8934119220152827, "learning_rate": 8.634643270112903e-06, "loss": 0.4667, "mean_token_accuracy": 0.8412727518007159, "num_tokens": 182413254.0, "step": 212 }, { "entropy": 0.430084228515625, "epoch": 1.6904761904761905, "grad_norm": 0.8156026818943841, "learning_rate": 8.61958504055492e-06, "loss": 0.4599, "mean_token_accuracy": 0.8421376254409552, "num_tokens": 183269538.0, "step": 213 }, { "entropy": 0.4370574951171875, "epoch": 1.6984126984126984, "grad_norm": 1.0433955766227752, "learning_rate": 8.604457514872115e-06, "loss": 0.4577, "mean_token_accuracy": 0.8438415261916816, "num_tokens": 184109496.0, "step": 214 }, { "entropy": 0.4349365234375, "epoch": 1.7063492063492065, "grad_norm": 0.8795834565304798, "learning_rate": 8.589260982677496e-06, "loss": 0.4716, "mean_token_accuracy": 0.8374428367242217, "num_tokens": 184968366.0, "step": 215 }, { "entropy": 0.42926025390625, "epoch": 1.7142857142857144, "grad_norm": 0.8234311825574274, "learning_rate": 8.573995734905185e-06, "loss": 0.4689, "mean_token_accuracy": 0.8390569076873362, "num_tokens": 185857166.0, "step": 216 }, { "entropy": 0.436279296875, "epoch": 1.7222222222222223, "grad_norm": 0.9720711352685596, "learning_rate": 8.558662063804843e-06, "loss": 0.452, "mean_token_accuracy": 0.8439166625030339, "num_tokens": 186684767.0, "step": 217 }, { "entropy": 0.4309844970703125, "epoch": 1.7301587301587302, "grad_norm": 0.8753161905032254, "learning_rate": 8.543260262936087e-06, "loss": 0.4545, "mean_token_accuracy": 0.843706154730171, "num_tokens": 187534641.0, "step": 218 }, { "entropy": 0.429901123046875, "epoch": 1.7380952380952381, "grad_norm": 0.8611756266061616, "learning_rate": 8.527790627162858e-06, "loss": 0.4594, "mean_token_accuracy": 0.8414032305590808, "num_tokens": 188403747.0, "step": 219 }, { "entropy": 0.4284210205078125, "epoch": 1.746031746031746, "grad_norm": 0.917314816658313, "learning_rate": 8.512253452647783e-06, "loss": 0.4636, "mean_token_accuracy": 0.8410017411224544, "num_tokens": 189286051.0, "step": 220 }, { "entropy": 0.435394287109375, "epoch": 1.753968253968254, "grad_norm": 0.8160829606015351, "learning_rate": 8.496649036846502e-06, "loss": 0.4556, "mean_token_accuracy": 0.8419742425903678, "num_tokens": 190135846.0, "step": 221 }, { "entropy": 0.4247589111328125, "epoch": 1.7619047619047619, "grad_norm": 1.0175216766708233, "learning_rate": 8.480977678501974e-06, "loss": 0.4658, "mean_token_accuracy": 0.8410613937303424, "num_tokens": 191023956.0, "step": 222 }, { "entropy": 0.43572998046875, "epoch": 1.7698412698412698, "grad_norm": 0.9507033398860977, "learning_rate": 8.465239677638755e-06, "loss": 0.4554, "mean_token_accuracy": 0.8437660122290254, "num_tokens": 191865715.0, "step": 223 }, { "entropy": 0.4373016357421875, "epoch": 1.7777777777777777, "grad_norm": 0.8180069161681153, "learning_rate": 8.449435335557264e-06, "loss": 0.4575, "mean_token_accuracy": 0.8432631348259747, "num_tokens": 192687536.0, "step": 224 }, { "entropy": 0.4383087158203125, "epoch": 1.7857142857142856, "grad_norm": 0.8838273166712945, "learning_rate": 8.433564954828e-06, "loss": 0.4526, "mean_token_accuracy": 0.8442786163650453, "num_tokens": 193514317.0, "step": 225 }, { "entropy": 0.42999267578125, "epoch": 1.7936507936507935, "grad_norm": 0.9410341071186311, "learning_rate": 8.417628839285757e-06, "loss": 0.4581, "mean_token_accuracy": 0.8429269646294415, "num_tokens": 194368425.0, "step": 226 }, { "entropy": 0.4307098388671875, "epoch": 1.8015873015873016, "grad_norm": 0.90812536954616, "learning_rate": 8.401627294023815e-06, "loss": 0.4577, "mean_token_accuracy": 0.8424977059476078, "num_tokens": 195229420.0, "step": 227 }, { "entropy": 0.4311065673828125, "epoch": 1.8095238095238095, "grad_norm": 0.8936972613642765, "learning_rate": 8.385560625388081e-06, "loss": 0.4613, "mean_token_accuracy": 0.8418103088624775, "num_tokens": 196086060.0, "step": 228 }, { "entropy": 0.4331512451171875, "epoch": 1.8174603174603174, "grad_norm": 0.9023364391946196, "learning_rate": 8.369429140971239e-06, "loss": 0.4587, "mean_token_accuracy": 0.840968404430896, "num_tokens": 196949752.0, "step": 229 }, { "entropy": 0.431732177734375, "epoch": 1.8253968253968254, "grad_norm": 0.9269255151577744, "learning_rate": 8.353233149606859e-06, "loss": 0.4564, "mean_token_accuracy": 0.8422707901336253, "num_tokens": 197787383.0, "step": 230 }, { "entropy": 0.4332275390625, "epoch": 1.8333333333333335, "grad_norm": 0.8746862673486592, "learning_rate": 8.336972961363472e-06, "loss": 0.4641, "mean_token_accuracy": 0.8415999473072588, "num_tokens": 198640204.0, "step": 231 }, { "entropy": 0.4254608154296875, "epoch": 1.8412698412698414, "grad_norm": 0.8419998918591282, "learning_rate": 8.320648887538657e-06, "loss": 0.4628, "mean_token_accuracy": 0.8425387698225677, "num_tokens": 199534945.0, "step": 232 }, { "entropy": 0.4349517822265625, "epoch": 1.8492063492063493, "grad_norm": 0.945589758024129, "learning_rate": 8.304261240653054e-06, "loss": 0.4546, "mean_token_accuracy": 0.8429999812506139, "num_tokens": 200401566.0, "step": 233 }, { "entropy": 0.4324798583984375, "epoch": 1.8571428571428572, "grad_norm": 0.9389620288256866, "learning_rate": 8.287810334444406e-06, "loss": 0.4616, "mean_token_accuracy": 0.8408999373205006, "num_tokens": 201286569.0, "step": 234 }, { "entropy": 0.4327392578125, "epoch": 1.8650793650793651, "grad_norm": 0.9056957266265069, "learning_rate": 8.271296483861532e-06, "loss": 0.4555, "mean_token_accuracy": 0.8440436110831797, "num_tokens": 202148785.0, "step": 235 }, { "entropy": 0.43682861328125, "epoch": 1.873015873015873, "grad_norm": 0.9007501274176329, "learning_rate": 8.254720005058317e-06, "loss": 0.4511, "mean_token_accuracy": 0.8437027987092733, "num_tokens": 202969412.0, "step": 236 }, { "entropy": 0.429046630859375, "epoch": 1.880952380952381, "grad_norm": 0.7886955269176177, "learning_rate": 8.238081215387639e-06, "loss": 0.4572, "mean_token_accuracy": 0.8425727025605738, "num_tokens": 203845826.0, "step": 237 }, { "entropy": 0.4304656982421875, "epoch": 1.8888888888888888, "grad_norm": 0.9902829953426554, "learning_rate": 8.221380433395308e-06, "loss": 0.4522, "mean_token_accuracy": 0.8438800727017224, "num_tokens": 204713067.0, "step": 238 }, { "entropy": 0.4382476806640625, "epoch": 1.8968253968253967, "grad_norm": 0.8783861526345048, "learning_rate": 8.204617978813963e-06, "loss": 0.4544, "mean_token_accuracy": 0.8443545303307474, "num_tokens": 205549482.0, "step": 239 }, { "entropy": 0.4334259033203125, "epoch": 1.9047619047619047, "grad_norm": 0.7800627411645534, "learning_rate": 8.187794172556947e-06, "loss": 0.4535, "mean_token_accuracy": 0.8426107591949403, "num_tokens": 206394578.0, "step": 240 }, { "entropy": 0.4372100830078125, "epoch": 1.9126984126984126, "grad_norm": 0.8924141210495853, "learning_rate": 8.170909336712171e-06, "loss": 0.4593, "mean_token_accuracy": 0.8427824974060059, "num_tokens": 207233636.0, "step": 241 }, { "entropy": 0.4353790283203125, "epoch": 1.9206349206349205, "grad_norm": 0.8893426872353928, "learning_rate": 8.153963794535945e-06, "loss": 0.4604, "mean_token_accuracy": 0.841770654078573, "num_tokens": 208074376.0, "step": 242 }, { "entropy": 0.4335174560546875, "epoch": 1.9285714285714286, "grad_norm": 0.8143885506939128, "learning_rate": 8.136957870446779e-06, "loss": 0.4591, "mean_token_accuracy": 0.8414175752550364, "num_tokens": 208947370.0, "step": 243 }, { "entropy": 0.4306640625, "epoch": 1.9365079365079365, "grad_norm": 0.8217558583786552, "learning_rate": 8.119891890019187e-06, "loss": 0.4502, "mean_token_accuracy": 0.8447873778641224, "num_tokens": 209798547.0, "step": 244 }, { "entropy": 0.4336090087890625, "epoch": 1.9444444444444444, "grad_norm": 0.8345483891742207, "learning_rate": 8.102766179977452e-06, "loss": 0.4548, "mean_token_accuracy": 0.843397512100637, "num_tokens": 210661829.0, "step": 245 }, { "entropy": 0.4267578125, "epoch": 1.9523809523809523, "grad_norm": 0.886458439838755, "learning_rate": 8.085581068189358e-06, "loss": 0.4546, "mean_token_accuracy": 0.8432880756445229, "num_tokens": 211549046.0, "step": 246 }, { "entropy": 0.4273834228515625, "epoch": 1.9603174603174605, "grad_norm": 0.7893301359285466, "learning_rate": 8.068336883659926e-06, "loss": 0.4483, "mean_token_accuracy": 0.8453035233542323, "num_tokens": 212447521.0, "step": 247 }, { "entropy": 0.4306793212890625, "epoch": 1.9682539682539684, "grad_norm": 0.891429474690652, "learning_rate": 8.051033956525113e-06, "loss": 0.4539, "mean_token_accuracy": 0.8432926838286221, "num_tokens": 213310334.0, "step": 248 }, { "entropy": 0.429534912109375, "epoch": 1.9761904761904763, "grad_norm": 0.8247760750659134, "learning_rate": 8.033672618045485e-06, "loss": 0.4524, "mean_token_accuracy": 0.8450775747187436, "num_tokens": 214169043.0, "step": 249 }, { "entropy": 0.4324188232421875, "epoch": 1.9841269841269842, "grad_norm": 0.8524339615157, "learning_rate": 8.016253200599885e-06, "loss": 0.4519, "mean_token_accuracy": 0.8445535181090236, "num_tokens": 215005057.0, "step": 250 }, { "entropy": 0.4263763427734375, "epoch": 1.992063492063492, "grad_norm": 0.8331975898868739, "learning_rate": 7.998776037679061e-06, "loss": 0.4437, "mean_token_accuracy": 0.8456794614903629, "num_tokens": 215869766.0, "step": 251 }, { "entropy": 0.4291229248046875, "epoch": 2.0, "grad_norm": 0.8613130972882047, "learning_rate": 7.981241463879284e-06, "loss": 0.4466, "mean_token_accuracy": 0.8456757622770965, "num_tokens": 216731206.0, "step": 252 }, { "entropy": 0.428619384765625, "epoch": 2.007936507936508, "grad_norm": 0.9277446577089026, "learning_rate": 7.963649814895945e-06, "loss": 0.4256, "mean_token_accuracy": 0.8530098241753876, "num_tokens": 217589905.0, "step": 253 }, { "entropy": 0.4280853271484375, "epoch": 2.015873015873016, "grad_norm": 0.8708275069644504, "learning_rate": 7.94600142751713e-06, "loss": 0.432, "mean_token_accuracy": 0.8501051301136613, "num_tokens": 218446876.0, "step": 254 }, { "entropy": 0.42718505859375, "epoch": 2.0238095238095237, "grad_norm": 0.8842468147508419, "learning_rate": 7.92829663961716e-06, "loss": 0.433, "mean_token_accuracy": 0.850572609808296, "num_tokens": 219322571.0, "step": 255 }, { "entropy": 0.42889404296875, "epoch": 2.0317460317460316, "grad_norm": 0.8439073466722959, "learning_rate": 7.910535790150135e-06, "loss": 0.4291, "mean_token_accuracy": 0.8493411005474627, "num_tokens": 220180160.0, "step": 256 }, { "entropy": 0.42620849609375, "epoch": 2.0396825396825395, "grad_norm": 0.8757701106721189, "learning_rate": 7.892719219143446e-06, "loss": 0.42, "mean_token_accuracy": 0.8547583618201315, "num_tokens": 221016578.0, "step": 257 }, { "entropy": 0.42437744140625, "epoch": 2.0476190476190474, "grad_norm": 0.8882389150609792, "learning_rate": 7.874847267691254e-06, "loss": 0.4293, "mean_token_accuracy": 0.8507325639948249, "num_tokens": 221871968.0, "step": 258 }, { "entropy": 0.4220123291015625, "epoch": 2.0555555555555554, "grad_norm": 0.8012689613491503, "learning_rate": 7.856920277947969e-06, "loss": 0.4236, "mean_token_accuracy": 0.8522419198416173, "num_tokens": 222738323.0, "step": 259 }, { "entropy": 0.4245758056640625, "epoch": 2.0634920634920633, "grad_norm": 0.9448934032497721, "learning_rate": 7.83893859312169e-06, "loss": 0.4286, "mean_token_accuracy": 0.8518815254792571, "num_tokens": 223584134.0, "step": 260 }, { "entropy": 0.4188232421875, "epoch": 2.0714285714285716, "grad_norm": 0.8498468200899314, "learning_rate": 7.820902557467648e-06, "loss": 0.4256, "mean_token_accuracy": 0.8522023572586477, "num_tokens": 224461654.0, "step": 261 }, { "entropy": 0.42340087890625, "epoch": 2.0793650793650795, "grad_norm": 0.9494122603581977, "learning_rate": 7.80281251628161e-06, "loss": 0.4325, "mean_token_accuracy": 0.8496407098136842, "num_tokens": 225327562.0, "step": 262 }, { "entropy": 0.4233551025390625, "epoch": 2.0873015873015874, "grad_norm": 0.773955396676882, "learning_rate": 7.784668815893256e-06, "loss": 0.4201, "mean_token_accuracy": 0.853766305372119, "num_tokens": 226189031.0, "step": 263 }, { "entropy": 0.4216156005859375, "epoch": 2.0952380952380953, "grad_norm": 0.9147295809460214, "learning_rate": 7.766471803659571e-06, "loss": 0.4396, "mean_token_accuracy": 0.8481186041608453, "num_tokens": 227062309.0, "step": 264 }, { "entropy": 0.4239349365234375, "epoch": 2.1031746031746033, "grad_norm": 0.9057979347358639, "learning_rate": 7.748221827958174e-06, "loss": 0.4297, "mean_token_accuracy": 0.8508882015012205, "num_tokens": 227920598.0, "step": 265 }, { "entropy": 0.422332763671875, "epoch": 2.111111111111111, "grad_norm": 0.7698507478470203, "learning_rate": 7.729919238180663e-06, "loss": 0.4239, "mean_token_accuracy": 0.8522637677378953, "num_tokens": 228773818.0, "step": 266 }, { "entropy": 0.4227447509765625, "epoch": 2.119047619047619, "grad_norm": 0.7899935011725722, "learning_rate": 7.711564384725916e-06, "loss": 0.4215, "mean_token_accuracy": 0.8535870416089892, "num_tokens": 229627711.0, "step": 267 }, { "entropy": 0.4207000732421875, "epoch": 2.126984126984127, "grad_norm": 0.7934125500230887, "learning_rate": 7.693157618993392e-06, "loss": 0.4334, "mean_token_accuracy": 0.8498726398684084, "num_tokens": 230489032.0, "step": 268 }, { "entropy": 0.421234130859375, "epoch": 2.134920634920635, "grad_norm": 0.7955872905920104, "learning_rate": 7.674699293376397e-06, "loss": 0.4349, "mean_token_accuracy": 0.8490500543266535, "num_tokens": 231339019.0, "step": 269 }, { "entropy": 0.4244232177734375, "epoch": 2.142857142857143, "grad_norm": 0.7854549442643708, "learning_rate": 7.656189761255333e-06, "loss": 0.4319, "mean_token_accuracy": 0.8492707693949342, "num_tokens": 232199672.0, "step": 270 }, { "entropy": 0.4268646240234375, "epoch": 2.1507936507936507, "grad_norm": 0.7878191667376515, "learning_rate": 7.63762937699095e-06, "loss": 0.4309, "mean_token_accuracy": 0.8514748462475836, "num_tokens": 233076007.0, "step": 271 }, { "entropy": 0.4263763427734375, "epoch": 2.1587301587301586, "grad_norm": 0.7996554554456847, "learning_rate": 7.619018495917543e-06, "loss": 0.4302, "mean_token_accuracy": 0.8500847779214382, "num_tokens": 233942156.0, "step": 272 }, { "entropy": 0.4241943359375, "epoch": 2.1666666666666665, "grad_norm": 0.7525601660809861, "learning_rate": 7.600357474336157e-06, "loss": 0.432, "mean_token_accuracy": 0.8499450846575201, "num_tokens": 234844668.0, "step": 273 }, { "entropy": 0.4309234619140625, "epoch": 2.1746031746031744, "grad_norm": 0.8012481468665732, "learning_rate": 7.581646669507768e-06, "loss": 0.4329, "mean_token_accuracy": 0.8488554251380265, "num_tokens": 235697877.0, "step": 274 }, { "entropy": 0.425933837890625, "epoch": 2.1825396825396823, "grad_norm": 0.7849735766550899, "learning_rate": 7.56288643964644e-06, "loss": 0.4253, "mean_token_accuracy": 0.851461592130363, "num_tokens": 236586699.0, "step": 275 }, { "entropy": 0.4276123046875, "epoch": 2.1904761904761907, "grad_norm": 0.8396810265614048, "learning_rate": 7.544077143912467e-06, "loss": 0.425, "mean_token_accuracy": 0.8501534420065582, "num_tokens": 237426378.0, "step": 276 }, { "entropy": 0.4206085205078125, "epoch": 2.1984126984126986, "grad_norm": 0.8220018076708352, "learning_rate": 7.525219142405501e-06, "loss": 0.4272, "mean_token_accuracy": 0.8498959382995963, "num_tokens": 238297987.0, "step": 277 }, { "entropy": 0.4322357177734375, "epoch": 2.2063492063492065, "grad_norm": 0.8197639207257438, "learning_rate": 7.506312796157649e-06, "loss": 0.4381, "mean_token_accuracy": 0.8488305411301553, "num_tokens": 239171101.0, "step": 278 }, { "entropy": 0.4268646240234375, "epoch": 2.2142857142857144, "grad_norm": 0.783646321002473, "learning_rate": 7.487358467126573e-06, "loss": 0.4242, "mean_token_accuracy": 0.8518037595786154, "num_tokens": 240034337.0, "step": 279 }, { "entropy": 0.43035888671875, "epoch": 2.2222222222222223, "grad_norm": 0.8546249612241452, "learning_rate": 7.468356518188551e-06, "loss": 0.4174, "mean_token_accuracy": 0.8534762058407068, "num_tokens": 240860927.0, "step": 280 }, { "entropy": 0.42437744140625, "epoch": 2.2301587301587302, "grad_norm": 0.87740351405863, "learning_rate": 7.449307313131533e-06, "loss": 0.4296, "mean_token_accuracy": 0.8500415538437665, "num_tokens": 241739076.0, "step": 281 }, { "entropy": 0.4241180419921875, "epoch": 2.238095238095238, "grad_norm": 0.8620781094998687, "learning_rate": 7.4302112166481814e-06, "loss": 0.4152, "mean_token_accuracy": 0.8549392893910408, "num_tokens": 242574011.0, "step": 282 }, { "entropy": 0.4252471923828125, "epoch": 2.246031746031746, "grad_norm": 0.8379775455346818, "learning_rate": 7.411068594328876e-06, "loss": 0.4292, "mean_token_accuracy": 0.8494298844598234, "num_tokens": 243458878.0, "step": 283 }, { "entropy": 0.4227294921875, "epoch": 2.253968253968254, "grad_norm": 0.825758180012109, "learning_rate": 7.391879812654727e-06, "loss": 0.4257, "mean_token_accuracy": 0.852616976480931, "num_tokens": 244313964.0, "step": 284 }, { "entropy": 0.42498779296875, "epoch": 2.261904761904762, "grad_norm": 0.7941347937575597, "learning_rate": 7.37264523899056e-06, "loss": 0.4204, "mean_token_accuracy": 0.8534517176449299, "num_tokens": 245200322.0, "step": 285 }, { "entropy": 0.4271087646484375, "epoch": 2.2698412698412698, "grad_norm": 0.8928939293234606, "learning_rate": 7.353365241577869e-06, "loss": 0.4274, "mean_token_accuracy": 0.8513154000975192, "num_tokens": 246083539.0, "step": 286 }, { "entropy": 0.427947998046875, "epoch": 2.2777777777777777, "grad_norm": 0.8392795753081728, "learning_rate": 7.3340401895277816e-06, "loss": 0.4276, "mean_token_accuracy": 0.8511864547617733, "num_tokens": 246933619.0, "step": 287 }, { "entropy": 0.4322662353515625, "epoch": 2.2857142857142856, "grad_norm": 0.8013508005420562, "learning_rate": 7.314670452813982e-06, "loss": 0.4188, "mean_token_accuracy": 0.8539650039747357, "num_tokens": 247765672.0, "step": 288 }, { "entropy": 0.4230804443359375, "epoch": 2.2936507936507935, "grad_norm": 0.7925852081903219, "learning_rate": 7.295256402265636e-06, "loss": 0.4208, "mean_token_accuracy": 0.8516722363419831, "num_tokens": 248628378.0, "step": 289 }, { "entropy": 0.4259033203125, "epoch": 2.3015873015873014, "grad_norm": 0.881087099852364, "learning_rate": 7.275798409560282e-06, "loss": 0.4286, "mean_token_accuracy": 0.8508175020106137, "num_tokens": 249501143.0, "step": 290 }, { "entropy": 0.42486572265625, "epoch": 2.3095238095238093, "grad_norm": 0.8044072073317771, "learning_rate": 7.256296847216727e-06, "loss": 0.4208, "mean_token_accuracy": 0.8538436009548604, "num_tokens": 250356099.0, "step": 291 }, { "entropy": 0.42413330078125, "epoch": 2.317460317460317, "grad_norm": 0.8884297194353945, "learning_rate": 7.236752088587905e-06, "loss": 0.4278, "mean_token_accuracy": 0.850508657284081, "num_tokens": 251219125.0, "step": 292 }, { "entropy": 0.4213409423828125, "epoch": 2.3253968253968256, "grad_norm": 0.8328136207372636, "learning_rate": 7.217164507853734e-06, "loss": 0.423, "mean_token_accuracy": 0.8522739242762327, "num_tokens": 252080434.0, "step": 293 }, { "entropy": 0.426666259765625, "epoch": 2.3333333333333335, "grad_norm": 0.8427474548537396, "learning_rate": 7.197534480013951e-06, "loss": 0.4203, "mean_token_accuracy": 0.85275460453704, "num_tokens": 252923218.0, "step": 294 }, { "entropy": 0.430084228515625, "epoch": 2.3412698412698414, "grad_norm": 0.9004833679442211, "learning_rate": 7.177862380880935e-06, "loss": 0.4218, "mean_token_accuracy": 0.8528542476706207, "num_tokens": 253761289.0, "step": 295 }, { "entropy": 0.4216156005859375, "epoch": 2.3492063492063493, "grad_norm": 0.8913589598940626, "learning_rate": 7.158148587072509e-06, "loss": 0.425, "mean_token_accuracy": 0.8505891724489629, "num_tokens": 254643716.0, "step": 296 }, { "entropy": 0.426910400390625, "epoch": 2.357142857142857, "grad_norm": 0.7894369274614703, "learning_rate": 7.138393476004725e-06, "loss": 0.425, "mean_token_accuracy": 0.8516062931157649, "num_tokens": 255486573.0, "step": 297 }, { "entropy": 0.426300048828125, "epoch": 2.365079365079365, "grad_norm": 0.8212693828322741, "learning_rate": 7.118597425884659e-06, "loss": 0.4154, "mean_token_accuracy": 0.8540734858252108, "num_tokens": 256345326.0, "step": 298 }, { "entropy": 0.4244842529296875, "epoch": 2.373015873015873, "grad_norm": 0.8408664594175462, "learning_rate": 7.098760815703139e-06, "loss": 0.4159, "mean_token_accuracy": 0.8559228433296084, "num_tokens": 257185152.0, "step": 299 }, { "entropy": 0.42059326171875, "epoch": 2.380952380952381, "grad_norm": 0.8309509931885469, "learning_rate": 7.078884025227519e-06, "loss": 0.4215, "mean_token_accuracy": 0.8527105739340186, "num_tokens": 258067282.0, "step": 300 }, { "entropy": 0.421661376953125, "epoch": 2.388888888888889, "grad_norm": 0.7954323327920004, "learning_rate": 7.058967434994388e-06, "loss": 0.4251, "mean_token_accuracy": 0.8514108480885625, "num_tokens": 258944016.0, "step": 301 }, { "entropy": 0.425048828125, "epoch": 2.3968253968253967, "grad_norm": 0.7636778328503014, "learning_rate": 7.0390114263022955e-06, "loss": 0.4198, "mean_token_accuracy": 0.8537601926364005, "num_tokens": 259808268.0, "step": 302 }, { "entropy": 0.41961669921875, "epoch": 2.4047619047619047, "grad_norm": 0.8444532257440839, "learning_rate": 7.019016381204448e-06, "loss": 0.4264, "mean_token_accuracy": 0.8519964478909969, "num_tokens": 260684292.0, "step": 303 }, { "entropy": 0.4252777099609375, "epoch": 2.4126984126984126, "grad_norm": 0.8265870173926899, "learning_rate": 6.998982682501394e-06, "loss": 0.4233, "mean_token_accuracy": 0.8529867087490857, "num_tokens": 261555918.0, "step": 304 }, { "entropy": 0.4233245849609375, "epoch": 2.4206349206349205, "grad_norm": 0.8766807638096971, "learning_rate": 6.978910713733696e-06, "loss": 0.4207, "mean_token_accuracy": 0.8529614573344588, "num_tokens": 262425946.0, "step": 305 }, { "entropy": 0.4260406494140625, "epoch": 2.4285714285714284, "grad_norm": 0.8180287946820591, "learning_rate": 6.958800859174591e-06, "loss": 0.4155, "mean_token_accuracy": 0.8538580327294767, "num_tokens": 263268966.0, "step": 306 }, { "entropy": 0.420166015625, "epoch": 2.4365079365079367, "grad_norm": 0.8038512105532972, "learning_rate": 6.938653503822628e-06, "loss": 0.4193, "mean_token_accuracy": 0.8529025730676949, "num_tokens": 264137961.0, "step": 307 }, { "entropy": 0.4186859130859375, "epoch": 2.4444444444444446, "grad_norm": 0.8356237787218255, "learning_rate": 6.9184690333942995e-06, "loss": 0.4179, "mean_token_accuracy": 0.8538753935135901, "num_tokens": 264995910.0, "step": 308 }, { "entropy": 0.4160308837890625, "epoch": 2.4523809523809526, "grad_norm": 0.8358036143672558, "learning_rate": 6.898247834316662e-06, "loss": 0.4147, "mean_token_accuracy": 0.8543582037091255, "num_tokens": 265867518.0, "step": 309 }, { "entropy": 0.41815185546875, "epoch": 2.4603174603174605, "grad_norm": 0.9260389067513531, "learning_rate": 6.877990293719928e-06, "loss": 0.4211, "mean_token_accuracy": 0.8540931805036962, "num_tokens": 266730039.0, "step": 310 }, { "entropy": 0.4172515869140625, "epoch": 2.4682539682539684, "grad_norm": 0.7930039952405856, "learning_rate": 6.857696799430064e-06, "loss": 0.4248, "mean_token_accuracy": 0.8519657654687762, "num_tokens": 267605673.0, "step": 311 }, { "entropy": 0.4198455810546875, "epoch": 2.4761904761904763, "grad_norm": 0.8779922529454903, "learning_rate": 6.83736773996136e-06, "loss": 0.4276, "mean_token_accuracy": 0.852175232488662, "num_tokens": 268470812.0, "step": 312 }, { "entropy": 0.418670654296875, "epoch": 2.484126984126984, "grad_norm": 0.7739740399164128, "learning_rate": 6.817003504508993e-06, "loss": 0.4145, "mean_token_accuracy": 0.853930065408349, "num_tokens": 269329768.0, "step": 313 }, { "entropy": 0.4190521240234375, "epoch": 2.492063492063492, "grad_norm": 0.7927430903268082, "learning_rate": 6.796604482941578e-06, "loss": 0.4238, "mean_token_accuracy": 0.8510767961852252, "num_tokens": 270192672.0, "step": 314 }, { "entropy": 0.4205474853515625, "epoch": 2.5, "grad_norm": 0.7677286448168184, "learning_rate": 6.7761710657936995e-06, "loss": 0.4282, "mean_token_accuracy": 0.8515617684461176, "num_tokens": 271053623.0, "step": 315 }, { "entropy": 0.415618896484375, "epoch": 2.507936507936508, "grad_norm": 0.7893175807304748, "learning_rate": 6.75570364425844e-06, "loss": 0.4215, "mean_token_accuracy": 0.8526675584726036, "num_tokens": 271921985.0, "step": 316 }, { "entropy": 0.4283447265625, "epoch": 2.515873015873016, "grad_norm": 0.8617893689163498, "learning_rate": 6.735202610179886e-06, "loss": 0.4235, "mean_token_accuracy": 0.8520378330722451, "num_tokens": 272757706.0, "step": 317 }, { "entropy": 0.42413330078125, "epoch": 2.5238095238095237, "grad_norm": 0.76248538584374, "learning_rate": 6.714668356045629e-06, "loss": 0.4155, "mean_token_accuracy": 0.8540036669000983, "num_tokens": 273603268.0, "step": 318 }, { "entropy": 0.421356201171875, "epoch": 2.5317460317460316, "grad_norm": 1.1471382166034823, "learning_rate": 6.694101274979253e-06, "loss": 0.4182, "mean_token_accuracy": 0.8544265124946833, "num_tokens": 274458735.0, "step": 319 }, { "entropy": 0.419586181640625, "epoch": 2.5396825396825395, "grad_norm": 0.8503843517257628, "learning_rate": 6.673501760732805e-06, "loss": 0.4188, "mean_token_accuracy": 0.851504479534924, "num_tokens": 275320028.0, "step": 320 }, { "entropy": 0.41754150390625, "epoch": 2.5476190476190474, "grad_norm": 0.7742097684397823, "learning_rate": 6.652870207679253e-06, "loss": 0.4154, "mean_token_accuracy": 0.8555147871375084, "num_tokens": 276151262.0, "step": 321 }, { "entropy": 0.415802001953125, "epoch": 2.5555555555555554, "grad_norm": 0.7996726962055972, "learning_rate": 6.632207010804949e-06, "loss": 0.4175, "mean_token_accuracy": 0.8534226748161018, "num_tokens": 276997327.0, "step": 322 }, { "entropy": 0.420318603515625, "epoch": 2.5634920634920633, "grad_norm": 0.8023983937223226, "learning_rate": 6.611512565702053e-06, "loss": 0.4226, "mean_token_accuracy": 0.8535379455424845, "num_tokens": 277849848.0, "step": 323 }, { "entropy": 0.4129180908203125, "epoch": 2.571428571428571, "grad_norm": 0.794860570280225, "learning_rate": 6.590787268560967e-06, "loss": 0.4126, "mean_token_accuracy": 0.8558539836667478, "num_tokens": 278726761.0, "step": 324 }, { "entropy": 0.417694091796875, "epoch": 2.5793650793650795, "grad_norm": 0.8947468548309203, "learning_rate": 6.570031516162746e-06, "loss": 0.4161, "mean_token_accuracy": 0.8547689928673208, "num_tokens": 279572082.0, "step": 325 }, { "entropy": 0.4159698486328125, "epoch": 2.5873015873015874, "grad_norm": 0.7955201654992391, "learning_rate": 6.549245705871507e-06, "loss": 0.4146, "mean_token_accuracy": 0.854183979332447, "num_tokens": 280414468.0, "step": 326 }, { "entropy": 0.4205780029296875, "epoch": 2.5952380952380953, "grad_norm": 0.8138022818439977, "learning_rate": 6.528430235626819e-06, "loss": 0.4216, "mean_token_accuracy": 0.8531410917639732, "num_tokens": 281237288.0, "step": 327 }, { "entropy": 0.4152374267578125, "epoch": 2.6031746031746033, "grad_norm": 0.8114079031107396, "learning_rate": 6.5075855039360805e-06, "loss": 0.4092, "mean_token_accuracy": 0.8578996560536325, "num_tokens": 282118057.0, "step": 328 }, { "entropy": 0.409637451171875, "epoch": 2.611111111111111, "grad_norm": 0.8647529166774726, "learning_rate": 6.486711909866895e-06, "loss": 0.4248, "mean_token_accuracy": 0.8518201056867838, "num_tokens": 283028330.0, "step": 329 }, { "entropy": 0.4239501953125, "epoch": 2.619047619047619, "grad_norm": 0.7331498819381451, "learning_rate": 6.465809853039431e-06, "loss": 0.4172, "mean_token_accuracy": 0.8533883499912918, "num_tokens": 283866607.0, "step": 330 }, { "entropy": 0.425384521484375, "epoch": 2.626984126984127, "grad_norm": 0.9242263399118948, "learning_rate": 6.444879733618766e-06, "loss": 0.4229, "mean_token_accuracy": 0.852979929652065, "num_tokens": 284705319.0, "step": 331 }, { "entropy": 0.419525146484375, "epoch": 2.634920634920635, "grad_norm": 0.8158292669223365, "learning_rate": 6.423921952307237e-06, "loss": 0.4338, "mean_token_accuracy": 0.8505428163334727, "num_tokens": 285598883.0, "step": 332 }, { "entropy": 0.422210693359375, "epoch": 2.642857142857143, "grad_norm": 0.8529287289999934, "learning_rate": 6.4029369103367545e-06, "loss": 0.4199, "mean_token_accuracy": 0.8537574140354991, "num_tokens": 286461446.0, "step": 333 }, { "entropy": 0.4251708984375, "epoch": 2.6507936507936507, "grad_norm": 0.8196864990296487, "learning_rate": 6.381925009461128e-06, "loss": 0.4171, "mean_token_accuracy": 0.8536815252155066, "num_tokens": 287308399.0, "step": 334 }, { "entropy": 0.4163818359375, "epoch": 2.6587301587301586, "grad_norm": 0.7820718545979705, "learning_rate": 6.3608866519483825e-06, "loss": 0.4198, "mean_token_accuracy": 0.8528619990684092, "num_tokens": 288187832.0, "step": 335 }, { "entropy": 0.4176025390625, "epoch": 2.6666666666666665, "grad_norm": 0.796216651976639, "learning_rate": 6.339822240573041e-06, "loss": 0.4169, "mean_token_accuracy": 0.8543005757965147, "num_tokens": 289047051.0, "step": 336 }, { "entropy": 0.421844482421875, "epoch": 2.674603174603175, "grad_norm": 0.8463751671443359, "learning_rate": 6.3187321786084236e-06, "loss": 0.423, "mean_token_accuracy": 0.852651288267225, "num_tokens": 289920851.0, "step": 337 }, { "entropy": 0.418731689453125, "epoch": 2.682539682539683, "grad_norm": 0.8240504405278195, "learning_rate": 6.297616869818926e-06, "loss": 0.4069, "mean_token_accuracy": 0.8571417732164264, "num_tokens": 290766931.0, "step": 338 }, { "entropy": 0.427032470703125, "epoch": 2.6904761904761907, "grad_norm": 0.8185363544673269, "learning_rate": 6.276476718452289e-06, "loss": 0.4155, "mean_token_accuracy": 0.853483980987221, "num_tokens": 291599836.0, "step": 339 }, { "entropy": 0.417877197265625, "epoch": 2.6984126984126986, "grad_norm": 0.837427213509895, "learning_rate": 6.2553121292318595e-06, "loss": 0.4211, "mean_token_accuracy": 0.8524906514212489, "num_tokens": 292454972.0, "step": 340 }, { "entropy": 0.42510986328125, "epoch": 2.7063492063492065, "grad_norm": 0.8135990341026819, "learning_rate": 6.23412350734884e-06, "loss": 0.4166, "mean_token_accuracy": 0.852956528775394, "num_tokens": 293307675.0, "step": 341 }, { "entropy": 0.4229583740234375, "epoch": 2.7142857142857144, "grad_norm": 0.7369881660528143, "learning_rate": 6.2129112584545325e-06, "loss": 0.4144, "mean_token_accuracy": 0.8540790337137878, "num_tokens": 294149752.0, "step": 342 }, { "entropy": 0.4259033203125, "epoch": 2.7222222222222223, "grad_norm": 0.8315573451881167, "learning_rate": 6.191675788652574e-06, "loss": 0.4017, "mean_token_accuracy": 0.8583689746446908, "num_tokens": 294975614.0, "step": 343 }, { "entropy": 0.416900634765625, "epoch": 2.7301587301587302, "grad_norm": 0.8638440384540704, "learning_rate": 6.170417504491157e-06, "loss": 0.4147, "mean_token_accuracy": 0.854499620385468, "num_tokens": 295846874.0, "step": 344 }, { "entropy": 0.4163665771484375, "epoch": 2.738095238095238, "grad_norm": 0.8116865889754844, "learning_rate": 6.149136812955256e-06, "loss": 0.4166, "mean_token_accuracy": 0.8544518309645355, "num_tokens": 296730922.0, "step": 345 }, { "entropy": 0.41357421875, "epoch": 2.746031746031746, "grad_norm": 0.7806791564546498, "learning_rate": 6.1278341214588255e-06, "loss": 0.4101, "mean_token_accuracy": 0.8577063884586096, "num_tokens": 297610941.0, "step": 346 }, { "entropy": 0.4183197021484375, "epoch": 2.753968253968254, "grad_norm": 0.8686079824008746, "learning_rate": 6.106509837837004e-06, "loss": 0.412, "mean_token_accuracy": 0.8529722727835178, "num_tokens": 298464464.0, "step": 347 }, { "entropy": 0.4134063720703125, "epoch": 2.761904761904762, "grad_norm": 0.8287811327498212, "learning_rate": 6.0851643703383066e-06, "loss": 0.407, "mean_token_accuracy": 0.8568426473066211, "num_tokens": 299315956.0, "step": 348 }, { "entropy": 0.420440673828125, "epoch": 2.7698412698412698, "grad_norm": 0.7606321520792506, "learning_rate": 6.063798127616811e-06, "loss": 0.4129, "mean_token_accuracy": 0.8552384455688298, "num_tokens": 300162540.0, "step": 349 }, { "entropy": 0.422515869140625, "epoch": 2.7777777777777777, "grad_norm": 0.7256068297614475, "learning_rate": 6.042411518724327e-06, "loss": 0.41, "mean_token_accuracy": 0.8559038788080215, "num_tokens": 301009855.0, "step": 350 }, { "entropy": 0.42303466796875, "epoch": 2.7857142857142856, "grad_norm": 0.7634303802543713, "learning_rate": 6.021004953102576e-06, "loss": 0.4039, "mean_token_accuracy": 0.8571093692444265, "num_tokens": 301852351.0, "step": 351 }, { "entropy": 0.4193878173828125, "epoch": 2.7936507936507935, "grad_norm": 0.7645192522691564, "learning_rate": 5.999578840575342e-06, "loss": 0.4046, "mean_token_accuracy": 0.8566430397331715, "num_tokens": 302702189.0, "step": 352 }, { "entropy": 0.4232940673828125, "epoch": 2.8015873015873014, "grad_norm": 0.794739166753094, "learning_rate": 5.978133591340633e-06, "loss": 0.4091, "mean_token_accuracy": 0.8565059076063335, "num_tokens": 303546117.0, "step": 353 }, { "entropy": 0.4161529541015625, "epoch": 2.8095238095238093, "grad_norm": 0.8805512544331933, "learning_rate": 5.956669615962821e-06, "loss": 0.413, "mean_token_accuracy": 0.8556133066304028, "num_tokens": 304424136.0, "step": 354 }, { "entropy": 0.41705322265625, "epoch": 2.817460317460317, "grad_norm": 0.7877254936944273, "learning_rate": 5.935187325364791e-06, "loss": 0.42, "mean_token_accuracy": 0.8545625568367541, "num_tokens": 305299176.0, "step": 355 }, { "entropy": 0.4157562255859375, "epoch": 2.825396825396825, "grad_norm": 0.8062504809460449, "learning_rate": 5.913687130820064e-06, "loss": 0.4104, "mean_token_accuracy": 0.8556024674326181, "num_tokens": 306180918.0, "step": 356 }, { "entropy": 0.41650390625, "epoch": 2.8333333333333335, "grad_norm": 0.7092100136349762, "learning_rate": 5.892169443944929e-06, "loss": 0.4151, "mean_token_accuracy": 0.8552160942927003, "num_tokens": 307053855.0, "step": 357 }, { "entropy": 0.4198150634765625, "epoch": 2.8412698412698414, "grad_norm": 0.8020397673815377, "learning_rate": 5.870634676690564e-06, "loss": 0.414, "mean_token_accuracy": 0.8550357166677713, "num_tokens": 307908233.0, "step": 358 }, { "entropy": 0.419891357421875, "epoch": 2.8492063492063493, "grad_norm": 0.8184927667236647, "learning_rate": 5.8490832413351465e-06, "loss": 0.406, "mean_token_accuracy": 0.8566894140094519, "num_tokens": 308765267.0, "step": 359 }, { "entropy": 0.4268646240234375, "epoch": 2.857142857142857, "grad_norm": 0.7696962615494287, "learning_rate": 5.827515550475955e-06, "loss": 0.4112, "mean_token_accuracy": 0.8539468543604016, "num_tokens": 309586897.0, "step": 360 }, { "entropy": 0.4179840087890625, "epoch": 2.865079365079365, "grad_norm": 0.7851245206394726, "learning_rate": 5.805932017021486e-06, "loss": 0.4116, "mean_token_accuracy": 0.8549962108954787, "num_tokens": 310440896.0, "step": 361 }, { "entropy": 0.4180450439453125, "epoch": 2.873015873015873, "grad_norm": 0.7806163849576252, "learning_rate": 5.784333054183533e-06, "loss": 0.4069, "mean_token_accuracy": 0.8565008505247533, "num_tokens": 311297562.0, "step": 362 }, { "entropy": 0.4127197265625, "epoch": 2.880952380952381, "grad_norm": 0.754556378509014, "learning_rate": 5.762719075469277e-06, "loss": 0.4155, "mean_token_accuracy": 0.8560635317116976, "num_tokens": 312189513.0, "step": 363 }, { "entropy": 0.41400146484375, "epoch": 2.888888888888889, "grad_norm": 0.833131948010438, "learning_rate": 5.741090494673386e-06, "loss": 0.4098, "mean_token_accuracy": 0.8566766679286957, "num_tokens": 313055977.0, "step": 364 }, { "entropy": 0.4152984619140625, "epoch": 2.8968253968253967, "grad_norm": 0.8035675568742273, "learning_rate": 5.719447725870071e-06, "loss": 0.417, "mean_token_accuracy": 0.8535463376902044, "num_tokens": 313934488.0, "step": 365 }, { "entropy": 0.41644287109375, "epoch": 2.9047619047619047, "grad_norm": 0.809344160354769, "learning_rate": 5.697791183405174e-06, "loss": 0.4123, "mean_token_accuracy": 0.8555832463316619, "num_tokens": 314782888.0, "step": 366 }, { "entropy": 0.4123077392578125, "epoch": 2.9126984126984126, "grad_norm": 0.7542255175949691, "learning_rate": 5.67612128188823e-06, "loss": 0.4042, "mean_token_accuracy": 0.8586938725784421, "num_tokens": 315667111.0, "step": 367 }, { "entropy": 0.418426513671875, "epoch": 2.9206349206349205, "grad_norm": 0.7325186075881142, "learning_rate": 5.654438436184531e-06, "loss": 0.41, "mean_token_accuracy": 0.8550154692493379, "num_tokens": 316519645.0, "step": 368 }, { "entropy": 0.4178619384765625, "epoch": 2.928571428571429, "grad_norm": 0.764847574977915, "learning_rate": 5.6327430614071794e-06, "loss": 0.409, "mean_token_accuracy": 0.8574743596836925, "num_tokens": 317376914.0, "step": 369 }, { "entropy": 0.4179229736328125, "epoch": 2.9365079365079367, "grad_norm": 0.7979585773178869, "learning_rate": 5.611035572909147e-06, "loss": 0.4116, "mean_token_accuracy": 0.8546005864627659, "num_tokens": 318210944.0, "step": 370 }, { "entropy": 0.411712646484375, "epoch": 2.9444444444444446, "grad_norm": 0.7465872378991787, "learning_rate": 5.589316386275318e-06, "loss": 0.4127, "mean_token_accuracy": 0.8551405002363026, "num_tokens": 319072977.0, "step": 371 }, { "entropy": 0.413848876953125, "epoch": 2.9523809523809526, "grad_norm": 0.7670391280421824, "learning_rate": 5.567585917314535e-06, "loss": 0.4085, "mean_token_accuracy": 0.8564633526839316, "num_tokens": 319936836.0, "step": 372 }, { "entropy": 0.41168212890625, "epoch": 2.9603174603174605, "grad_norm": 0.8099483587987164, "learning_rate": 5.545844582051641e-06, "loss": 0.4053, "mean_token_accuracy": 0.8578686797991395, "num_tokens": 320807541.0, "step": 373 }, { "entropy": 0.4144439697265625, "epoch": 2.9682539682539684, "grad_norm": 0.8134336846221772, "learning_rate": 5.524092796719507e-06, "loss": 0.4096, "mean_token_accuracy": 0.8564304136671126, "num_tokens": 321676330.0, "step": 374 }, { "entropy": 0.41241455078125, "epoch": 2.9761904761904763, "grad_norm": 0.7501989763747119, "learning_rate": 5.502330977751072e-06, "loss": 0.4012, "mean_token_accuracy": 0.8606314528733492, "num_tokens": 322526195.0, "step": 375 }, { "entropy": 0.4184417724609375, "epoch": 2.984126984126984, "grad_norm": 0.8084127993444857, "learning_rate": 5.4805595417713634e-06, "loss": 0.4129, "mean_token_accuracy": 0.854987567756325, "num_tokens": 323373321.0, "step": 376 }, { "entropy": 0.413482666015625, "epoch": 2.992063492063492, "grad_norm": 0.8921476455980862, "learning_rate": 5.458778905589528e-06, "loss": 0.4048, "mean_token_accuracy": 0.8568636071868241, "num_tokens": 324241487.0, "step": 377 }, { "entropy": 0.413818359375, "epoch": 3.0, "grad_norm": 0.7212429646275152, "learning_rate": 5.436989486190846e-06, "loss": 0.4132, "mean_token_accuracy": 0.8552796910516918, "num_tokens": 325114310.0, "step": 378 }, { "entropy": 0.41851806640625, "epoch": 3.007936507936508, "grad_norm": 0.8309684413468622, "learning_rate": 5.415191700728749e-06, "loss": 0.3803, "mean_token_accuracy": 0.8651906503364444, "num_tokens": 325956929.0, "step": 379 }, { "entropy": 0.415771484375, "epoch": 3.015873015873016, "grad_norm": 0.875627308634879, "learning_rate": 5.393385966516838e-06, "loss": 0.3949, "mean_token_accuracy": 0.8609235784970224, "num_tokens": 326825247.0, "step": 380 }, { "entropy": 0.4132232666015625, "epoch": 3.0238095238095237, "grad_norm": 0.7660187349203336, "learning_rate": 5.371572701020891e-06, "loss": 0.3843, "mean_token_accuracy": 0.865902341902256, "num_tokens": 327664768.0, "step": 381 }, { "entropy": 0.4121246337890625, "epoch": 3.0317460317460316, "grad_norm": 0.876168435455912, "learning_rate": 5.349752321850866e-06, "loss": 0.3891, "mean_token_accuracy": 0.8622553567402065, "num_tokens": 328521474.0, "step": 382 }, { "entropy": 0.4065399169921875, "epoch": 3.0396825396825395, "grad_norm": 0.8349975533992948, "learning_rate": 5.327925246752917e-06, "loss": 0.3871, "mean_token_accuracy": 0.8634061855264008, "num_tokens": 329375199.0, "step": 383 }, { "entropy": 0.41015625, "epoch": 3.0476190476190474, "grad_norm": 0.7379532101462078, "learning_rate": 5.306091893601384e-06, "loss": 0.3854, "mean_token_accuracy": 0.8652450819499791, "num_tokens": 330238541.0, "step": 384 }, { "entropy": 0.4120330810546875, "epoch": 3.0555555555555554, "grad_norm": 0.7995763856973052, "learning_rate": 5.284252680390803e-06, "loss": 0.3919, "mean_token_accuracy": 0.8609937699511647, "num_tokens": 331111401.0, "step": 385 }, { "entropy": 0.412261962890625, "epoch": 3.0634920634920633, "grad_norm": 0.7968948726175992, "learning_rate": 5.2624080252279006e-06, "loss": 0.3891, "mean_token_accuracy": 0.8621770567260683, "num_tokens": 331970501.0, "step": 386 }, { "entropy": 0.412689208984375, "epoch": 3.0714285714285716, "grad_norm": 0.7567371102755298, "learning_rate": 5.240558346323582e-06, "loss": 0.388, "mean_token_accuracy": 0.8623263705521822, "num_tokens": 332839444.0, "step": 387 }, { "entropy": 0.4113616943359375, "epoch": 3.0793650793650795, "grad_norm": 0.8253393750303774, "learning_rate": 5.218704061984938e-06, "loss": 0.3805, "mean_token_accuracy": 0.8644652073271573, "num_tokens": 333694157.0, "step": 388 }, { "entropy": 0.4080963134765625, "epoch": 3.0873015873015874, "grad_norm": 0.8150357016122449, "learning_rate": 5.196845590607225e-06, "loss": 0.3778, "mean_token_accuracy": 0.8659757277928293, "num_tokens": 334553848.0, "step": 389 }, { "entropy": 0.411529541015625, "epoch": 3.0952380952380953, "grad_norm": 0.810657705832448, "learning_rate": 5.174983350665861e-06, "loss": 0.3837, "mean_token_accuracy": 0.862535847350955, "num_tokens": 335414382.0, "step": 390 }, { "entropy": 0.4155731201171875, "epoch": 3.1031746031746033, "grad_norm": 0.7578031832169086, "learning_rate": 5.153117760708411e-06, "loss": 0.388, "mean_token_accuracy": 0.8647559527307749, "num_tokens": 336270013.0, "step": 391 }, { "entropy": 0.4129638671875, "epoch": 3.111111111111111, "grad_norm": 0.7853251945488365, "learning_rate": 5.131249239346574e-06, "loss": 0.3874, "mean_token_accuracy": 0.8632673225365579, "num_tokens": 337153945.0, "step": 392 }, { "entropy": 0.41741943359375, "epoch": 3.119047619047619, "grad_norm": 0.8526319008895792, "learning_rate": 5.109378205248177e-06, "loss": 0.3813, "mean_token_accuracy": 0.8653798257000744, "num_tokens": 337986623.0, "step": 393 }, { "entropy": 0.41943359375, "epoch": 3.126984126984127, "grad_norm": 0.7777939267453691, "learning_rate": 5.087505077129144e-06, "loss": 0.3847, "mean_token_accuracy": 0.8638610797934234, "num_tokens": 338820053.0, "step": 394 }, { "entropy": 0.4080352783203125, "epoch": 3.134920634920635, "grad_norm": 0.8050487820823641, "learning_rate": 5.065630273745495e-06, "loss": 0.391, "mean_token_accuracy": 0.8619571630842984, "num_tokens": 339709184.0, "step": 395 }, { "entropy": 0.4158172607421875, "epoch": 3.142857142857143, "grad_norm": 0.788047324475754, "learning_rate": 5.043754213885319e-06, "loss": 0.3806, "mean_token_accuracy": 0.8652480882592499, "num_tokens": 340560422.0, "step": 396 }, { "entropy": 0.4126739501953125, "epoch": 3.1507936507936507, "grad_norm": 0.7895581256609918, "learning_rate": 5.021877316360759e-06, "loss": 0.3857, "mean_token_accuracy": 0.8641035025939345, "num_tokens": 341427547.0, "step": 397 }, { "entropy": 0.4117431640625, "epoch": 3.1587301587301586, "grad_norm": 0.7885237864621762, "learning_rate": 5e-06, "loss": 0.3809, "mean_token_accuracy": 0.8656587679870427, "num_tokens": 342297536.0, "step": 398 }, { "entropy": 0.42266845703125, "epoch": 3.1666666666666665, "grad_norm": 0.7347090872403708, "learning_rate": 4.978122683639241e-06, "loss": 0.3797, "mean_token_accuracy": 0.8654965776950121, "num_tokens": 343108738.0, "step": 399 }, { "entropy": 0.41461181640625, "epoch": 3.1746031746031744, "grad_norm": 0.7919607175007582, "learning_rate": 4.956245786114683e-06, "loss": 0.3805, "mean_token_accuracy": 0.8652189085260034, "num_tokens": 343963294.0, "step": 400 }, { "entropy": 0.4164276123046875, "epoch": 3.1825396825396823, "grad_norm": 0.7982890026738051, "learning_rate": 4.934369726254506e-06, "loss": 0.3839, "mean_token_accuracy": 0.8629119992256165, "num_tokens": 344812036.0, "step": 401 }, { "entropy": 0.4126739501953125, "epoch": 3.1904761904761907, "grad_norm": 0.7923716094274679, "learning_rate": 4.9124949228708566e-06, "loss": 0.385, "mean_token_accuracy": 0.8645118903368711, "num_tokens": 345643006.0, "step": 402 }, { "entropy": 0.41131591796875, "epoch": 3.1984126984126986, "grad_norm": 0.8224201397236163, "learning_rate": 4.890621794751825e-06, "loss": 0.3781, "mean_token_accuracy": 0.8670813706703484, "num_tokens": 346526093.0, "step": 403 }, { "entropy": 0.4088134765625, "epoch": 3.2063492063492065, "grad_norm": 0.748278286726127, "learning_rate": 4.8687507606534274e-06, "loss": 0.3869, "mean_token_accuracy": 0.8644262808375061, "num_tokens": 347429334.0, "step": 404 }, { "entropy": 0.4080810546875, "epoch": 3.2142857142857144, "grad_norm": 0.7655034553814312, "learning_rate": 4.8468822392915925e-06, "loss": 0.3879, "mean_token_accuracy": 0.86290636472404, "num_tokens": 348300345.0, "step": 405 }, { "entropy": 0.41357421875, "epoch": 3.2222222222222223, "grad_norm": 0.8371130749450066, "learning_rate": 4.82501664933414e-06, "loss": 0.3895, "mean_token_accuracy": 0.8630081634037197, "num_tokens": 349174728.0, "step": 406 }, { "entropy": 0.414794921875, "epoch": 3.2301587301587302, "grad_norm": 0.7863015466902707, "learning_rate": 4.803154409392776e-06, "loss": 0.3827, "mean_token_accuracy": 0.864469132386148, "num_tokens": 350019662.0, "step": 407 }, { "entropy": 0.411102294921875, "epoch": 3.238095238095238, "grad_norm": 0.7465717328173177, "learning_rate": 4.781295938015063e-06, "loss": 0.3831, "mean_token_accuracy": 0.8645726442337036, "num_tokens": 350867252.0, "step": 408 }, { "entropy": 0.41094970703125, "epoch": 3.246031746031746, "grad_norm": 0.7655319214119843, "learning_rate": 4.759441653676419e-06, "loss": 0.3788, "mean_token_accuracy": 0.8645767103880644, "num_tokens": 351713015.0, "step": 409 }, { "entropy": 0.4076080322265625, "epoch": 3.253968253968254, "grad_norm": 0.7922787284316188, "learning_rate": 4.737591974772102e-06, "loss": 0.383, "mean_token_accuracy": 0.8641512831673026, "num_tokens": 352612442.0, "step": 410 }, { "entropy": 0.41400146484375, "epoch": 3.261904761904762, "grad_norm": 0.7897648138284737, "learning_rate": 4.715747319609199e-06, "loss": 0.3808, "mean_token_accuracy": 0.8653241745196283, "num_tokens": 353475199.0, "step": 411 }, { "entropy": 0.4135589599609375, "epoch": 3.2698412698412698, "grad_norm": 0.7900462489736286, "learning_rate": 4.693908106398617e-06, "loss": 0.3805, "mean_token_accuracy": 0.8655834072269499, "num_tokens": 354336321.0, "step": 412 }, { "entropy": 0.413421630859375, "epoch": 3.2777777777777777, "grad_norm": 0.8361561340006152, "learning_rate": 4.6720747532470845e-06, "loss": 0.3909, "mean_token_accuracy": 0.8628287450410426, "num_tokens": 355204238.0, "step": 413 }, { "entropy": 0.41632080078125, "epoch": 3.2857142857142856, "grad_norm": 0.7192804861618904, "learning_rate": 4.650247678149135e-06, "loss": 0.3822, "mean_token_accuracy": 0.8637247635051608, "num_tokens": 356039432.0, "step": 414 }, { "entropy": 0.417022705078125, "epoch": 3.2936507936507935, "grad_norm": 0.7884051950710803, "learning_rate": 4.628427298979111e-06, "loss": 0.3834, "mean_token_accuracy": 0.8649981459602714, "num_tokens": 356872476.0, "step": 415 }, { "entropy": 0.4183197021484375, "epoch": 3.3015873015873014, "grad_norm": 0.7660377464634954, "learning_rate": 4.606614033483164e-06, "loss": 0.3733, "mean_token_accuracy": 0.8667362979613245, "num_tokens": 357702880.0, "step": 416 }, { "entropy": 0.4095458984375, "epoch": 3.3095238095238093, "grad_norm": 1.502576186555255, "learning_rate": 4.5848082992712516e-06, "loss": 0.3851, "mean_token_accuracy": 0.8638884532265365, "num_tokens": 358593212.0, "step": 417 }, { "entropy": 0.411895751953125, "epoch": 3.317460317460317, "grad_norm": 0.8516989355397148, "learning_rate": 4.563010513809156e-06, "loss": 0.374, "mean_token_accuracy": 0.8689160253852606, "num_tokens": 359427196.0, "step": 418 }, { "entropy": 0.415771484375, "epoch": 3.3253968253968256, "grad_norm": 0.8333941577643297, "learning_rate": 4.541221094410473e-06, "loss": 0.3886, "mean_token_accuracy": 0.8632900207303464, "num_tokens": 360297417.0, "step": 419 }, { "entropy": 0.4151153564453125, "epoch": 3.3333333333333335, "grad_norm": 0.9185443535766896, "learning_rate": 4.519440458228638e-06, "loss": 0.3929, "mean_token_accuracy": 0.8609873973764479, "num_tokens": 361158991.0, "step": 420 }, { "entropy": 0.407745361328125, "epoch": 3.3412698412698414, "grad_norm": 0.7923581076176655, "learning_rate": 4.497669022248931e-06, "loss": 0.3768, "mean_token_accuracy": 0.86583196464926, "num_tokens": 362026434.0, "step": 421 }, { "entropy": 0.4108734130859375, "epoch": 3.3492063492063493, "grad_norm": 0.8878219569934822, "learning_rate": 4.475907203280494e-06, "loss": 0.3874, "mean_token_accuracy": 0.862205957993865, "num_tokens": 362894932.0, "step": 422 }, { "entropy": 0.4105682373046875, "epoch": 3.357142857142857, "grad_norm": 0.8849923219918312, "learning_rate": 4.45415541794836e-06, "loss": 0.3861, "mean_token_accuracy": 0.8655079673044384, "num_tokens": 363774304.0, "step": 423 }, { "entropy": 0.415740966796875, "epoch": 3.365079365079365, "grad_norm": 0.7854653600781764, "learning_rate": 4.432414082685466e-06, "loss": 0.3759, "mean_token_accuracy": 0.8663219287991524, "num_tokens": 364624495.0, "step": 424 }, { "entropy": 0.4079132080078125, "epoch": 3.373015873015873, "grad_norm": 0.7932914584936744, "learning_rate": 4.410683613724684e-06, "loss": 0.3827, "mean_token_accuracy": 0.8652459299191833, "num_tokens": 365522341.0, "step": 425 }, { "entropy": 0.4139251708984375, "epoch": 3.380952380952381, "grad_norm": 0.8623623236366137, "learning_rate": 4.388964427090855e-06, "loss": 0.3818, "mean_token_accuracy": 0.8645171159878373, "num_tokens": 366384352.0, "step": 426 }, { "entropy": 0.406494140625, "epoch": 3.388888888888889, "grad_norm": 0.8905833082641034, "learning_rate": 4.367256938592822e-06, "loss": 0.3883, "mean_token_accuracy": 0.8634491441771388, "num_tokens": 367231565.0, "step": 427 }, { "entropy": 0.409393310546875, "epoch": 3.3968253968253967, "grad_norm": 0.8362209111342414, "learning_rate": 4.345561563815471e-06, "loss": 0.3722, "mean_token_accuracy": 0.8681328790262341, "num_tokens": 368105765.0, "step": 428 }, { "entropy": 0.4090118408203125, "epoch": 3.4047619047619047, "grad_norm": 0.7875786753075623, "learning_rate": 4.323878718111771e-06, "loss": 0.3815, "mean_token_accuracy": 0.8649689424782991, "num_tokens": 368976566.0, "step": 429 }, { "entropy": 0.4091339111328125, "epoch": 3.4126984126984126, "grad_norm": 0.8242239527343677, "learning_rate": 4.302208816594829e-06, "loss": 0.3775, "mean_token_accuracy": 0.8660351554863155, "num_tokens": 369847661.0, "step": 430 }, { "entropy": 0.4122161865234375, "epoch": 3.4206349206349205, "grad_norm": 0.8253724820932592, "learning_rate": 4.280552274129932e-06, "loss": 0.3832, "mean_token_accuracy": 0.8640619218349457, "num_tokens": 370716519.0, "step": 431 }, { "entropy": 0.4095916748046875, "epoch": 3.4285714285714284, "grad_norm": 0.7522197274851726, "learning_rate": 4.258909505326617e-06, "loss": 0.3747, "mean_token_accuracy": 0.8674253430217505, "num_tokens": 371548994.0, "step": 432 }, { "entropy": 0.4106597900390625, "epoch": 3.4365079365079367, "grad_norm": 0.7580899790250368, "learning_rate": 4.237280924530723e-06, "loss": 0.3731, "mean_token_accuracy": 0.8671010048128664, "num_tokens": 372388644.0, "step": 433 }, { "entropy": 0.4091644287109375, "epoch": 3.4444444444444446, "grad_norm": 0.7966902079320883, "learning_rate": 4.215666945816469e-06, "loss": 0.3824, "mean_token_accuracy": 0.8636285294778645, "num_tokens": 373216255.0, "step": 434 }, { "entropy": 0.4067840576171875, "epoch": 3.4523809523809526, "grad_norm": 0.7259667199168536, "learning_rate": 4.194067982978516e-06, "loss": 0.3744, "mean_token_accuracy": 0.8671418204903603, "num_tokens": 374091119.0, "step": 435 }, { "entropy": 0.4103546142578125, "epoch": 3.4603174603174605, "grad_norm": 0.7168906340219251, "learning_rate": 4.172484449524047e-06, "loss": 0.3806, "mean_token_accuracy": 0.8649011980742216, "num_tokens": 374964989.0, "step": 436 }, { "entropy": 0.4099578857421875, "epoch": 3.4682539682539684, "grad_norm": 0.7226054435885343, "learning_rate": 4.150916758664857e-06, "loss": 0.3743, "mean_token_accuracy": 0.86769935535267, "num_tokens": 375849312.0, "step": 437 }, { "entropy": 0.4129791259765625, "epoch": 3.4761904761904763, "grad_norm": 0.7254203102984561, "learning_rate": 4.129365323309436e-06, "loss": 0.3886, "mean_token_accuracy": 0.8636708622798324, "num_tokens": 376711755.0, "step": 438 }, { "entropy": 0.4055328369140625, "epoch": 3.484126984126984, "grad_norm": 0.7258886081350053, "learning_rate": 4.107830556055072e-06, "loss": 0.377, "mean_token_accuracy": 0.865284236613661, "num_tokens": 377601829.0, "step": 439 }, { "entropy": 0.4116363525390625, "epoch": 3.492063492063492, "grad_norm": 0.7385419803324385, "learning_rate": 4.086312869179938e-06, "loss": 0.3811, "mean_token_accuracy": 0.8655071114189923, "num_tokens": 378449007.0, "step": 440 }, { "entropy": 0.409393310546875, "epoch": 3.5, "grad_norm": 0.7975589365886925, "learning_rate": 4.06481267463521e-06, "loss": 0.3746, "mean_token_accuracy": 0.867948766797781, "num_tokens": 379309394.0, "step": 441 }, { "entropy": 0.4044036865234375, "epoch": 3.507936507936508, "grad_norm": 0.7484471478807218, "learning_rate": 4.04333038403718e-06, "loss": 0.3755, "mean_token_accuracy": 0.8677190546877682, "num_tokens": 380174726.0, "step": 442 }, { "entropy": 0.406707763671875, "epoch": 3.515873015873016, "grad_norm": 0.7052307989651647, "learning_rate": 4.021866408659368e-06, "loss": 0.3766, "mean_token_accuracy": 0.8668166692368686, "num_tokens": 381047802.0, "step": 443 }, { "entropy": 0.4095001220703125, "epoch": 3.5238095238095237, "grad_norm": 0.7205324024463486, "learning_rate": 4.000421159424658e-06, "loss": 0.3782, "mean_token_accuracy": 0.8670969372615218, "num_tokens": 381900519.0, "step": 444 }, { "entropy": 0.40911865234375, "epoch": 3.5317460317460316, "grad_norm": 0.6751185639712526, "learning_rate": 3.978995046897425e-06, "loss": 0.3811, "mean_token_accuracy": 0.8652258133515716, "num_tokens": 382738529.0, "step": 445 }, { "entropy": 0.4097747802734375, "epoch": 3.5396825396825395, "grad_norm": 0.8003232347426622, "learning_rate": 3.957588481275674e-06, "loss": 0.3813, "mean_token_accuracy": 0.8646343694999814, "num_tokens": 383603819.0, "step": 446 }, { "entropy": 0.4104156494140625, "epoch": 3.5476190476190474, "grad_norm": 0.7612125218536709, "learning_rate": 3.9362018723831915e-06, "loss": 0.3834, "mean_token_accuracy": 0.8642422612756491, "num_tokens": 384466493.0, "step": 447 }, { "entropy": 0.40789794921875, "epoch": 3.5555555555555554, "grad_norm": 0.7301586930422078, "learning_rate": 3.914835629661695e-06, "loss": 0.3691, "mean_token_accuracy": 0.8685760577209294, "num_tokens": 385303493.0, "step": 448 }, { "entropy": 0.4090576171875, "epoch": 3.5634920634920633, "grad_norm": 0.8028620237168601, "learning_rate": 3.893490162162997e-06, "loss": 0.3772, "mean_token_accuracy": 0.8661560285836458, "num_tokens": 386139059.0, "step": 449 }, { "entropy": 0.4043426513671875, "epoch": 3.571428571428571, "grad_norm": 0.8038126363701456, "learning_rate": 3.872165878541175e-06, "loss": 0.3819, "mean_token_accuracy": 0.8657438950613141, "num_tokens": 387035788.0, "step": 450 }, { "entropy": 0.4053955078125, "epoch": 3.5793650793650795, "grad_norm": 0.7971696805205959, "learning_rate": 3.850863187044745e-06, "loss": 0.3783, "mean_token_accuracy": 0.8658295255154371, "num_tokens": 387893370.0, "step": 451 }, { "entropy": 0.4118804931640625, "epoch": 3.5873015873015874, "grad_norm": 0.7382565780061542, "learning_rate": 3.829582495508844e-06, "loss": 0.3774, "mean_token_accuracy": 0.8669222141616046, "num_tokens": 388743564.0, "step": 452 }, { "entropy": 0.4039764404296875, "epoch": 3.5952380952380953, "grad_norm": 0.762934354969153, "learning_rate": 3.808324211347429e-06, "loss": 0.3846, "mean_token_accuracy": 0.8625594675540924, "num_tokens": 389636739.0, "step": 453 }, { "entropy": 0.4062957763671875, "epoch": 3.6031746031746033, "grad_norm": 0.7869576244237314, "learning_rate": 3.7870887415454687e-06, "loss": 0.3772, "mean_token_accuracy": 0.8657813919708133, "num_tokens": 390510641.0, "step": 454 }, { "entropy": 0.4098968505859375, "epoch": 3.611111111111111, "grad_norm": 0.7351333487875162, "learning_rate": 3.7658764926511613e-06, "loss": 0.3659, "mean_token_accuracy": 0.868850149679929, "num_tokens": 391336036.0, "step": 455 }, { "entropy": 0.40362548828125, "epoch": 3.619047619047619, "grad_norm": 0.734375379903563, "learning_rate": 3.7446878707681413e-06, "loss": 0.373, "mean_token_accuracy": 0.8676298609934747, "num_tokens": 392197671.0, "step": 456 }, { "entropy": 0.4044189453125, "epoch": 3.626984126984127, "grad_norm": 0.8070921691017913, "learning_rate": 3.7235232815477123e-06, "loss": 0.3723, "mean_token_accuracy": 0.8680153395980597, "num_tokens": 393048877.0, "step": 457 }, { "entropy": 0.4051971435546875, "epoch": 3.634920634920635, "grad_norm": 0.7382078569033306, "learning_rate": 3.7023831301810765e-06, "loss": 0.3807, "mean_token_accuracy": 0.8656099583022296, "num_tokens": 393913386.0, "step": 458 }, { "entropy": 0.412567138671875, "epoch": 3.642857142857143, "grad_norm": 0.7500851651198606, "learning_rate": 3.6812678213915777e-06, "loss": 0.3753, "mean_token_accuracy": 0.866928874514997, "num_tokens": 394741069.0, "step": 459 }, { "entropy": 0.4068450927734375, "epoch": 3.6507936507936507, "grad_norm": 0.7014301015459137, "learning_rate": 3.6601777594269605e-06, "loss": 0.3716, "mean_token_accuracy": 0.8681608587503433, "num_tokens": 395587154.0, "step": 460 }, { "entropy": 0.4112701416015625, "epoch": 3.6587301587301586, "grad_norm": 0.7453844502592517, "learning_rate": 3.6391133480516196e-06, "loss": 0.37, "mean_token_accuracy": 0.8680603308603168, "num_tokens": 396429106.0, "step": 461 }, { "entropy": 0.40789794921875, "epoch": 3.6666666666666665, "grad_norm": 0.6714935595868241, "learning_rate": 3.618074990538873e-06, "loss": 0.3764, "mean_token_accuracy": 0.8662193124182522, "num_tokens": 397306106.0, "step": 462 }, { "entropy": 0.404541015625, "epoch": 3.674603174603175, "grad_norm": 0.7264996158823219, "learning_rate": 3.5970630896632485e-06, "loss": 0.3679, "mean_token_accuracy": 0.8692012121900916, "num_tokens": 398186044.0, "step": 463 }, { "entropy": 0.4099273681640625, "epoch": 3.682539682539683, "grad_norm": 0.6812527161900478, "learning_rate": 3.5760780476927637e-06, "loss": 0.3783, "mean_token_accuracy": 0.866292960010469, "num_tokens": 399059367.0, "step": 464 }, { "entropy": 0.4114532470703125, "epoch": 3.6904761904761907, "grad_norm": 0.7396202860678279, "learning_rate": 3.5551202663812344e-06, "loss": 0.3671, "mean_token_accuracy": 0.8694238997995853, "num_tokens": 399921480.0, "step": 465 }, { "entropy": 0.41082763671875, "epoch": 3.6984126984126986, "grad_norm": 0.7505615061152298, "learning_rate": 3.534190146960571e-06, "loss": 0.3738, "mean_token_accuracy": 0.8666451787576079, "num_tokens": 400768716.0, "step": 466 }, { "entropy": 0.407745361328125, "epoch": 3.7063492063492065, "grad_norm": 0.7716901466331328, "learning_rate": 3.5132880901331067e-06, "loss": 0.3836, "mean_token_accuracy": 0.8653549118898809, "num_tokens": 401643010.0, "step": 467 }, { "entropy": 0.411224365234375, "epoch": 3.7142857142857144, "grad_norm": 0.7425696552972633, "learning_rate": 3.492414496063921e-06, "loss": 0.3699, "mean_token_accuracy": 0.8682533628307283, "num_tokens": 402482222.0, "step": 468 }, { "entropy": 0.4105377197265625, "epoch": 3.7222222222222223, "grad_norm": 0.7549782563712677, "learning_rate": 3.4715697643731828e-06, "loss": 0.374, "mean_token_accuracy": 0.8664119308814406, "num_tokens": 403330184.0, "step": 469 }, { "entropy": 0.4114990234375, "epoch": 3.7301587301587302, "grad_norm": 0.6882214771400156, "learning_rate": 3.4507542941284933e-06, "loss": 0.3772, "mean_token_accuracy": 0.8662938089109957, "num_tokens": 404170985.0, "step": 470 }, { "entropy": 0.4065399169921875, "epoch": 3.738095238095238, "grad_norm": 0.7790709755576551, "learning_rate": 3.4299684838372547e-06, "loss": 0.3702, "mean_token_accuracy": 0.8684107572771609, "num_tokens": 405023111.0, "step": 471 }, { "entropy": 0.4109344482421875, "epoch": 3.746031746031746, "grad_norm": 0.8999085919269414, "learning_rate": 3.4092127314390354e-06, "loss": 0.3733, "mean_token_accuracy": 0.8679695804603398, "num_tokens": 405909984.0, "step": 472 }, { "entropy": 0.410552978515625, "epoch": 3.753968253968254, "grad_norm": 0.7852309398323011, "learning_rate": 3.388487434297949e-06, "loss": 0.3726, "mean_token_accuracy": 0.868429503403604, "num_tokens": 406762973.0, "step": 473 }, { "entropy": 0.4063873291015625, "epoch": 3.761904761904762, "grad_norm": 0.7497329657708961, "learning_rate": 3.3677929891950527e-06, "loss": 0.3675, "mean_token_accuracy": 0.8680104180239141, "num_tokens": 407632013.0, "step": 474 }, { "entropy": 0.4088287353515625, "epoch": 3.7698412698412698, "grad_norm": 0.9959878902569155, "learning_rate": 3.347129792320748e-06, "loss": 0.3803, "mean_token_accuracy": 0.8661512886174023, "num_tokens": 408479038.0, "step": 475 }, { "entropy": 0.400726318359375, "epoch": 3.7777777777777777, "grad_norm": 0.7083662937552805, "learning_rate": 3.3264982392671973e-06, "loss": 0.3707, "mean_token_accuracy": 0.8689758381806314, "num_tokens": 409381423.0, "step": 476 }, { "entropy": 0.4104156494140625, "epoch": 3.7857142857142856, "grad_norm": 0.7381897728110274, "learning_rate": 3.3058987250207476e-06, "loss": 0.3677, "mean_token_accuracy": 0.869597565382719, "num_tokens": 410237991.0, "step": 477 }, { "entropy": 0.4114227294921875, "epoch": 3.7936507936507935, "grad_norm": 0.7658185988150478, "learning_rate": 3.285331643954372e-06, "loss": 0.3779, "mean_token_accuracy": 0.8658644729293883, "num_tokens": 411113505.0, "step": 478 }, { "entropy": 0.4087371826171875, "epoch": 3.8015873015873014, "grad_norm": 0.6773211053527161, "learning_rate": 3.2647973898201157e-06, "loss": 0.3692, "mean_token_accuracy": 0.8686946122907102, "num_tokens": 411973035.0, "step": 479 }, { "entropy": 0.41082763671875, "epoch": 3.8095238095238093, "grad_norm": 0.6795058958160979, "learning_rate": 3.244296355741561e-06, "loss": 0.3792, "mean_token_accuracy": 0.864706945605576, "num_tokens": 412804401.0, "step": 480 }, { "entropy": 0.411712646484375, "epoch": 3.817460317460317, "grad_norm": 0.7172393778753827, "learning_rate": 3.2238289342063013e-06, "loss": 0.3741, "mean_token_accuracy": 0.8672458734363317, "num_tokens": 413645879.0, "step": 481 }, { "entropy": 0.40325927734375, "epoch": 3.825396825396825, "grad_norm": 0.7137250398041949, "learning_rate": 3.203395517058423e-06, "loss": 0.3815, "mean_token_accuracy": 0.8636117246933281, "num_tokens": 414526310.0, "step": 482 }, { "entropy": 0.40850830078125, "epoch": 3.8333333333333335, "grad_norm": 0.7060925413908444, "learning_rate": 3.1829964954910076e-06, "loss": 0.3744, "mean_token_accuracy": 0.8668604497797787, "num_tokens": 415386155.0, "step": 483 }, { "entropy": 0.4058837890625, "epoch": 3.8412698412698414, "grad_norm": 0.7071475776511494, "learning_rate": 3.1626322600386418e-06, "loss": 0.369, "mean_token_accuracy": 0.8673932519741356, "num_tokens": 416255235.0, "step": 484 }, { "entropy": 0.4070587158203125, "epoch": 3.8492063492063493, "grad_norm": 0.716470253050545, "learning_rate": 3.1423032005699377e-06, "loss": 0.3776, "mean_token_accuracy": 0.8670607698149979, "num_tokens": 417123335.0, "step": 485 }, { "entropy": 0.4124298095703125, "epoch": 3.857142857142857, "grad_norm": 0.7331008247958041, "learning_rate": 3.122009706280072e-06, "loss": 0.3725, "mean_token_accuracy": 0.8690087418071926, "num_tokens": 417961712.0, "step": 486 }, { "entropy": 0.407073974609375, "epoch": 3.865079365079365, "grad_norm": 0.7215581902390997, "learning_rate": 3.1017521656833384e-06, "loss": 0.3738, "mean_token_accuracy": 0.8678515437059104, "num_tokens": 418833226.0, "step": 487 }, { "entropy": 0.4102020263671875, "epoch": 3.873015873015873, "grad_norm": 0.6750907563285418, "learning_rate": 3.0815309666057013e-06, "loss": 0.3798, "mean_token_accuracy": 0.8680808427743614, "num_tokens": 419693605.0, "step": 488 }, { "entropy": 0.4107666015625, "epoch": 3.880952380952381, "grad_norm": 0.7253529027679292, "learning_rate": 3.061346496177374e-06, "loss": 0.3765, "mean_token_accuracy": 0.8668225076980889, "num_tokens": 420533126.0, "step": 489 }, { "entropy": 0.403076171875, "epoch": 3.888888888888889, "grad_norm": 0.774847139075562, "learning_rate": 3.0411991408254116e-06, "loss": 0.3734, "mean_token_accuracy": 0.8675551642663777, "num_tokens": 421408158.0, "step": 490 }, { "entropy": 0.407684326171875, "epoch": 3.8968253968253967, "grad_norm": 0.7430473963918918, "learning_rate": 3.0210892862663043e-06, "loss": 0.3669, "mean_token_accuracy": 0.8696022001095116, "num_tokens": 422285612.0, "step": 491 }, { "entropy": 0.40545654296875, "epoch": 3.9047619047619047, "grad_norm": 0.7359917176077896, "learning_rate": 3.001017317498607e-06, "loss": 0.3654, "mean_token_accuracy": 0.8683894919231534, "num_tokens": 423145278.0, "step": 492 }, { "entropy": 0.4095001220703125, "epoch": 3.9126984126984126, "grad_norm": 0.6995695516835777, "learning_rate": 2.9809836187955532e-06, "loss": 0.3759, "mean_token_accuracy": 0.8677502269856632, "num_tokens": 423982368.0, "step": 493 }, { "entropy": 0.4052734375, "epoch": 3.9206349206349205, "grad_norm": 0.7865083776711185, "learning_rate": 2.960988573697705e-06, "loss": 0.3769, "mean_token_accuracy": 0.8668651487678289, "num_tokens": 424855995.0, "step": 494 }, { "entropy": 0.4091949462890625, "epoch": 3.928571428571429, "grad_norm": 0.7069460296016946, "learning_rate": 2.941032565005613e-06, "loss": 0.3734, "mean_token_accuracy": 0.8668679501861334, "num_tokens": 425714116.0, "step": 495 }, { "entropy": 0.4059906005859375, "epoch": 3.9365079365079367, "grad_norm": 0.7074879125016784, "learning_rate": 2.9211159747724813e-06, "loss": 0.3702, "mean_token_accuracy": 0.86768330167979, "num_tokens": 426587317.0, "step": 496 }, { "entropy": 0.4085845947265625, "epoch": 3.9444444444444446, "grad_norm": 0.6887885189112549, "learning_rate": 2.90123918429686e-06, "loss": 0.3693, "mean_token_accuracy": 0.8686679415404797, "num_tokens": 427459701.0, "step": 497 }, { "entropy": 0.4090728759765625, "epoch": 3.9523809523809526, "grad_norm": 0.7038820990298879, "learning_rate": 2.881402574115344e-06, "loss": 0.3693, "mean_token_accuracy": 0.869040944147855, "num_tokens": 428310157.0, "step": 498 }, { "entropy": 0.406158447265625, "epoch": 3.9603174603174605, "grad_norm": 0.7223999433933582, "learning_rate": 2.8616065239952763e-06, "loss": 0.3706, "mean_token_accuracy": 0.8684421242214739, "num_tokens": 429155049.0, "step": 499 }, { "entropy": 0.4044036865234375, "epoch": 3.9682539682539684, "grad_norm": 0.7016426279147887, "learning_rate": 2.841851412927495e-06, "loss": 0.3706, "mean_token_accuracy": 0.8662050706334412, "num_tokens": 430044250.0, "step": 500 }, { "entropy": 0.405242919921875, "epoch": 3.9761904761904763, "grad_norm": 0.6864573418269565, "learning_rate": 2.822137619119065e-06, "loss": 0.365, "mean_token_accuracy": 0.8711186717264354, "num_tokens": 430927931.0, "step": 501 }, { "entropy": 0.40313720703125, "epoch": 3.984126984126984, "grad_norm": 0.7183772106536512, "learning_rate": 2.8024655199860495e-06, "loss": 0.3682, "mean_token_accuracy": 0.8691108208149672, "num_tokens": 431777495.0, "step": 502 }, { "entropy": 0.4088134765625, "epoch": 3.992063492063492, "grad_norm": 0.7147302557020478, "learning_rate": 2.7828354921462668e-06, "loss": 0.3622, "mean_token_accuracy": 0.8704049359075725, "num_tokens": 432616684.0, "step": 503 }, { "entropy": 0.40557861328125, "epoch": 4.0, "grad_norm": 0.724778260633041, "learning_rate": 2.7632479114120963e-06, "loss": 0.367, "mean_token_accuracy": 0.8679725076071918, "num_tokens": 433464885.0, "step": 504 }, { "entropy": 0.3993377685546875, "epoch": 4.007936507936508, "grad_norm": 0.7300051389709427, "learning_rate": 2.7437031527832747e-06, "loss": 0.3473, "mean_token_accuracy": 0.8751431121490896, "num_tokens": 434354281.0, "step": 505 }, { "entropy": 0.40277099609375, "epoch": 4.015873015873016, "grad_norm": 0.7518820580172925, "learning_rate": 2.72420159043972e-06, "loss": 0.351, "mean_token_accuracy": 0.8758351663127542, "num_tokens": 435254037.0, "step": 506 }, { "entropy": 0.40679931640625, "epoch": 4.023809523809524, "grad_norm": 0.75823317605801, "learning_rate": 2.704743597734365e-06, "loss": 0.3449, "mean_token_accuracy": 0.8767664707265794, "num_tokens": 436096499.0, "step": 507 }, { "entropy": 0.4034576416015625, "epoch": 4.031746031746032, "grad_norm": 0.781839052148236, "learning_rate": 2.685329547186018e-06, "loss": 0.349, "mean_token_accuracy": 0.8767024255357683, "num_tokens": 436936261.0, "step": 508 }, { "entropy": 0.3980255126953125, "epoch": 4.0396825396825395, "grad_norm": 0.8874815588264233, "learning_rate": 2.665959810472219e-06, "loss": 0.3457, "mean_token_accuracy": 0.8768184627406299, "num_tokens": 437789126.0, "step": 509 }, { "entropy": 0.400390625, "epoch": 4.0476190476190474, "grad_norm": 0.8438997097584569, "learning_rate": 2.6466347584221314e-06, "loss": 0.3488, "mean_token_accuracy": 0.8754238770343363, "num_tokens": 438642725.0, "step": 510 }, { "entropy": 0.399810791015625, "epoch": 4.055555555555555, "grad_norm": 0.7914559178878162, "learning_rate": 2.6273547610094408e-06, "loss": 0.3568, "mean_token_accuracy": 0.8729163003154099, "num_tokens": 439509332.0, "step": 511 }, { "entropy": 0.4043426513671875, "epoch": 4.063492063492063, "grad_norm": 0.8018090810161909, "learning_rate": 2.608120187345273e-06, "loss": 0.3589, "mean_token_accuracy": 0.8719246378168464, "num_tokens": 440358824.0, "step": 512 }, { "entropy": 0.40130615234375, "epoch": 4.071428571428571, "grad_norm": 0.717052204902591, "learning_rate": 2.588931405671127e-06, "loss": 0.347, "mean_token_accuracy": 0.876106639392674, "num_tokens": 441231571.0, "step": 513 }, { "entropy": 0.40350341796875, "epoch": 4.079365079365079, "grad_norm": 0.7833684120919279, "learning_rate": 2.5697887833518215e-06, "loss": 0.3481, "mean_token_accuracy": 0.874689971562475, "num_tokens": 442070234.0, "step": 514 }, { "entropy": 0.400054931640625, "epoch": 4.087301587301587, "grad_norm": 0.7624057899379104, "learning_rate": 2.5506926868684683e-06, "loss": 0.354, "mean_token_accuracy": 0.8740922566503286, "num_tokens": 442955553.0, "step": 515 }, { "entropy": 0.4012908935546875, "epoch": 4.095238095238095, "grad_norm": 0.7656669668346728, "learning_rate": 2.5316434818114517e-06, "loss": 0.3412, "mean_token_accuracy": 0.8769300729036331, "num_tokens": 443803905.0, "step": 516 }, { "entropy": 0.398406982421875, "epoch": 4.103174603174603, "grad_norm": 0.7384783736599843, "learning_rate": 2.5126415328734275e-06, "loss": 0.3549, "mean_token_accuracy": 0.875963733997196, "num_tokens": 444676270.0, "step": 517 }, { "entropy": 0.3984222412109375, "epoch": 4.111111111111111, "grad_norm": 0.763214655206926, "learning_rate": 2.4936872038423516e-06, "loss": 0.3527, "mean_token_accuracy": 0.8742309152148664, "num_tokens": 445551871.0, "step": 518 }, { "entropy": 0.4028167724609375, "epoch": 4.119047619047619, "grad_norm": 0.7566995169023817, "learning_rate": 2.4747808575945006e-06, "loss": 0.351, "mean_token_accuracy": 0.8753285491839051, "num_tokens": 446395494.0, "step": 519 }, { "entropy": 0.4046173095703125, "epoch": 4.1269841269841265, "grad_norm": 0.7704383123576651, "learning_rate": 2.4559228560875336e-06, "loss": 0.3489, "mean_token_accuracy": 0.8744379128329456, "num_tokens": 447255878.0, "step": 520 }, { "entropy": 0.4016876220703125, "epoch": 4.134920634920635, "grad_norm": 0.689868389625917, "learning_rate": 2.4371135603535613e-06, "loss": 0.3475, "mean_token_accuracy": 0.8762855334207416, "num_tokens": 448095199.0, "step": 521 }, { "entropy": 0.4026947021484375, "epoch": 4.142857142857143, "grad_norm": 0.7668680601519664, "learning_rate": 2.4183533304922336e-06, "loss": 0.3459, "mean_token_accuracy": 0.8746473412029445, "num_tokens": 448954987.0, "step": 522 }, { "entropy": 0.4035491943359375, "epoch": 4.150793650793651, "grad_norm": 0.72483989286729, "learning_rate": 2.399642525663843e-06, "loss": 0.3558, "mean_token_accuracy": 0.8751249178312719, "num_tokens": 449829424.0, "step": 523 }, { "entropy": 0.3994140625, "epoch": 4.158730158730159, "grad_norm": 0.706119007803981, "learning_rate": 2.380981504082459e-06, "loss": 0.349, "mean_token_accuracy": 0.875414258800447, "num_tokens": 450685443.0, "step": 524 }, { "entropy": 0.3993988037109375, "epoch": 4.166666666666667, "grad_norm": 0.7212946770565037, "learning_rate": 2.3623706230090517e-06, "loss": 0.3557, "mean_token_accuracy": 0.8735111146233976, "num_tokens": 451597512.0, "step": 525 }, { "entropy": 0.4042205810546875, "epoch": 4.174603174603175, "grad_norm": 0.7005628568870317, "learning_rate": 2.3438102387446686e-06, "loss": 0.3469, "mean_token_accuracy": 0.8763788240030408, "num_tokens": 452424123.0, "step": 526 }, { "entropy": 0.3973541259765625, "epoch": 4.182539682539683, "grad_norm": 0.7155495360306364, "learning_rate": 2.325300706623607e-06, "loss": 0.353, "mean_token_accuracy": 0.8734072712250054, "num_tokens": 453294509.0, "step": 527 }, { "entropy": 0.4000091552734375, "epoch": 4.190476190476191, "grad_norm": 0.7006937562520844, "learning_rate": 2.3068423810066085e-06, "loss": 0.3528, "mean_token_accuracy": 0.8746827309951186, "num_tokens": 454176550.0, "step": 528 }, { "entropy": 0.404052734375, "epoch": 4.198412698412699, "grad_norm": 0.6995025837491133, "learning_rate": 2.288435615274085e-06, "loss": 0.3579, "mean_token_accuracy": 0.8727600080892444, "num_tokens": 455027872.0, "step": 529 }, { "entropy": 0.4065704345703125, "epoch": 4.2063492063492065, "grad_norm": 0.7479029235127206, "learning_rate": 2.2700807618193393e-06, "loss": 0.3416, "mean_token_accuracy": 0.8783422014676034, "num_tokens": 455894126.0, "step": 530 }, { "entropy": 0.40460205078125, "epoch": 4.214285714285714, "grad_norm": 0.746817252036699, "learning_rate": 2.251778172041828e-06, "loss": 0.3455, "mean_token_accuracy": 0.8771996637806296, "num_tokens": 456741345.0, "step": 531 }, { "entropy": 0.402862548828125, "epoch": 4.222222222222222, "grad_norm": 0.6876078945117027, "learning_rate": 2.2335281963404315e-06, "loss": 0.3501, "mean_token_accuracy": 0.8753550541587174, "num_tokens": 457597774.0, "step": 532 }, { "entropy": 0.3993682861328125, "epoch": 4.23015873015873, "grad_norm": 0.7282590241527106, "learning_rate": 2.2153311841067438e-06, "loss": 0.3442, "mean_token_accuracy": 0.8769136122427881, "num_tokens": 458481487.0, "step": 533 }, { "entropy": 0.3993377685546875, "epoch": 4.238095238095238, "grad_norm": 0.7553930529448756, "learning_rate": 2.1971874837183914e-06, "loss": 0.3458, "mean_token_accuracy": 0.8748537562787533, "num_tokens": 459329943.0, "step": 534 }, { "entropy": 0.3927154541015625, "epoch": 4.246031746031746, "grad_norm": 0.7591301726961953, "learning_rate": 2.179097442532352e-06, "loss": 0.3394, "mean_token_accuracy": 0.8797525470145047, "num_tokens": 460196696.0, "step": 535 }, { "entropy": 0.39837646484375, "epoch": 4.253968253968254, "grad_norm": 0.7626069369315728, "learning_rate": 2.1610614068783112e-06, "loss": 0.361, "mean_token_accuracy": 0.8727622926235199, "num_tokens": 461069051.0, "step": 536 }, { "entropy": 0.403106689453125, "epoch": 4.261904761904762, "grad_norm": 0.7708798701412393, "learning_rate": 2.143079722052034e-06, "loss": 0.3479, "mean_token_accuracy": 0.8751750965602696, "num_tokens": 461920899.0, "step": 537 }, { "entropy": 0.400177001953125, "epoch": 4.26984126984127, "grad_norm": 0.7389095619225245, "learning_rate": 2.125152732308747e-06, "loss": 0.3459, "mean_token_accuracy": 0.8786491984501481, "num_tokens": 462797643.0, "step": 538 }, { "entropy": 0.4058380126953125, "epoch": 4.277777777777778, "grad_norm": 0.665980057345897, "learning_rate": 2.1072807808565547e-06, "loss": 0.3501, "mean_token_accuracy": 0.8757951087318361, "num_tokens": 463640936.0, "step": 539 }, { "entropy": 0.40155029296875, "epoch": 4.285714285714286, "grad_norm": 0.7005429583368125, "learning_rate": 2.0894642098498656e-06, "loss": 0.3587, "mean_token_accuracy": 0.8732366347685456, "num_tokens": 464513012.0, "step": 540 }, { "entropy": 0.4015350341796875, "epoch": 4.2936507936507935, "grad_norm": 0.6682325297585425, "learning_rate": 2.0717033603828436e-06, "loss": 0.3485, "mean_token_accuracy": 0.8750377274118364, "num_tokens": 465345613.0, "step": 541 }, { "entropy": 0.4040985107421875, "epoch": 4.301587301587301, "grad_norm": 0.7424671938333351, "learning_rate": 2.0539985724828736e-06, "loss": 0.3498, "mean_token_accuracy": 0.875401156488806, "num_tokens": 466181756.0, "step": 542 }, { "entropy": 0.401275634765625, "epoch": 4.309523809523809, "grad_norm": 0.695999191273483, "learning_rate": 2.0363501851040573e-06, "loss": 0.3436, "mean_token_accuracy": 0.8764086258597672, "num_tokens": 467035382.0, "step": 543 }, { "entropy": 0.40155029296875, "epoch": 4.317460317460317, "grad_norm": 0.6827039903530154, "learning_rate": 2.0187585361207174e-06, "loss": 0.3466, "mean_token_accuracy": 0.8745089964941144, "num_tokens": 467897340.0, "step": 544 }, { "entropy": 0.400177001953125, "epoch": 4.325396825396825, "grad_norm": 0.685211307868124, "learning_rate": 2.001223962320941e-06, "loss": 0.3517, "mean_token_accuracy": 0.8753441325388849, "num_tokens": 468764096.0, "step": 545 }, { "entropy": 0.405242919921875, "epoch": 4.333333333333333, "grad_norm": 0.6682700395214807, "learning_rate": 1.9837467994001165e-06, "loss": 0.3457, "mean_token_accuracy": 0.8773820898495615, "num_tokens": 469610106.0, "step": 546 }, { "entropy": 0.399566650390625, "epoch": 4.341269841269841, "grad_norm": 0.6719545574593448, "learning_rate": 1.9663273819545157e-06, "loss": 0.3396, "mean_token_accuracy": 0.8774642567150295, "num_tokens": 470468046.0, "step": 547 }, { "entropy": 0.40380859375, "epoch": 4.349206349206349, "grad_norm": 0.7001669509560304, "learning_rate": 1.948966043474889e-06, "loss": 0.3458, "mean_token_accuracy": 0.8756930027157068, "num_tokens": 471309098.0, "step": 548 }, { "entropy": 0.396209716796875, "epoch": 4.357142857142857, "grad_norm": 0.75965811702668, "learning_rate": 1.931663116340074e-06, "loss": 0.3455, "mean_token_accuracy": 0.8765083705075085, "num_tokens": 472145738.0, "step": 549 }, { "entropy": 0.396392822265625, "epoch": 4.365079365079365, "grad_norm": 0.6756191619675378, "learning_rate": 1.914418931810643e-06, "loss": 0.3512, "mean_token_accuracy": 0.8744937106966972, "num_tokens": 473047197.0, "step": 550 }, { "entropy": 0.401031494140625, "epoch": 4.3730158730158735, "grad_norm": 0.6965894626329614, "learning_rate": 1.8972338200225509e-06, "loss": 0.3421, "mean_token_accuracy": 0.8775716116651893, "num_tokens": 473907585.0, "step": 551 }, { "entropy": 0.4000091552734375, "epoch": 4.380952380952381, "grad_norm": 0.7076324681120165, "learning_rate": 1.880108109980815e-06, "loss": 0.3462, "mean_token_accuracy": 0.8761595580726862, "num_tokens": 474779332.0, "step": 552 }, { "entropy": 0.39825439453125, "epoch": 4.388888888888889, "grad_norm": 0.7050720543139621, "learning_rate": 1.8630421295532252e-06, "loss": 0.345, "mean_token_accuracy": 0.8770742062479258, "num_tokens": 475659187.0, "step": 553 }, { "entropy": 0.403778076171875, "epoch": 4.396825396825397, "grad_norm": 0.7340183406802493, "learning_rate": 1.8460362054640573e-06, "loss": 0.3478, "mean_token_accuracy": 0.8751401146873832, "num_tokens": 476487458.0, "step": 554 }, { "entropy": 0.39990234375, "epoch": 4.404761904761905, "grad_norm": 0.6861105491926857, "learning_rate": 1.8290906632878297e-06, "loss": 0.3431, "mean_token_accuracy": 0.8780268509872258, "num_tokens": 477345662.0, "step": 555 }, { "entropy": 0.3991241455078125, "epoch": 4.412698412698413, "grad_norm": 0.7296898602599676, "learning_rate": 1.8122058274430542e-06, "loss": 0.3411, "mean_token_accuracy": 0.8761810320429504, "num_tokens": 478205977.0, "step": 556 }, { "entropy": 0.4037933349609375, "epoch": 4.420634920634921, "grad_norm": 1.0332210701383924, "learning_rate": 1.7953820211860395e-06, "loss": 0.356, "mean_token_accuracy": 0.8737587067298591, "num_tokens": 479048650.0, "step": 557 }, { "entropy": 0.3999481201171875, "epoch": 4.428571428571429, "grad_norm": 0.7091178286840939, "learning_rate": 1.7786195666046935e-06, "loss": 0.343, "mean_token_accuracy": 0.8771154009737074, "num_tokens": 479895873.0, "step": 558 }, { "entropy": 0.4032745361328125, "epoch": 4.436507936507937, "grad_norm": 0.6733078832793936, "learning_rate": 1.7619187846123624e-06, "loss": 0.3457, "mean_token_accuracy": 0.8771998826414347, "num_tokens": 480755429.0, "step": 559 }, { "entropy": 0.4007568359375, "epoch": 4.444444444444445, "grad_norm": 0.8490823775032588, "learning_rate": 1.7452799949416833e-06, "loss": 0.3517, "mean_token_accuracy": 0.8754395125433803, "num_tokens": 481608352.0, "step": 560 }, { "entropy": 0.4008026123046875, "epoch": 4.4523809523809526, "grad_norm": 0.7225303298169462, "learning_rate": 1.7287035161384673e-06, "loss": 0.35, "mean_token_accuracy": 0.8747482905164361, "num_tokens": 482441149.0, "step": 561 }, { "entropy": 0.4021148681640625, "epoch": 4.4603174603174605, "grad_norm": 0.6624423396335506, "learning_rate": 1.7121896655555958e-06, "loss": 0.347, "mean_token_accuracy": 0.8763077296316624, "num_tokens": 483307531.0, "step": 562 }, { "entropy": 0.4007720947265625, "epoch": 4.468253968253968, "grad_norm": 0.6783795851745674, "learning_rate": 1.695738759346947e-06, "loss": 0.3516, "mean_token_accuracy": 0.8752468260936439, "num_tokens": 484156689.0, "step": 563 }, { "entropy": 0.3984375, "epoch": 4.476190476190476, "grad_norm": 0.7230409362049561, "learning_rate": 1.6793511124613455e-06, "loss": 0.3405, "mean_token_accuracy": 0.8779969648458064, "num_tokens": 485003773.0, "step": 564 }, { "entropy": 0.4019317626953125, "epoch": 4.484126984126984, "grad_norm": 0.6858561278935235, "learning_rate": 1.6630270386365288e-06, "loss": 0.3462, "mean_token_accuracy": 0.8767383908852935, "num_tokens": 485834271.0, "step": 565 }, { "entropy": 0.4033966064453125, "epoch": 4.492063492063492, "grad_norm": 0.7715463405263099, "learning_rate": 1.6467668503931432e-06, "loss": 0.3406, "mean_token_accuracy": 0.8790650884620845, "num_tokens": 486676541.0, "step": 566 }, { "entropy": 0.3995513916015625, "epoch": 4.5, "grad_norm": 0.7299031695508553, "learning_rate": 1.6305708590287616e-06, "loss": 0.3413, "mean_token_accuracy": 0.8776452434249222, "num_tokens": 487533902.0, "step": 567 }, { "entropy": 0.39752197265625, "epoch": 4.507936507936508, "grad_norm": 0.7001696842835692, "learning_rate": 1.6144393746119208e-06, "loss": 0.3468, "mean_token_accuracy": 0.8766471082344651, "num_tokens": 488403340.0, "step": 568 }, { "entropy": 0.3946075439453125, "epoch": 4.515873015873016, "grad_norm": 0.6949363799298416, "learning_rate": 1.5983727059761873e-06, "loss": 0.3413, "mean_token_accuracy": 0.8782787672244012, "num_tokens": 489285650.0, "step": 569 }, { "entropy": 0.402496337890625, "epoch": 4.523809523809524, "grad_norm": 0.6662573552334149, "learning_rate": 1.5823711607142428e-06, "loss": 0.3448, "mean_token_accuracy": 0.876647824421525, "num_tokens": 490146251.0, "step": 570 }, { "entropy": 0.3963165283203125, "epoch": 4.531746031746032, "grad_norm": 0.6722490242200185, "learning_rate": 1.5664350451720022e-06, "loss": 0.3343, "mean_token_accuracy": 0.8809215794317424, "num_tokens": 490981639.0, "step": 571 }, { "entropy": 0.401947021484375, "epoch": 4.5396825396825395, "grad_norm": 0.7667827684007154, "learning_rate": 1.5505646644427375e-06, "loss": 0.3443, "mean_token_accuracy": 0.8768278043717146, "num_tokens": 491819855.0, "step": 572 }, { "entropy": 0.4046630859375, "epoch": 4.5476190476190474, "grad_norm": 0.7217844340085546, "learning_rate": 1.5347603223612462e-06, "loss": 0.3453, "mean_token_accuracy": 0.8769222623668611, "num_tokens": 492664773.0, "step": 573 }, { "entropy": 0.396392822265625, "epoch": 4.555555555555555, "grad_norm": 0.6828293087400851, "learning_rate": 1.5190223214980286e-06, "loss": 0.3425, "mean_token_accuracy": 0.876984007190913, "num_tokens": 493538855.0, "step": 574 }, { "entropy": 0.3953704833984375, "epoch": 4.563492063492063, "grad_norm": 0.6985094822557292, "learning_rate": 1.5033509631534986e-06, "loss": 0.3481, "mean_token_accuracy": 0.8754701013676822, "num_tokens": 494419834.0, "step": 575 }, { "entropy": 0.40057373046875, "epoch": 4.571428571428571, "grad_norm": 0.7055750733602428, "learning_rate": 1.4877465473522178e-06, "loss": 0.3449, "mean_token_accuracy": 0.8770850743167102, "num_tokens": 495279672.0, "step": 576 }, { "entropy": 0.3951416015625, "epoch": 4.579365079365079, "grad_norm": 0.6964133064600199, "learning_rate": 1.4722093728371427e-06, "loss": 0.3513, "mean_token_accuracy": 0.874992523342371, "num_tokens": 496156072.0, "step": 577 }, { "entropy": 0.40093994140625, "epoch": 4.587301587301587, "grad_norm": 0.6585867710192563, "learning_rate": 1.4567397370639158e-06, "loss": 0.3481, "mean_token_accuracy": 0.8771389788016677, "num_tokens": 497013976.0, "step": 578 }, { "entropy": 0.400543212890625, "epoch": 4.595238095238095, "grad_norm": 0.6695268179646108, "learning_rate": 1.4413379361951596e-06, "loss": 0.3424, "mean_token_accuracy": 0.8771733501926064, "num_tokens": 497869587.0, "step": 579 }, { "entropy": 0.4037933349609375, "epoch": 4.603174603174603, "grad_norm": 0.7406939877102566, "learning_rate": 1.4260042650948187e-06, "loss": 0.3427, "mean_token_accuracy": 0.8756671342998743, "num_tokens": 498692858.0, "step": 580 }, { "entropy": 0.3985595703125, "epoch": 4.611111111111111, "grad_norm": 0.7048045979371886, "learning_rate": 1.4107390173225045e-06, "loss": 0.3469, "mean_token_accuracy": 0.8772797528654337, "num_tokens": 499558825.0, "step": 581 }, { "entropy": 0.4054718017578125, "epoch": 4.619047619047619, "grad_norm": 0.6900885423977635, "learning_rate": 1.395542485127886e-06, "loss": 0.3408, "mean_token_accuracy": 0.878491104580462, "num_tokens": 500399881.0, "step": 582 }, { "entropy": 0.4000701904296875, "epoch": 4.6269841269841265, "grad_norm": 0.6551912767795579, "learning_rate": 1.3804149594450816e-06, "loss": 0.3402, "mean_token_accuracy": 0.8797827651724219, "num_tokens": 501277242.0, "step": 583 }, { "entropy": 0.392608642578125, "epoch": 4.634920634920634, "grad_norm": 0.6872125661896025, "learning_rate": 1.365356729887099e-06, "loss": 0.3415, "mean_token_accuracy": 0.8778949431143701, "num_tokens": 502175769.0, "step": 584 }, { "entropy": 0.4004364013671875, "epoch": 4.642857142857143, "grad_norm": 0.6906654407257142, "learning_rate": 1.3503680847402868e-06, "loss": 0.3375, "mean_token_accuracy": 0.879074421711266, "num_tokens": 503037907.0, "step": 585 }, { "entropy": 0.3996429443359375, "epoch": 4.650793650793651, "grad_norm": 0.7240342341344183, "learning_rate": 1.3354493109588145e-06, "loss": 0.343, "mean_token_accuracy": 0.8791590658947825, "num_tokens": 503882004.0, "step": 586 }, { "entropy": 0.3933563232421875, "epoch": 4.658730158730159, "grad_norm": 0.6759947923545749, "learning_rate": 1.320600694159185e-06, "loss": 0.3418, "mean_token_accuracy": 0.8785993568599224, "num_tokens": 504761280.0, "step": 587 }, { "entropy": 0.3975372314453125, "epoch": 4.666666666666667, "grad_norm": 0.6810332156222548, "learning_rate": 1.3058225186147572e-06, "loss": 0.3419, "mean_token_accuracy": 0.8782242434099317, "num_tokens": 505628924.0, "step": 588 }, { "entropy": 0.3953857421875, "epoch": 4.674603174603175, "grad_norm": 0.6952323670957825, "learning_rate": 1.2911150672503098e-06, "loss": 0.3349, "mean_token_accuracy": 0.8792746933177114, "num_tokens": 506483264.0, "step": 589 }, { "entropy": 0.400146484375, "epoch": 4.682539682539683, "grad_norm": 0.6615786248003038, "learning_rate": 1.2764786216366236e-06, "loss": 0.342, "mean_token_accuracy": 0.8765605296939611, "num_tokens": 507337582.0, "step": 590 }, { "entropy": 0.3936614990234375, "epoch": 4.690476190476191, "grad_norm": 0.6423682264116367, "learning_rate": 1.2619134619850908e-06, "loss": 0.3403, "mean_token_accuracy": 0.8784746997989714, "num_tokens": 508222543.0, "step": 591 }, { "entropy": 0.40179443359375, "epoch": 4.698412698412699, "grad_norm": 0.7179320235597545, "learning_rate": 1.2474198671423493e-06, "loss": 0.3439, "mean_token_accuracy": 0.8781091058626771, "num_tokens": 509077470.0, "step": 592 }, { "entropy": 0.4020538330078125, "epoch": 4.7063492063492065, "grad_norm": 0.6640568389501444, "learning_rate": 1.2329981145849468e-06, "loss": 0.345, "mean_token_accuracy": 0.8776707421056926, "num_tokens": 509934412.0, "step": 593 }, { "entropy": 0.3987274169921875, "epoch": 4.714285714285714, "grad_norm": 0.6641554979878878, "learning_rate": 1.2186484804140242e-06, "loss": 0.333, "mean_token_accuracy": 0.8802338382229209, "num_tokens": 510796655.0, "step": 594 }, { "entropy": 0.3936614990234375, "epoch": 4.722222222222222, "grad_norm": 0.7311800509678725, "learning_rate": 1.2043712393500355e-06, "loss": 0.3465, "mean_token_accuracy": 0.876534974668175, "num_tokens": 511666478.0, "step": 595 }, { "entropy": 0.3982086181640625, "epoch": 4.73015873015873, "grad_norm": 0.654011664415763, "learning_rate": 1.1901666647274823e-06, "loss": 0.336, "mean_token_accuracy": 0.8799294792115688, "num_tokens": 512547249.0, "step": 596 }, { "entropy": 0.4018096923828125, "epoch": 4.738095238095238, "grad_norm": 0.7906669325474568, "learning_rate": 1.1760350284896876e-06, "loss": 0.3423, "mean_token_accuracy": 0.8780363285914063, "num_tokens": 513406924.0, "step": 597 }, { "entropy": 0.394073486328125, "epoch": 4.746031746031746, "grad_norm": 0.6272944465027679, "learning_rate": 1.1619766011835832e-06, "loss": 0.3351, "mean_token_accuracy": 0.8792783697135746, "num_tokens": 514278305.0, "step": 598 }, { "entropy": 0.3991546630859375, "epoch": 4.753968253968254, "grad_norm": 0.7028608754920164, "learning_rate": 1.1479916519545326e-06, "loss": 0.3381, "mean_token_accuracy": 0.8802824383601546, "num_tokens": 515127083.0, "step": 599 }, { "entropy": 0.3970489501953125, "epoch": 4.761904761904762, "grad_norm": 0.7415718014919481, "learning_rate": 1.1340804485411783e-06, "loss": 0.3494, "mean_token_accuracy": 0.8775360365398228, "num_tokens": 515982781.0, "step": 600 }, { "entropy": 0.400482177734375, "epoch": 4.76984126984127, "grad_norm": 0.6506891630519459, "learning_rate": 1.1202432572703176e-06, "loss": 0.3348, "mean_token_accuracy": 0.879584884736687, "num_tokens": 516838578.0, "step": 601 }, { "entropy": 0.398193359375, "epoch": 4.777777777777778, "grad_norm": 0.6609399081506822, "learning_rate": 1.1064803430518002e-06, "loss": 0.3403, "mean_token_accuracy": 0.8773757833987474, "num_tokens": 517695973.0, "step": 602 }, { "entropy": 0.3961334228515625, "epoch": 4.785714285714286, "grad_norm": 0.6536270048247466, "learning_rate": 1.0927919693734618e-06, "loss": 0.3403, "mean_token_accuracy": 0.8781040622852743, "num_tokens": 518570319.0, "step": 603 }, { "entropy": 0.398956298828125, "epoch": 4.7936507936507935, "grad_norm": 0.6661437239536121, "learning_rate": 1.0791783982960736e-06, "loss": 0.3429, "mean_token_accuracy": 0.8768606032244861, "num_tokens": 519417148.0, "step": 604 }, { "entropy": 0.399383544921875, "epoch": 4.801587301587301, "grad_norm": 0.6697036401243884, "learning_rate": 1.0656398904483312e-06, "loss": 0.3459, "mean_token_accuracy": 0.8781998874619603, "num_tokens": 520284498.0, "step": 605 }, { "entropy": 0.3967742919921875, "epoch": 4.809523809523809, "grad_norm": 0.6448494446348442, "learning_rate": 1.0521767050218562e-06, "loss": 0.3453, "mean_token_accuracy": 0.8755287849344313, "num_tokens": 521161180.0, "step": 606 }, { "entropy": 0.4019927978515625, "epoch": 4.817460317460317, "grad_norm": 0.695391933649051, "learning_rate": 1.0387890997662443e-06, "loss": 0.3338, "mean_token_accuracy": 0.8791229757480323, "num_tokens": 522018351.0, "step": 607 }, { "entropy": 0.396759033203125, "epoch": 4.825396825396825, "grad_norm": 0.6885741043618135, "learning_rate": 1.0254773309841277e-06, "loss": 0.3452, "mean_token_accuracy": 0.8766398807056248, "num_tokens": 522908445.0, "step": 608 }, { "entropy": 0.4028167724609375, "epoch": 4.833333333333333, "grad_norm": 0.6824971060967869, "learning_rate": 1.012241653526263e-06, "loss": 0.3381, "mean_token_accuracy": 0.8785937232896686, "num_tokens": 523761885.0, "step": 609 }, { "entropy": 0.3957977294921875, "epoch": 4.841269841269841, "grad_norm": 0.6556820486631832, "learning_rate": 9.990823207866578e-07, "loss": 0.3431, "mean_token_accuracy": 0.8786509921774268, "num_tokens": 524634392.0, "step": 610 }, { "entropy": 0.3975067138671875, "epoch": 4.849206349206349, "grad_norm": 0.6634339315868009, "learning_rate": 9.85999584697716e-07, "loss": 0.3458, "mean_token_accuracy": 0.8759105852805078, "num_tokens": 525481711.0, "step": 611 }, { "entropy": 0.3944549560546875, "epoch": 4.857142857142857, "grad_norm": 0.6633851252480073, "learning_rate": 9.729936957254165e-07, "loss": 0.3348, "mean_token_accuracy": 0.8805793649517, "num_tokens": 526350562.0, "step": 612 }, { "entropy": 0.3972015380859375, "epoch": 4.865079365079366, "grad_norm": 0.6809718096324041, "learning_rate": 9.600649028645215e-07, "loss": 0.3411, "mean_token_accuracy": 0.877831466961652, "num_tokens": 527208722.0, "step": 613 }, { "entropy": 0.3993377685546875, "epoch": 4.8730158730158735, "grad_norm": 0.7413910925997623, "learning_rate": 9.472134536338007e-07, "loss": 0.3348, "mean_token_accuracy": 0.8798928018659353, "num_tokens": 528070537.0, "step": 614 }, { "entropy": 0.3999786376953125, "epoch": 4.880952380952381, "grad_norm": 0.6404430937655207, "learning_rate": 9.344395940713009e-07, "loss": 0.3482, "mean_token_accuracy": 0.8766381270252168, "num_tokens": 528925481.0, "step": 615 }, { "entropy": 0.4001312255859375, "epoch": 4.888888888888889, "grad_norm": 0.642942870387289, "learning_rate": 9.217435687296305e-07, "loss": 0.3388, "mean_token_accuracy": 0.8791137794032693, "num_tokens": 529762293.0, "step": 616 }, { "entropy": 0.3968963623046875, "epoch": 4.896825396825397, "grad_norm": 0.6411974157903991, "learning_rate": 9.091256206712812e-07, "loss": 0.3398, "mean_token_accuracy": 0.8778149662539363, "num_tokens": 530625692.0, "step": 617 }, { "entropy": 0.397918701171875, "epoch": 4.904761904761905, "grad_norm": 0.6795080852701798, "learning_rate": 8.965859914639724e-07, "loss": 0.3458, "mean_token_accuracy": 0.8769173468463123, "num_tokens": 531481363.0, "step": 618 }, { "entropy": 0.399322509765625, "epoch": 4.912698412698413, "grad_norm": 0.7268654576820524, "learning_rate": 8.841249211760272e-07, "loss": 0.3401, "mean_token_accuracy": 0.8781247353181243, "num_tokens": 532334443.0, "step": 619 }, { "entropy": 0.39544677734375, "epoch": 4.920634920634921, "grad_norm": 0.6991042720171687, "learning_rate": 8.717426483717762e-07, "loss": 0.3474, "mean_token_accuracy": 0.8754720254801214, "num_tokens": 533215439.0, "step": 620 }, { "entropy": 0.39813232421875, "epoch": 4.928571428571429, "grad_norm": 0.6420905529792567, "learning_rate": 8.594394101069897e-07, "loss": 0.3449, "mean_token_accuracy": 0.876462968531996, "num_tokens": 534086645.0, "step": 621 }, { "entropy": 0.3993988037109375, "epoch": 4.936507936507937, "grad_norm": 0.634959288865702, "learning_rate": 8.472154419243411e-07, "loss": 0.3422, "mean_token_accuracy": 0.8784619648940861, "num_tokens": 534968024.0, "step": 622 }, { "entropy": 0.3939361572265625, "epoch": 4.944444444444445, "grad_norm": 0.6525948648538159, "learning_rate": 8.350709778488941e-07, "loss": 0.3433, "mean_token_accuracy": 0.878953296225518, "num_tokens": 535858097.0, "step": 623 }, { "entropy": 0.3915557861328125, "epoch": 4.9523809523809526, "grad_norm": 0.6338378330558326, "learning_rate": 8.230062503836278e-07, "loss": 0.3403, "mean_token_accuracy": 0.8782070642337203, "num_tokens": 536754856.0, "step": 624 }, { "entropy": 0.3979949951171875, "epoch": 4.9603174603174605, "grad_norm": 0.6515049678243575, "learning_rate": 8.110214905049802e-07, "loss": 0.3447, "mean_token_accuracy": 0.8780298097990453, "num_tokens": 537612880.0, "step": 625 }, { "entropy": 0.39752197265625, "epoch": 4.968253968253968, "grad_norm": 0.6537688062696074, "learning_rate": 7.991169276584281e-07, "loss": 0.3383, "mean_token_accuracy": 0.8791689327917993, "num_tokens": 538459827.0, "step": 626 }, { "entropy": 0.4014129638671875, "epoch": 4.976190476190476, "grad_norm": 0.6761627263761556, "learning_rate": 7.872927897540944e-07, "loss": 0.3349, "mean_token_accuracy": 0.8803192311897874, "num_tokens": 539280570.0, "step": 627 }, { "entropy": 0.4014129638671875, "epoch": 4.984126984126984, "grad_norm": 0.6783420727462287, "learning_rate": 7.75549303162384e-07, "loss": 0.3441, "mean_token_accuracy": 0.8776299306191504, "num_tokens": 540103344.0, "step": 628 }, { "entropy": 0.3983917236328125, "epoch": 4.992063492063492, "grad_norm": 0.6003292581738014, "learning_rate": 7.638866927096555e-07, "loss": 0.3384, "mean_token_accuracy": 0.8786575449630618, "num_tokens": 540983421.0, "step": 629 }, { "entropy": 0.3915557861328125, "epoch": 5.0, "grad_norm": 0.6132257577458607, "learning_rate": 7.523051816739074e-07, "loss": 0.3407, "mean_token_accuracy": 0.8787228460423648, "num_tokens": 541874593.0, "step": 630 }, { "entropy": 0.3969573974609375, "epoch": 5.007936507936508, "grad_norm": 0.6874008020158511, "learning_rate": 7.408049917805104e-07, "loss": 0.3324, "mean_token_accuracy": 0.8809576267376542, "num_tokens": 542746900.0, "step": 631 }, { "entropy": 0.3990631103515625, "epoch": 5.015873015873016, "grad_norm": 0.6533341249855773, "learning_rate": 7.293863431979619e-07, "loss": 0.3325, "mean_token_accuracy": 0.8817571788094938, "num_tokens": 543607351.0, "step": 632 }, { "entropy": 0.4006195068359375, "epoch": 5.023809523809524, "grad_norm": 0.6743032453873651, "learning_rate": 7.180494545336642e-07, "loss": 0.3252, "mean_token_accuracy": 0.8821277469396591, "num_tokens": 544429814.0, "step": 633 }, { "entropy": 0.392242431640625, "epoch": 5.031746031746032, "grad_norm": 0.6491407357261534, "learning_rate": 7.067945428297524e-07, "loss": 0.329, "mean_token_accuracy": 0.8841568692587316, "num_tokens": 545293765.0, "step": 634 }, { "entropy": 0.392059326171875, "epoch": 5.0396825396825395, "grad_norm": 0.6509793206709636, "learning_rate": 6.956218235589263e-07, "loss": 0.327, "mean_token_accuracy": 0.8831868241541088, "num_tokens": 546166241.0, "step": 635 }, { "entropy": 0.394683837890625, "epoch": 5.0476190476190474, "grad_norm": 0.6965451021310177, "learning_rate": 6.845315106203327e-07, "loss": 0.3192, "mean_token_accuracy": 0.8857454406097531, "num_tokens": 547008292.0, "step": 636 }, { "entropy": 0.3927764892578125, "epoch": 5.055555555555555, "grad_norm": 0.7299760861326686, "learning_rate": 6.735238163354669e-07, "loss": 0.329, "mean_token_accuracy": 0.8830623761750758, "num_tokens": 547881345.0, "step": 637 }, { "entropy": 0.393829345703125, "epoch": 5.063492063492063, "grad_norm": 0.692591071370849, "learning_rate": 6.625989514441089e-07, "loss": 0.3263, "mean_token_accuracy": 0.8835366195999086, "num_tokens": 548753550.0, "step": 638 }, { "entropy": 0.3939666748046875, "epoch": 5.071428571428571, "grad_norm": 0.6832435052031313, "learning_rate": 6.517571251002896e-07, "loss": 0.3274, "mean_token_accuracy": 0.8831272819079459, "num_tokens": 549614624.0, "step": 639 }, { "entropy": 0.39398193359375, "epoch": 5.079365079365079, "grad_norm": 0.6839637384434816, "learning_rate": 6.40998544868287e-07, "loss": 0.3168, "mean_token_accuracy": 0.8865264924243093, "num_tokens": 550450538.0, "step": 640 }, { "entropy": 0.3957672119140625, "epoch": 5.087301587301587, "grad_norm": 0.6628392349359786, "learning_rate": 6.3032341671865e-07, "loss": 0.3251, "mean_token_accuracy": 0.8840126856230199, "num_tokens": 551311533.0, "step": 641 }, { "entropy": 0.3985137939453125, "epoch": 5.095238095238095, "grad_norm": 0.6552020896616739, "learning_rate": 6.197319450242562e-07, "loss": 0.3219, "mean_token_accuracy": 0.8848442859016359, "num_tokens": 552168363.0, "step": 642 }, { "entropy": 0.3944549560546875, "epoch": 5.103174603174603, "grad_norm": 0.6459530624061904, "learning_rate": 6.092243325564007e-07, "loss": 0.3254, "mean_token_accuracy": 0.885173340793699, "num_tokens": 553049237.0, "step": 643 }, { "entropy": 0.396759033203125, "epoch": 5.111111111111111, "grad_norm": 0.6485251490072942, "learning_rate": 5.98800780480912e-07, "loss": 0.3345, "mean_token_accuracy": 0.8795099183917046, "num_tokens": 553918684.0, "step": 644 }, { "entropy": 0.3992462158203125, "epoch": 5.119047619047619, "grad_norm": 0.682884688994534, "learning_rate": 5.884614883543027e-07, "loss": 0.3294, "mean_token_accuracy": 0.8827598677016795, "num_tokens": 554768021.0, "step": 645 }, { "entropy": 0.3957366943359375, "epoch": 5.1269841269841265, "grad_norm": 0.6622701991206025, "learning_rate": 5.782066541199471e-07, "loss": 0.3201, "mean_token_accuracy": 0.8856850513257086, "num_tokens": 555611362.0, "step": 646 }, { "entropy": 0.397430419921875, "epoch": 5.134920634920635, "grad_norm": 0.6494881440989898, "learning_rate": 5.680364741042926e-07, "loss": 0.3308, "mean_token_accuracy": 0.8822413263842463, "num_tokens": 556476117.0, "step": 647 }, { "entropy": 0.39849853515625, "epoch": 5.142857142857143, "grad_norm": 0.6399277286004064, "learning_rate": 5.579511430131018e-07, "loss": 0.3262, "mean_token_accuracy": 0.8843574924394488, "num_tokens": 557321161.0, "step": 648 }, { "entropy": 0.3938751220703125, "epoch": 5.150793650793651, "grad_norm": 0.6414200561803504, "learning_rate": 5.479508539277229e-07, "loss": 0.3262, "mean_token_accuracy": 0.8831641948781908, "num_tokens": 558195818.0, "step": 649 }, { "entropy": 0.3946533203125, "epoch": 5.158730158730159, "grad_norm": 0.6400500966934808, "learning_rate": 5.380357983013962e-07, "loss": 0.3247, "mean_token_accuracy": 0.8846798562444746, "num_tokens": 559060077.0, "step": 650 }, { "entropy": 0.3952484130859375, "epoch": 5.166666666666667, "grad_norm": 0.665030112678579, "learning_rate": 5.282061659555854e-07, "loss": 0.3306, "mean_token_accuracy": 0.8817098373547196, "num_tokens": 559919625.0, "step": 651 }, { "entropy": 0.3953704833984375, "epoch": 5.174603174603175, "grad_norm": 0.6442855912711039, "learning_rate": 5.184621450763455e-07, "loss": 0.3286, "mean_token_accuracy": 0.8841330683790147, "num_tokens": 560767619.0, "step": 652 }, { "entropy": 0.3937225341796875, "epoch": 5.182539682539683, "grad_norm": 0.711237148073599, "learning_rate": 5.088039222107205e-07, "loss": 0.3317, "mean_token_accuracy": 0.8824787489138544, "num_tokens": 561614252.0, "step": 653 }, { "entropy": 0.3915252685546875, "epoch": 5.190476190476191, "grad_norm": 0.664655406106165, "learning_rate": 4.992316822631693e-07, "loss": 0.3247, "mean_token_accuracy": 0.883714787196368, "num_tokens": 562479272.0, "step": 654 }, { "entropy": 0.3939361572265625, "epoch": 5.198412698412699, "grad_norm": 0.6423893808843507, "learning_rate": 4.897456084920282e-07, "loss": 0.3233, "mean_token_accuracy": 0.8836758630350232, "num_tokens": 563325903.0, "step": 655 }, { "entropy": 0.391815185546875, "epoch": 5.2063492063492065, "grad_norm": 0.609576579332021, "learning_rate": 4.803458825060042e-07, "loss": 0.3234, "mean_token_accuracy": 0.8831925024278462, "num_tokens": 564204898.0, "step": 656 }, { "entropy": 0.3961944580078125, "epoch": 5.214285714285714, "grad_norm": 0.6355056690611225, "learning_rate": 4.710326842606927e-07, "loss": 0.3209, "mean_token_accuracy": 0.8843817953020334, "num_tokens": 565054657.0, "step": 657 }, { "entropy": 0.3915863037109375, "epoch": 5.222222222222222, "grad_norm": 0.6715411599255142, "learning_rate": 4.618061920551381e-07, "loss": 0.3268, "mean_token_accuracy": 0.8825240274891257, "num_tokens": 565926348.0, "step": 658 }, { "entropy": 0.3952484130859375, "epoch": 5.23015873015873, "grad_norm": 0.6474113419989475, "learning_rate": 4.526665825284132e-07, "loss": 0.3344, "mean_token_accuracy": 0.8826401890255511, "num_tokens": 566799563.0, "step": 659 }, { "entropy": 0.393402099609375, "epoch": 5.238095238095238, "grad_norm": 0.6520320734154383, "learning_rate": 4.4361403065624475e-07, "loss": 0.3283, "mean_token_accuracy": 0.881974630523473, "num_tokens": 567686904.0, "step": 660 }, { "entropy": 0.3928680419921875, "epoch": 5.246031746031746, "grad_norm": 0.6254784285922858, "learning_rate": 4.3464870974766314e-07, "loss": 0.3299, "mean_token_accuracy": 0.882287971675396, "num_tokens": 568563817.0, "step": 661 }, { "entropy": 0.396759033203125, "epoch": 5.253968253968254, "grad_norm": 0.6524894750875436, "learning_rate": 4.257707914416781e-07, "loss": 0.319, "mean_token_accuracy": 0.8853690237738192, "num_tokens": 569412950.0, "step": 662 }, { "entropy": 0.390960693359375, "epoch": 5.261904761904762, "grad_norm": 0.6550596337749973, "learning_rate": 4.169804457039972e-07, "loss": 0.3281, "mean_token_accuracy": 0.8837977671064436, "num_tokens": 570290370.0, "step": 663 }, { "entropy": 0.39520263671875, "epoch": 5.26984126984127, "grad_norm": 0.6599640261368089, "learning_rate": 4.082778408237731e-07, "loss": 0.3312, "mean_token_accuracy": 0.8819447602145374, "num_tokens": 571139089.0, "step": 664 }, { "entropy": 0.3977203369140625, "epoch": 5.277777777777778, "grad_norm": 0.6325202825006885, "learning_rate": 3.996631434103776e-07, "loss": 0.3216, "mean_token_accuracy": 0.8842552327550948, "num_tokens": 571974486.0, "step": 665 }, { "entropy": 0.3957366943359375, "epoch": 5.285714285714286, "grad_norm": 0.629795019163263, "learning_rate": 3.911365183902166e-07, "loss": 0.3244, "mean_token_accuracy": 0.883813981898129, "num_tokens": 572833941.0, "step": 666 }, { "entropy": 0.393707275390625, "epoch": 5.2936507936507935, "grad_norm": 0.6615713948839467, "learning_rate": 3.826981290035692e-07, "loss": 0.3358, "mean_token_accuracy": 0.8800787003710866, "num_tokens": 573696025.0, "step": 667 }, { "entropy": 0.3954620361328125, "epoch": 5.301587301587301, "grad_norm": 0.6448584761089627, "learning_rate": 3.7434813680146234e-07, "loss": 0.3258, "mean_token_accuracy": 0.8832776751369238, "num_tokens": 574541353.0, "step": 668 }, { "entropy": 0.3994140625, "epoch": 5.309523809523809, "grad_norm": 0.6253174175232413, "learning_rate": 3.6608670164258065e-07, "loss": 0.328, "mean_token_accuracy": 0.8827647585421801, "num_tokens": 575378417.0, "step": 669 }, { "entropy": 0.397003173828125, "epoch": 5.317460317460317, "grad_norm": 0.6543163664275344, "learning_rate": 3.5791398169020384e-07, "loss": 0.3223, "mean_token_accuracy": 0.8838555999100208, "num_tokens": 576216763.0, "step": 670 }, { "entropy": 0.3980865478515625, "epoch": 5.325396825396825, "grad_norm": 0.6727975870724803, "learning_rate": 3.4983013340918024e-07, "loss": 0.3319, "mean_token_accuracy": 0.881680119317025, "num_tokens": 577068285.0, "step": 671 }, { "entropy": 0.397216796875, "epoch": 5.333333333333333, "grad_norm": 0.6908497952151735, "learning_rate": 3.4183531156292913e-07, "loss": 0.3199, "mean_token_accuracy": 0.8852394479326904, "num_tokens": 577910888.0, "step": 672 }, { "entropy": 0.3982391357421875, "epoch": 5.341269841269841, "grad_norm": 0.6271777851924285, "learning_rate": 3.3392966921047984e-07, "loss": 0.3323, "mean_token_accuracy": 0.8813259471207857, "num_tokens": 578760857.0, "step": 673 }, { "entropy": 0.39141845703125, "epoch": 5.349206349206349, "grad_norm": 0.6420853255417144, "learning_rate": 3.261133577035408e-07, "loss": 0.3276, "mean_token_accuracy": 0.8815803048200905, "num_tokens": 579639850.0, "step": 674 }, { "entropy": 0.39239501953125, "epoch": 5.357142857142857, "grad_norm": 0.6400029343318571, "learning_rate": 3.1838652668360173e-07, "loss": 0.3208, "mean_token_accuracy": 0.8846969213336706, "num_tokens": 580506527.0, "step": 675 }, { "entropy": 0.4005584716796875, "epoch": 5.365079365079365, "grad_norm": 0.6337302117382984, "learning_rate": 3.1074932407906823e-07, "loss": 0.3313, "mean_token_accuracy": 0.8819033140316606, "num_tokens": 581347475.0, "step": 676 }, { "entropy": 0.39678955078125, "epoch": 5.3730158730158735, "grad_norm": 0.6531503190443603, "learning_rate": 3.0320189610243303e-07, "loss": 0.3226, "mean_token_accuracy": 0.8830904331989586, "num_tokens": 582201245.0, "step": 677 }, { "entropy": 0.39251708984375, "epoch": 5.380952380952381, "grad_norm": 0.6298282608886412, "learning_rate": 2.957443872474713e-07, "loss": 0.3249, "mean_token_accuracy": 0.8838722719810903, "num_tokens": 583076088.0, "step": 678 }, { "entropy": 0.3914947509765625, "epoch": 5.388888888888889, "grad_norm": 0.6217255913819979, "learning_rate": 2.883769402864789e-07, "loss": 0.3235, "mean_token_accuracy": 0.8837307607755065, "num_tokens": 583938273.0, "step": 679 }, { "entropy": 0.3894500732421875, "epoch": 5.396825396825397, "grad_norm": 0.9295484786245685, "learning_rate": 2.810996962675361e-07, "loss": 0.3289, "mean_token_accuracy": 0.8828169428743422, "num_tokens": 584828853.0, "step": 680 }, { "entropy": 0.39373779296875, "epoch": 5.404761904761905, "grad_norm": 0.6460727447308977, "learning_rate": 2.739127945118092e-07, "loss": 0.3332, "mean_token_accuracy": 0.8817042661830783, "num_tokens": 585681013.0, "step": 681 }, { "entropy": 0.3941650390625, "epoch": 5.412698412698413, "grad_norm": 0.6371402076274487, "learning_rate": 2.668163726108841e-07, "loss": 0.3294, "mean_token_accuracy": 0.8820854951627553, "num_tokens": 586567674.0, "step": 682 }, { "entropy": 0.3912506103515625, "epoch": 5.420634920634921, "grad_norm": 0.6599817270342999, "learning_rate": 2.5981056642412796e-07, "loss": 0.3274, "mean_token_accuracy": 0.884115984197706, "num_tokens": 587445470.0, "step": 683 }, { "entropy": 0.3984527587890625, "epoch": 5.428571428571429, "grad_norm": 0.6190960844327497, "learning_rate": 2.528955100760938e-07, "loss": 0.3225, "mean_token_accuracy": 0.8846618658863008, "num_tokens": 588268783.0, "step": 684 }, { "entropy": 0.396881103515625, "epoch": 5.436507936507937, "grad_norm": 0.6277108055718937, "learning_rate": 2.460713359539474e-07, "loss": 0.3247, "mean_token_accuracy": 0.8857448240742087, "num_tokens": 589106246.0, "step": 685 }, { "entropy": 0.393524169921875, "epoch": 5.444444444444445, "grad_norm": 0.6346008314831126, "learning_rate": 2.3933817470493445e-07, "loss": 0.319, "mean_token_accuracy": 0.8859792477451265, "num_tokens": 589927765.0, "step": 686 }, { "entropy": 0.396209716796875, "epoch": 5.4523809523809526, "grad_norm": 0.6418782134867133, "learning_rate": 2.3269615523388355e-07, "loss": 0.3276, "mean_token_accuracy": 0.8844058201648295, "num_tokens": 590768483.0, "step": 687 }, { "entropy": 0.393310546875, "epoch": 5.4603174603174605, "grad_norm": 0.6145408122390106, "learning_rate": 2.2614540470073276e-07, "loss": 0.3276, "mean_token_accuracy": 0.8833085368387401, "num_tokens": 591653011.0, "step": 688 }, { "entropy": 0.391082763671875, "epoch": 5.468253968253968, "grad_norm": 0.6290181808829338, "learning_rate": 2.1968604851809738e-07, "loss": 0.3344, "mean_token_accuracy": 0.8824751214124262, "num_tokens": 592555939.0, "step": 689 }, { "entropy": 0.393218994140625, "epoch": 5.476190476190476, "grad_norm": 0.6052791805512318, "learning_rate": 2.1331821034886846e-07, "loss": 0.3268, "mean_token_accuracy": 0.8844651393592358, "num_tokens": 593422295.0, "step": 690 }, { "entropy": 0.3939666748046875, "epoch": 5.484126984126984, "grad_norm": 0.6765361632797722, "learning_rate": 2.0704201210384634e-07, "loss": 0.3294, "mean_token_accuracy": 0.8817935772240162, "num_tokens": 594275578.0, "step": 691 }, { "entropy": 0.3902130126953125, "epoch": 5.492063492063492, "grad_norm": 0.6558765212307175, "learning_rate": 2.0085757393940586e-07, "loss": 0.3276, "mean_token_accuracy": 0.8839560803025961, "num_tokens": 595158105.0, "step": 692 }, { "entropy": 0.3959808349609375, "epoch": 5.5, "grad_norm": 0.6728354108367586, "learning_rate": 1.9476501425519656e-07, "loss": 0.3314, "mean_token_accuracy": 0.8821393130347133, "num_tokens": 596012207.0, "step": 693 }, { "entropy": 0.3945465087890625, "epoch": 5.507936507936508, "grad_norm": 0.6161700876601213, "learning_rate": 1.8876444969187557e-07, "loss": 0.3252, "mean_token_accuracy": 0.884291214402765, "num_tokens": 596867579.0, "step": 694 }, { "entropy": 0.3921051025390625, "epoch": 5.515873015873016, "grad_norm": 0.6659432185652765, "learning_rate": 1.828559951288733e-07, "loss": 0.3294, "mean_token_accuracy": 0.8830416211858392, "num_tokens": 597729003.0, "step": 695 }, { "entropy": 0.3937225341796875, "epoch": 5.523809523809524, "grad_norm": 0.6337625434766239, "learning_rate": 1.7703976368219633e-07, "loss": 0.3387, "mean_token_accuracy": 0.8793549695983529, "num_tokens": 598610243.0, "step": 696 }, { "entropy": 0.398223876953125, "epoch": 5.531746031746032, "grad_norm": 0.6425672701806616, "learning_rate": 1.713158667022613e-07, "loss": 0.3282, "mean_token_accuracy": 0.8831517458893359, "num_tokens": 599468184.0, "step": 697 }, { "entropy": 0.390777587890625, "epoch": 5.5396825396825395, "grad_norm": 0.6588868479383553, "learning_rate": 1.656844137717617e-07, "loss": 0.3241, "mean_token_accuracy": 0.8842832935042679, "num_tokens": 600335530.0, "step": 698 }, { "entropy": 0.39007568359375, "epoch": 5.5476190476190474, "grad_norm": 0.6391375374634255, "learning_rate": 1.601455127035717e-07, "loss": 0.3303, "mean_token_accuracy": 0.8812260185368359, "num_tokens": 601219591.0, "step": 699 }, { "entropy": 0.3961639404296875, "epoch": 5.555555555555555, "grad_norm": 0.6312177790645791, "learning_rate": 1.5469926953868063e-07, "loss": 0.3277, "mean_token_accuracy": 0.8838478000834584, "num_tokens": 602080711.0, "step": 700 }, { "entropy": 0.3947296142578125, "epoch": 5.563492063492063, "grad_norm": 0.6168029707269869, "learning_rate": 1.4934578854416403e-07, "loss": 0.3273, "mean_token_accuracy": 0.8831527666188776, "num_tokens": 602939251.0, "step": 701 }, { "entropy": 0.3935546875, "epoch": 5.571428571428571, "grad_norm": 0.6196463005466791, "learning_rate": 1.440851722111858e-07, "loss": 0.3214, "mean_token_accuracy": 0.8847082569263875, "num_tokens": 603814211.0, "step": 702 }, { "entropy": 0.3931884765625, "epoch": 5.579365079365079, "grad_norm": 0.6582430849772537, "learning_rate": 1.389175212530397e-07, "loss": 0.3279, "mean_token_accuracy": 0.8828860782086849, "num_tokens": 604668809.0, "step": 703 }, { "entropy": 0.395904541015625, "epoch": 5.587301587301587, "grad_norm": 0.6557024272945261, "learning_rate": 1.3384293460321662e-07, "loss": 0.3316, "mean_token_accuracy": 0.8824727293103933, "num_tokens": 605530653.0, "step": 704 }, { "entropy": 0.39093017578125, "epoch": 5.595238095238095, "grad_norm": 0.5952866739722456, "learning_rate": 1.2886150941351317e-07, "loss": 0.3282, "mean_token_accuracy": 0.8835100992582738, "num_tokens": 606405426.0, "step": 705 }, { "entropy": 0.394195556640625, "epoch": 5.603174603174603, "grad_norm": 0.6329738035693675, "learning_rate": 1.2397334105217097e-07, "loss": 0.3251, "mean_token_accuracy": 0.8846405958756804, "num_tokens": 607272519.0, "step": 706 }, { "entropy": 0.3961181640625, "epoch": 5.611111111111111, "grad_norm": 0.6167631677238792, "learning_rate": 1.1917852310205147e-07, "loss": 0.3279, "mean_token_accuracy": 0.883335932623595, "num_tokens": 608126636.0, "step": 707 }, { "entropy": 0.3947296142578125, "epoch": 5.619047619047619, "grad_norm": 0.6123371507088005, "learning_rate": 1.1447714735884463e-07, "loss": 0.3197, "mean_token_accuracy": 0.8852512533776462, "num_tokens": 608972728.0, "step": 708 }, { "entropy": 0.3878631591796875, "epoch": 5.6269841269841265, "grad_norm": 0.6206905437393018, "learning_rate": 1.0986930382930916e-07, "loss": 0.3251, "mean_token_accuracy": 0.8831897312775254, "num_tokens": 609880767.0, "step": 709 }, { "entropy": 0.39312744140625, "epoch": 5.634920634920634, "grad_norm": 0.6362558759153903, "learning_rate": 1.0535508072955225e-07, "loss": 0.3284, "mean_token_accuracy": 0.8815909679979086, "num_tokens": 610732354.0, "step": 710 }, { "entropy": 0.3891448974609375, "epoch": 5.642857142857143, "grad_norm": 0.6132840766865005, "learning_rate": 1.0093456448333872e-07, "loss": 0.3259, "mean_token_accuracy": 0.8822133978828788, "num_tokens": 611602054.0, "step": 711 }, { "entropy": 0.393310546875, "epoch": 5.650793650793651, "grad_norm": 0.6300293214423813, "learning_rate": 9.660783972043786e-08, "loss": 0.3285, "mean_token_accuracy": 0.8818607972934842, "num_tokens": 612448801.0, "step": 712 }, { "entropy": 0.3934173583984375, "epoch": 5.658730158730159, "grad_norm": 0.6904998334631229, "learning_rate": 9.237498927500088e-08, "loss": 0.3302, "mean_token_accuracy": 0.881088858935982, "num_tokens": 613302281.0, "step": 713 }, { "entropy": 0.3985748291015625, "epoch": 5.666666666666667, "grad_norm": 0.6336591385840509, "learning_rate": 8.823609418397939e-08, "loss": 0.324, "mean_token_accuracy": 0.8830747129395604, "num_tokens": 614133343.0, "step": 714 }, { "entropy": 0.3961181640625, "epoch": 5.674603174603175, "grad_norm": 0.6560814482182532, "learning_rate": 8.419123368556991e-08, "loss": 0.3281, "mean_token_accuracy": 0.8820374673232436, "num_tokens": 614971152.0, "step": 715 }, { "entropy": 0.395751953125, "epoch": 5.682539682539683, "grad_norm": 0.61946329385844, "learning_rate": 8.024048521769745e-08, "loss": 0.3244, "mean_token_accuracy": 0.8853642977774143, "num_tokens": 615848347.0, "step": 716 }, { "entropy": 0.3966522216796875, "epoch": 5.690476190476191, "grad_norm": 0.6677220378111695, "learning_rate": 7.638392441653542e-08, "loss": 0.3315, "mean_token_accuracy": 0.8841713918372989, "num_tokens": 616696985.0, "step": 717 }, { "entropy": 0.386993408203125, "epoch": 5.698412698412699, "grad_norm": 0.6448605664251128, "learning_rate": 7.262162511505466e-08, "loss": 0.323, "mean_token_accuracy": 0.8844177662394941, "num_tokens": 617578479.0, "step": 718 }, { "entropy": 0.3964385986328125, "epoch": 5.7063492063492065, "grad_norm": 0.6465949886444489, "learning_rate": 6.895365934161236e-08, "loss": 0.3265, "mean_token_accuracy": 0.883903234731406, "num_tokens": 618415443.0, "step": 719 }, { "entropy": 0.3905792236328125, "epoch": 5.714285714285714, "grad_norm": 0.7626270402963101, "learning_rate": 6.538009731857087e-08, "loss": 0.3266, "mean_token_accuracy": 0.8833001307211816, "num_tokens": 619287864.0, "step": 720 }, { "entropy": 0.3910675048828125, "epoch": 5.722222222222222, "grad_norm": 0.6559784419468065, "learning_rate": 6.190100746095495e-08, "loss": 0.3243, "mean_token_accuracy": 0.8850177507847548, "num_tokens": 620198481.0, "step": 721 }, { "entropy": 0.3970184326171875, "epoch": 5.73015873015873, "grad_norm": 0.6389982422035683, "learning_rate": 5.851645637514114e-08, "loss": 0.327, "mean_token_accuracy": 0.8827162678353488, "num_tokens": 621065449.0, "step": 722 }, { "entropy": 0.3971710205078125, "epoch": 5.738095238095238, "grad_norm": 0.6140180252911043, "learning_rate": 5.522650885758374e-08, "loss": 0.3204, "mean_token_accuracy": 0.8850936810486019, "num_tokens": 621906875.0, "step": 723 }, { "entropy": 0.3934326171875, "epoch": 5.746031746031746, "grad_norm": 0.6533501187294389, "learning_rate": 5.203122789357307e-08, "loss": 0.3342, "mean_token_accuracy": 0.881910024676472, "num_tokens": 622774268.0, "step": 724 }, { "entropy": 0.3961334228515625, "epoch": 5.753968253968254, "grad_norm": 0.6833130980259546, "learning_rate": 4.893067465602863e-08, "loss": 0.3307, "mean_token_accuracy": 0.8820976046845317, "num_tokens": 623625107.0, "step": 725 }, { "entropy": 0.3968048095703125, "epoch": 5.761904761904762, "grad_norm": 0.763571756315237, "learning_rate": 4.5924908504331735e-08, "loss": 0.3303, "mean_token_accuracy": 0.8829025984741747, "num_tokens": 624511530.0, "step": 726 }, { "entropy": 0.392822265625, "epoch": 5.76984126984127, "grad_norm": 0.6168843969129469, "learning_rate": 4.3013986983184705e-08, "loss": 0.3234, "mean_token_accuracy": 0.884893387556076, "num_tokens": 625349169.0, "step": 727 }, { "entropy": 0.393035888671875, "epoch": 5.777777777777778, "grad_norm": 0.6294236264104904, "learning_rate": 4.019796582151181e-08, "loss": 0.3231, "mean_token_accuracy": 0.883484820369631, "num_tokens": 626205023.0, "step": 728 }, { "entropy": 0.392425537109375, "epoch": 5.785714285714286, "grad_norm": 0.7014456722284323, "learning_rate": 3.747689893139228e-08, "loss": 0.3253, "mean_token_accuracy": 0.884223835542798, "num_tokens": 627072409.0, "step": 729 }, { "entropy": 0.39483642578125, "epoch": 5.7936507936507935, "grad_norm": 0.6354033492793867, "learning_rate": 3.4850838407027297e-08, "loss": 0.3351, "mean_token_accuracy": 0.8803266729228199, "num_tokens": 627933762.0, "step": 730 }, { "entropy": 0.392974853515625, "epoch": 5.801587301587301, "grad_norm": 0.6324944348230324, "learning_rate": 3.2319834523742435e-08, "loss": 0.3248, "mean_token_accuracy": 0.8832277562469244, "num_tokens": 628815182.0, "step": 731 }, { "entropy": 0.3950042724609375, "epoch": 5.809523809523809, "grad_norm": 0.629955389311813, "learning_rate": 2.988393573702675e-08, "loss": 0.3201, "mean_token_accuracy": 0.884325556922704, "num_tokens": 629666458.0, "step": 732 }, { "entropy": 0.3938446044921875, "epoch": 5.817460317460317, "grad_norm": 0.6166361263236042, "learning_rate": 2.754318868160244e-08, "loss": 0.3221, "mean_token_accuracy": 0.8852822785265744, "num_tokens": 630529787.0, "step": 733 }, { "entropy": 0.397705078125, "epoch": 5.825396825396825, "grad_norm": 0.644139271435762, "learning_rate": 2.5297638170535542e-08, "loss": 0.3219, "mean_token_accuracy": 0.8845392796210945, "num_tokens": 631374518.0, "step": 734 }, { "entropy": 0.3916778564453125, "epoch": 5.833333333333333, "grad_norm": 0.6747464037741341, "learning_rate": 2.31473271943744e-08, "loss": 0.3364, "mean_token_accuracy": 0.8802501196041703, "num_tokens": 632234249.0, "step": 735 }, { "entropy": 0.3955078125, "epoch": 5.841269841269841, "grad_norm": 0.639174800920214, "learning_rate": 2.109229692032977e-08, "loss": 0.3255, "mean_token_accuracy": 0.8846253966912627, "num_tokens": 633096164.0, "step": 736 }, { "entropy": 0.3962554931640625, "epoch": 5.849206349206349, "grad_norm": 0.6365328488931461, "learning_rate": 1.9132586691484323e-08, "loss": 0.32, "mean_token_accuracy": 0.8840158293023705, "num_tokens": 633966696.0, "step": 737 }, { "entropy": 0.396240234375, "epoch": 5.857142857142857, "grad_norm": 0.6274839579353274, "learning_rate": 1.7268234026041053e-08, "loss": 0.3254, "mean_token_accuracy": 0.8834780002944171, "num_tokens": 634817676.0, "step": 738 }, { "entropy": 0.394012451171875, "epoch": 5.865079365079366, "grad_norm": 0.6482673934422294, "learning_rate": 1.5499274616602723e-08, "loss": 0.3246, "mean_token_accuracy": 0.8846540525555611, "num_tokens": 635677818.0, "step": 739 }, { "entropy": 0.3939056396484375, "epoch": 5.8730158730158735, "grad_norm": 0.6092517161901388, "learning_rate": 1.3825742329492408e-08, "loss": 0.3286, "mean_token_accuracy": 0.8829479278065264, "num_tokens": 636552146.0, "step": 740 }, { "entropy": 0.3929901123046875, "epoch": 5.880952380952381, "grad_norm": 0.6568730517308575, "learning_rate": 1.2247669204100699e-08, "loss": 0.3324, "mean_token_accuracy": 0.8809677893295884, "num_tokens": 637434649.0, "step": 741 }, { "entropy": 0.3914947509765625, "epoch": 5.888888888888889, "grad_norm": 0.6200572094186191, "learning_rate": 1.0765085452275614e-08, "loss": 0.3292, "mean_token_accuracy": 0.8837377014569938, "num_tokens": 638294457.0, "step": 742 }, { "entropy": 0.3955535888671875, "epoch": 5.896825396825397, "grad_norm": 0.6375108635856576, "learning_rate": 9.378019457743082e-09, "loss": 0.3276, "mean_token_accuracy": 0.8823563028126955, "num_tokens": 639157905.0, "step": 743 }, { "entropy": 0.3954620361328125, "epoch": 5.904761904761905, "grad_norm": 0.6632626274807544, "learning_rate": 8.086497775562918e-09, "loss": 0.3306, "mean_token_accuracy": 0.8825922501273453, "num_tokens": 640011175.0, "step": 744 }, { "entropy": 0.3955535888671875, "epoch": 5.912698412698413, "grad_norm": 0.6203028600677455, "learning_rate": 6.890545131621462e-09, "loss": 0.3296, "mean_token_accuracy": 0.8806997863575816, "num_tokens": 640861100.0, "step": 745 }, { "entropy": 0.3971710205078125, "epoch": 5.920634920634921, "grad_norm": 0.608418609243511, "learning_rate": 5.790184422158063e-09, "loss": 0.3201, "mean_token_accuracy": 0.8849551575258374, "num_tokens": 641710588.0, "step": 746 }, { "entropy": 0.398040771484375, "epoch": 5.928571428571429, "grad_norm": 0.6703364371814936, "learning_rate": 4.785436713324876e-09, "loss": 0.3223, "mean_token_accuracy": 0.8841964863240719, "num_tokens": 642574184.0, "step": 747 }, { "entropy": 0.390533447265625, "epoch": 5.936507936507937, "grad_norm": 0.6254194421549316, "learning_rate": 3.876321240786629e-09, "loss": 0.3255, "mean_token_accuracy": 0.8837793176062405, "num_tokens": 643427789.0, "step": 748 }, { "entropy": 0.391357421875, "epoch": 5.944444444444445, "grad_norm": 0.6047693920470687, "learning_rate": 3.062855409350918e-09, "loss": 0.3226, "mean_token_accuracy": 0.8838337506167591, "num_tokens": 644312139.0, "step": 749 }, { "entropy": 0.3914794921875, "epoch": 5.9523809523809526, "grad_norm": 0.620245834415008, "learning_rate": 2.345054792634027e-09, "loss": 0.3192, "mean_token_accuracy": 0.8871927126310766, "num_tokens": 645164595.0, "step": 750 }, { "entropy": 0.3953399658203125, "epoch": 5.9603174603174605, "grad_norm": 0.597155253447394, "learning_rate": 1.7229331327633935e-09, "loss": 0.3258, "mean_token_accuracy": 0.8841970260255039, "num_tokens": 646011434.0, "step": 751 }, { "entropy": 0.3984527587890625, "epoch": 5.968253968253968, "grad_norm": 0.6471414852719458, "learning_rate": 1.1965023401161457e-09, "loss": 0.3224, "mean_token_accuracy": 0.8848558394238353, "num_tokens": 646838463.0, "step": 752 }, { "entropy": 0.3917388916015625, "epoch": 5.976190476190476, "grad_norm": 0.6216717165417032, "learning_rate": 7.657724930887344e-10, "loss": 0.3189, "mean_token_accuracy": 0.8857344668358564, "num_tokens": 647701193.0, "step": 753 }, { "entropy": 0.398895263671875, "epoch": 5.984126984126984, "grad_norm": 0.6120300659931632, "learning_rate": 4.3075183790541875e-10, "loss": 0.3228, "mean_token_accuracy": 0.8856973070651293, "num_tokens": 648543554.0, "step": 754 }, { "entropy": 0.39404296875, "epoch": 5.992063492063492, "grad_norm": 0.6483787873623111, "learning_rate": 1.9144678845950393e-10, "loss": 0.3344, "mean_token_accuracy": 0.8800923773087561, "num_tokens": 649397139.0, "step": 755 }, { "entropy": 0.3967437744140625, "epoch": 6.0, "grad_norm": 0.6554944488124251, "learning_rate": 4.786192619121721e-11, "loss": 0.3239, "mean_token_accuracy": 0.883914896287024, "num_tokens": 650235668.0, "step": 756 }, { "epoch": 6.0, "step": 756, "total_flos": 1202499003482112.0, "train_loss": 0.440722111040953, "train_runtime": 115067.3138, "train_samples_per_second": 1.28, "train_steps_per_second": 0.007 } ], "logging_steps": 1, "max_steps": 756, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 63, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1202499003482112.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }