Files
P2-split2_prob_Qwen3-8B-Bas…/trainer_state.json
ModelHub XC b1ed679e98 初始化项目,由ModelHub XC社区提供模型
Model: Hyeongwon/P2-split2_prob_Qwen3-8B-Base_0325-04-bs128-lr1e-5-epoch6
Source: Original Platform
2026-05-13 06:11:35 +08:00

7604 lines
215 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 500,
"global_step": 756,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.5677337646484375,
"epoch": 0.007936507936507936,
"grad_norm": 5.825740150213408,
"learning_rate": 0.0,
"loss": 1.3956,
"mean_token_accuracy": 0.6547382255084813,
"num_tokens": 849869.0,
"step": 1
},
{
"entropy": 0.569549560546875,
"epoch": 0.015873015873015872,
"grad_norm": 5.801156934108041,
"learning_rate": 2.6315789473684213e-07,
"loss": 1.4001,
"mean_token_accuracy": 0.6515501267276704,
"num_tokens": 1710146.0,
"step": 2
},
{
"entropy": 0.5733184814453125,
"epoch": 0.023809523809523808,
"grad_norm": 5.697225721311094,
"learning_rate": 5.263157894736843e-07,
"loss": 1.3825,
"mean_token_accuracy": 0.6571523365564644,
"num_tokens": 2560005.0,
"step": 3
},
{
"entropy": 0.5648651123046875,
"epoch": 0.031746031746031744,
"grad_norm": 5.692098743617845,
"learning_rate": 7.894736842105263e-07,
"loss": 1.3997,
"mean_token_accuracy": 0.65298081189394,
"num_tokens": 3457966.0,
"step": 4
},
{
"entropy": 0.57421875,
"epoch": 0.03968253968253968,
"grad_norm": 5.78106605094393,
"learning_rate": 1.0526315789473685e-06,
"loss": 1.4008,
"mean_token_accuracy": 0.6524212104268372,
"num_tokens": 4321827.0,
"step": 5
},
{
"entropy": 0.5650482177734375,
"epoch": 0.047619047619047616,
"grad_norm": 5.616712544244806,
"learning_rate": 1.3157894736842106e-06,
"loss": 1.3776,
"mean_token_accuracy": 0.6610458297654986,
"num_tokens": 5188122.0,
"step": 6
},
{
"entropy": 0.5651702880859375,
"epoch": 0.05555555555555555,
"grad_norm": 5.431878258315051,
"learning_rate": 1.5789473684210526e-06,
"loss": 1.3756,
"mean_token_accuracy": 0.6562704290263355,
"num_tokens": 6042413.0,
"step": 7
},
{
"entropy": 0.5759429931640625,
"epoch": 0.06349206349206349,
"grad_norm": 5.337410424762961,
"learning_rate": 1.8421052631578948e-06,
"loss": 1.3735,
"mean_token_accuracy": 0.6550004091113806,
"num_tokens": 6898441.0,
"step": 8
},
{
"entropy": 0.562835693359375,
"epoch": 0.07142857142857142,
"grad_norm": 5.254246336716741,
"learning_rate": 2.105263157894737e-06,
"loss": 1.3621,
"mean_token_accuracy": 0.6594638815149665,
"num_tokens": 7794638.0,
"step": 9
},
{
"entropy": 0.5644378662109375,
"epoch": 0.07936507936507936,
"grad_norm": 4.504374040638264,
"learning_rate": 2.368421052631579e-06,
"loss": 1.3207,
"mean_token_accuracy": 0.6636323751881719,
"num_tokens": 8673402.0,
"step": 10
},
{
"entropy": 0.5662384033203125,
"epoch": 0.0873015873015873,
"grad_norm": 4.290465182305912,
"learning_rate": 2.631578947368421e-06,
"loss": 1.2869,
"mean_token_accuracy": 0.6718250620178878,
"num_tokens": 9525436.0,
"step": 11
},
{
"entropy": 0.573974609375,
"epoch": 0.09523809523809523,
"grad_norm": 4.112783175310539,
"learning_rate": 2.8947368421052634e-06,
"loss": 1.2744,
"mean_token_accuracy": 0.6722556869499385,
"num_tokens": 10358777.0,
"step": 12
},
{
"entropy": 0.5698089599609375,
"epoch": 0.10317460317460317,
"grad_norm": 3.231368850972016,
"learning_rate": 3.157894736842105e-06,
"loss": 1.1762,
"mean_token_accuracy": 0.6900928025133908,
"num_tokens": 11211677.0,
"step": 13
},
{
"entropy": 0.568206787109375,
"epoch": 0.1111111111111111,
"grad_norm": 3.2472932637870775,
"learning_rate": 3.421052631578948e-06,
"loss": 1.1595,
"mean_token_accuracy": 0.6925145331770182,
"num_tokens": 12067363.0,
"step": 14
},
{
"entropy": 0.5522003173828125,
"epoch": 0.11904761904761904,
"grad_norm": 2.955470813245027,
"learning_rate": 3.6842105263157896e-06,
"loss": 1.1484,
"mean_token_accuracy": 0.6949247056618333,
"num_tokens": 12945458.0,
"step": 15
},
{
"entropy": 0.5524139404296875,
"epoch": 0.12698412698412698,
"grad_norm": 2.953534618001751,
"learning_rate": 3.947368421052632e-06,
"loss": 1.1231,
"mean_token_accuracy": 0.6996096298098564,
"num_tokens": 13815066.0,
"step": 16
},
{
"entropy": 0.5273590087890625,
"epoch": 0.1349206349206349,
"grad_norm": 3.6738847590405457,
"learning_rate": 4.210526315789474e-06,
"loss": 1.0456,
"mean_token_accuracy": 0.715466492343694,
"num_tokens": 14685173.0,
"step": 17
},
{
"entropy": 0.5328826904296875,
"epoch": 0.14285714285714285,
"grad_norm": 3.975231208187267,
"learning_rate": 4.473684210526316e-06,
"loss": 1.0218,
"mean_token_accuracy": 0.7184609142132103,
"num_tokens": 15522062.0,
"step": 18
},
{
"entropy": 0.5339813232421875,
"epoch": 0.15079365079365079,
"grad_norm": 3.7568291660582056,
"learning_rate": 4.736842105263158e-06,
"loss": 0.9846,
"mean_token_accuracy": 0.7254857295192778,
"num_tokens": 16388252.0,
"step": 19
},
{
"entropy": 0.539520263671875,
"epoch": 0.15873015873015872,
"grad_norm": 3.256772081860255,
"learning_rate": 5e-06,
"loss": 0.9572,
"mean_token_accuracy": 0.7309625665657222,
"num_tokens": 17235205.0,
"step": 20
},
{
"entropy": 0.5419464111328125,
"epoch": 0.16666666666666666,
"grad_norm": 2.7287809182707607,
"learning_rate": 5.263157894736842e-06,
"loss": 0.9449,
"mean_token_accuracy": 0.734284377656877,
"num_tokens": 18090069.0,
"step": 21
},
{
"entropy": 0.541290283203125,
"epoch": 0.1746031746031746,
"grad_norm": 2.661465266751511,
"learning_rate": 5.526315789473685e-06,
"loss": 0.9245,
"mean_token_accuracy": 0.7380136135034263,
"num_tokens": 18955962.0,
"step": 22
},
{
"entropy": 0.5420989990234375,
"epoch": 0.18253968253968253,
"grad_norm": 2.1871180143795454,
"learning_rate": 5.789473684210527e-06,
"loss": 0.8806,
"mean_token_accuracy": 0.7492561861872673,
"num_tokens": 19812261.0,
"step": 23
},
{
"entropy": 0.5357818603515625,
"epoch": 0.19047619047619047,
"grad_norm": 2.9467428670698266,
"learning_rate": 6.0526315789473685e-06,
"loss": 0.8643,
"mean_token_accuracy": 0.7490303930826485,
"num_tokens": 20647709.0,
"step": 24
},
{
"entropy": 0.52435302734375,
"epoch": 0.1984126984126984,
"grad_norm": 2.907660231508903,
"learning_rate": 6.31578947368421e-06,
"loss": 0.861,
"mean_token_accuracy": 0.7502002012915909,
"num_tokens": 21486371.0,
"step": 25
},
{
"entropy": 0.5368194580078125,
"epoch": 0.20634920634920634,
"grad_norm": 2.490760554791867,
"learning_rate": 6.578947368421054e-06,
"loss": 0.8441,
"mean_token_accuracy": 0.7527890643104911,
"num_tokens": 22318414.0,
"step": 26
},
{
"entropy": 0.5297393798828125,
"epoch": 0.21428571428571427,
"grad_norm": 2.1289542070292082,
"learning_rate": 6.842105263157896e-06,
"loss": 0.8129,
"mean_token_accuracy": 0.7607404845766723,
"num_tokens": 23134604.0,
"step": 27
},
{
"entropy": 0.51568603515625,
"epoch": 0.2222222222222222,
"grad_norm": 1.8284519133414685,
"learning_rate": 7.1052631578947375e-06,
"loss": 0.8081,
"mean_token_accuracy": 0.7623137319460511,
"num_tokens": 24009770.0,
"step": 28
},
{
"entropy": 0.52362060546875,
"epoch": 0.23015873015873015,
"grad_norm": 1.9471502088574602,
"learning_rate": 7.368421052631579e-06,
"loss": 0.8048,
"mean_token_accuracy": 0.762959006242454,
"num_tokens": 24869806.0,
"step": 29
},
{
"entropy": 0.512420654296875,
"epoch": 0.23809523809523808,
"grad_norm": 1.745367819572919,
"learning_rate": 7.631578947368423e-06,
"loss": 0.7896,
"mean_token_accuracy": 0.7669625347480178,
"num_tokens": 25762999.0,
"step": 30
},
{
"entropy": 0.5128631591796875,
"epoch": 0.24603174603174602,
"grad_norm": 2.0285955824692774,
"learning_rate": 7.894736842105265e-06,
"loss": 0.7605,
"mean_token_accuracy": 0.77361392788589,
"num_tokens": 26651412.0,
"step": 31
},
{
"entropy": 0.51251220703125,
"epoch": 0.25396825396825395,
"grad_norm": 1.8504240732599995,
"learning_rate": 8.157894736842106e-06,
"loss": 0.7568,
"mean_token_accuracy": 0.7740332204848528,
"num_tokens": 27520069.0,
"step": 32
},
{
"entropy": 0.5124359130859375,
"epoch": 0.2619047619047619,
"grad_norm": 1.5796846536434936,
"learning_rate": 8.421052631578948e-06,
"loss": 0.7509,
"mean_token_accuracy": 0.7743185707367957,
"num_tokens": 28367541.0,
"step": 33
},
{
"entropy": 0.506439208984375,
"epoch": 0.2698412698412698,
"grad_norm": 1.3378034052723207,
"learning_rate": 8.68421052631579e-06,
"loss": 0.7161,
"mean_token_accuracy": 0.7827055719681084,
"num_tokens": 29217570.0,
"step": 34
},
{
"entropy": 0.4996490478515625,
"epoch": 0.2777777777777778,
"grad_norm": 1.679208635550248,
"learning_rate": 8.947368421052632e-06,
"loss": 0.7187,
"mean_token_accuracy": 0.7833654824644327,
"num_tokens": 30088869.0,
"step": 35
},
{
"entropy": 0.4967041015625,
"epoch": 0.2857142857142857,
"grad_norm": 1.6387552610606482,
"learning_rate": 9.210526315789474e-06,
"loss": 0.7145,
"mean_token_accuracy": 0.7830667225643992,
"num_tokens": 30959821.0,
"step": 36
},
{
"entropy": 0.500274658203125,
"epoch": 0.29365079365079366,
"grad_norm": 1.371112117801989,
"learning_rate": 9.473684210526315e-06,
"loss": 0.7097,
"mean_token_accuracy": 0.7837298880331218,
"num_tokens": 31830767.0,
"step": 37
},
{
"entropy": 0.5020751953125,
"epoch": 0.30158730158730157,
"grad_norm": 1.6810348517072142,
"learning_rate": 9.736842105263159e-06,
"loss": 0.6954,
"mean_token_accuracy": 0.7868552934378386,
"num_tokens": 32692726.0,
"step": 38
},
{
"entropy": 0.5,
"epoch": 0.30952380952380953,
"grad_norm": 1.3054950683288973,
"learning_rate": 1e-05,
"loss": 0.669,
"mean_token_accuracy": 0.7914999099448323,
"num_tokens": 33543076.0,
"step": 39
},
{
"entropy": 0.4976806640625,
"epoch": 0.31746031746031744,
"grad_norm": 1.5523188226474438,
"learning_rate": 9.99995213807381e-06,
"loss": 0.677,
"mean_token_accuracy": 0.7901066686026752,
"num_tokens": 34404352.0,
"step": 40
},
{
"entropy": 0.5042724609375,
"epoch": 0.3253968253968254,
"grad_norm": 1.3329229488550147,
"learning_rate": 9.99980855321154e-06,
"loss": 0.6721,
"mean_token_accuracy": 0.7911685910075903,
"num_tokens": 35236933.0,
"step": 41
},
{
"entropy": 0.491058349609375,
"epoch": 0.3333333333333333,
"grad_norm": 1.2836381282552631,
"learning_rate": 9.999569248162095e-06,
"loss": 0.662,
"mean_token_accuracy": 0.7938097110018134,
"num_tokens": 36088050.0,
"step": 42
},
{
"entropy": 0.491973876953125,
"epoch": 0.3412698412698413,
"grad_norm": 1.5436048820238097,
"learning_rate": 9.999234227506912e-06,
"loss": 0.659,
"mean_token_accuracy": 0.7951631429605186,
"num_tokens": 36942407.0,
"step": 43
},
{
"entropy": 0.488128662109375,
"epoch": 0.3492063492063492,
"grad_norm": 1.5701418827048832,
"learning_rate": 9.998803497659885e-06,
"loss": 0.6567,
"mean_token_accuracy": 0.7952465042471886,
"num_tokens": 37803882.0,
"step": 44
},
{
"entropy": 0.4854278564453125,
"epoch": 0.35714285714285715,
"grad_norm": 1.453745925853672,
"learning_rate": 9.998277066867236e-06,
"loss": 0.6557,
"mean_token_accuracy": 0.7963202544488013,
"num_tokens": 38667329.0,
"step": 45
},
{
"entropy": 0.4832763671875,
"epoch": 0.36507936507936506,
"grad_norm": 1.4543393351762242,
"learning_rate": 9.997654945207368e-06,
"loss": 0.6415,
"mean_token_accuracy": 0.7979013016447425,
"num_tokens": 39524166.0,
"step": 46
},
{
"entropy": 0.47686767578125,
"epoch": 0.373015873015873,
"grad_norm": 1.2844075952499214,
"learning_rate": 9.99693714459065e-06,
"loss": 0.6314,
"mean_token_accuracy": 0.8014799957163632,
"num_tokens": 40389693.0,
"step": 47
},
{
"entropy": 0.485137939453125,
"epoch": 0.38095238095238093,
"grad_norm": 1.2055402617655024,
"learning_rate": 9.996123678759214e-06,
"loss": 0.6304,
"mean_token_accuracy": 0.8023288743570447,
"num_tokens": 41231841.0,
"step": 48
},
{
"entropy": 0.4888763427734375,
"epoch": 0.3888888888888889,
"grad_norm": 1.562622920851612,
"learning_rate": 9.995214563286677e-06,
"loss": 0.6315,
"mean_token_accuracy": 0.8009699960239232,
"num_tokens": 42082897.0,
"step": 49
},
{
"entropy": 0.4813079833984375,
"epoch": 0.3968253968253968,
"grad_norm": 1.1138521900837535,
"learning_rate": 9.994209815577843e-06,
"loss": 0.6138,
"mean_token_accuracy": 0.8051781668327749,
"num_tokens": 42941458.0,
"step": 50
},
{
"entropy": 0.4780731201171875,
"epoch": 0.40476190476190477,
"grad_norm": 1.4023404249187725,
"learning_rate": 9.993109454868379e-06,
"loss": 0.6123,
"mean_token_accuracy": 0.805035431869328,
"num_tokens": 43801380.0,
"step": 51
},
{
"entropy": 0.4847564697265625,
"epoch": 0.4126984126984127,
"grad_norm": 1.1362760383931592,
"learning_rate": 9.991913502224438e-06,
"loss": 0.6252,
"mean_token_accuracy": 0.8026665896177292,
"num_tokens": 44671335.0,
"step": 52
},
{
"entropy": 0.4954071044921875,
"epoch": 0.42063492063492064,
"grad_norm": 1.2459663164920876,
"learning_rate": 9.990621980542258e-06,
"loss": 0.6116,
"mean_token_accuracy": 0.8040637490339577,
"num_tokens": 45508166.0,
"step": 53
},
{
"entropy": 0.479949951171875,
"epoch": 0.42857142857142855,
"grad_norm": 1.3916959345611162,
"learning_rate": 9.989234914547725e-06,
"loss": 0.6209,
"mean_token_accuracy": 0.803072199691087,
"num_tokens": 46398974.0,
"step": 54
},
{
"entropy": 0.472991943359375,
"epoch": 0.4365079365079365,
"grad_norm": 1.2293838348325725,
"learning_rate": 9.9877523307959e-06,
"loss": 0.619,
"mean_token_accuracy": 0.8017513235099614,
"num_tokens": 47311746.0,
"step": 55
},
{
"entropy": 0.477813720703125,
"epoch": 0.4444444444444444,
"grad_norm": 1.3910584256010747,
"learning_rate": 9.986174257670509e-06,
"loss": 0.5928,
"mean_token_accuracy": 0.8104538763873279,
"num_tokens": 48177761.0,
"step": 56
},
{
"entropy": 0.4864501953125,
"epoch": 0.4523809523809524,
"grad_norm": 1.398508282793997,
"learning_rate": 9.984500725383397e-06,
"loss": 0.5977,
"mean_token_accuracy": 0.8092257836833596,
"num_tokens": 49026375.0,
"step": 57
},
{
"entropy": 0.4769744873046875,
"epoch": 0.4603174603174603,
"grad_norm": 1.5292432492685464,
"learning_rate": 9.98273176597396e-06,
"loss": 0.6023,
"mean_token_accuracy": 0.8064105985686183,
"num_tokens": 49889163.0,
"step": 58
},
{
"entropy": 0.484161376953125,
"epoch": 0.46825396825396826,
"grad_norm": 1.2391735146187919,
"learning_rate": 9.980867413308516e-06,
"loss": 0.5945,
"mean_token_accuracy": 0.8099995800293982,
"num_tokens": 50735361.0,
"step": 59
},
{
"entropy": 0.4808349609375,
"epoch": 0.47619047619047616,
"grad_norm": 1.1506873672230502,
"learning_rate": 9.978907703079672e-06,
"loss": 0.5839,
"mean_token_accuracy": 0.8109453665092587,
"num_tokens": 51597577.0,
"step": 60
},
{
"entropy": 0.4707794189453125,
"epoch": 0.48412698412698413,
"grad_norm": 1.1563025667287559,
"learning_rate": 9.976852672805625e-06,
"loss": 0.5933,
"mean_token_accuracy": 0.8103347043506801,
"num_tokens": 52482382.0,
"step": 61
},
{
"entropy": 0.47381591796875,
"epoch": 0.49206349206349204,
"grad_norm": 1.2114630802835358,
"learning_rate": 9.974702361829465e-06,
"loss": 0.587,
"mean_token_accuracy": 0.8098243419080973,
"num_tokens": 53370750.0,
"step": 62
},
{
"entropy": 0.4756927490234375,
"epoch": 0.5,
"grad_norm": 1.1306748677183436,
"learning_rate": 9.972456811318399e-06,
"loss": 0.5792,
"mean_token_accuracy": 0.8143061874434352,
"num_tokens": 54235189.0,
"step": 63
},
{
"entropy": 0.47760009765625,
"epoch": 0.5079365079365079,
"grad_norm": 1.1104384579683657,
"learning_rate": 9.970116064262975e-06,
"loss": 0.5711,
"mean_token_accuracy": 0.814652734901756,
"num_tokens": 55066936.0,
"step": 64
},
{
"entropy": 0.475128173828125,
"epoch": 0.5158730158730159,
"grad_norm": 1.130873737994674,
"learning_rate": 9.96768016547626e-06,
"loss": 0.5749,
"mean_token_accuracy": 0.8144003404304385,
"num_tokens": 55922405.0,
"step": 65
},
{
"entropy": 0.4756317138671875,
"epoch": 0.5238095238095238,
"grad_norm": 1.2543626844594944,
"learning_rate": 9.965149161592973e-06,
"loss": 0.5717,
"mean_token_accuracy": 0.8129943162202835,
"num_tokens": 56770275.0,
"step": 66
},
{
"entropy": 0.47332763671875,
"epoch": 0.5317460317460317,
"grad_norm": 1.164483305334561,
"learning_rate": 9.962523101068608e-06,
"loss": 0.5781,
"mean_token_accuracy": 0.8122508767992258,
"num_tokens": 57627870.0,
"step": 67
},
{
"entropy": 0.475616455078125,
"epoch": 0.5396825396825397,
"grad_norm": 1.0960487524124647,
"learning_rate": 9.959802034178489e-06,
"loss": 0.5661,
"mean_token_accuracy": 0.817779887933284,
"num_tokens": 58469884.0,
"step": 68
},
{
"entropy": 0.4727325439453125,
"epoch": 0.5476190476190477,
"grad_norm": 1.0760531755759473,
"learning_rate": 9.956986013016816e-06,
"loss": 0.5655,
"mean_token_accuracy": 0.815218704752624,
"num_tokens": 59323796.0,
"step": 69
},
{
"entropy": 0.472900390625,
"epoch": 0.5555555555555556,
"grad_norm": 1.150431448189494,
"learning_rate": 9.954075091495669e-06,
"loss": 0.5564,
"mean_token_accuracy": 0.8179643992334604,
"num_tokens": 60183841.0,
"step": 70
},
{
"entropy": 0.4703521728515625,
"epoch": 0.5634920634920635,
"grad_norm": 1.0833066927711814,
"learning_rate": 9.951069325343972e-06,
"loss": 0.5668,
"mean_token_accuracy": 0.8150419541634619,
"num_tokens": 61048601.0,
"step": 71
},
{
"entropy": 0.472991943359375,
"epoch": 0.5714285714285714,
"grad_norm": 1.1267753740162227,
"learning_rate": 9.947968772106428e-06,
"loss": 0.5683,
"mean_token_accuracy": 0.8151015248149633,
"num_tokens": 61887912.0,
"step": 72
},
{
"entropy": 0.468231201171875,
"epoch": 0.5793650793650794,
"grad_norm": 1.0041998917193997,
"learning_rate": 9.944773491142416e-06,
"loss": 0.561,
"mean_token_accuracy": 0.8171937335282564,
"num_tokens": 62741342.0,
"step": 73
},
{
"entropy": 0.464263916015625,
"epoch": 0.5873015873015873,
"grad_norm": 1.0641654456452165,
"learning_rate": 9.94148354362486e-06,
"loss": 0.5591,
"mean_token_accuracy": 0.8203174020163715,
"num_tokens": 63594617.0,
"step": 74
},
{
"entropy": 0.4626922607421875,
"epoch": 0.5952380952380952,
"grad_norm": 1.0150387789211928,
"learning_rate": 9.938098992539045e-06,
"loss": 0.5534,
"mean_token_accuracy": 0.8201709003187716,
"num_tokens": 64467695.0,
"step": 75
},
{
"entropy": 0.4632568359375,
"epoch": 0.6031746031746031,
"grad_norm": 1.1279871018458427,
"learning_rate": 9.93461990268143e-06,
"loss": 0.5589,
"mean_token_accuracy": 0.8168505723588169,
"num_tokens": 65333788.0,
"step": 76
},
{
"entropy": 0.4673309326171875,
"epoch": 0.6111111111111112,
"grad_norm": 1.1373063081267694,
"learning_rate": 9.931046340658387e-06,
"loss": 0.5494,
"mean_token_accuracy": 0.8186001246795058,
"num_tokens": 66178918.0,
"step": 77
},
{
"entropy": 0.4654693603515625,
"epoch": 0.6190476190476191,
"grad_norm": 0.987490904917509,
"learning_rate": 9.927378374884947e-06,
"loss": 0.5617,
"mean_token_accuracy": 0.8173182448372245,
"num_tokens": 67052342.0,
"step": 78
},
{
"entropy": 0.4614715576171875,
"epoch": 0.626984126984127,
"grad_norm": 1.158480909536903,
"learning_rate": 9.923616075583465e-06,
"loss": 0.5521,
"mean_token_accuracy": 0.8191495719365776,
"num_tokens": 67913391.0,
"step": 79
},
{
"entropy": 0.46258544921875,
"epoch": 0.6349206349206349,
"grad_norm": 1.0694507889690312,
"learning_rate": 9.919759514782304e-06,
"loss": 0.5518,
"mean_token_accuracy": 0.8191684056073427,
"num_tokens": 68772115.0,
"step": 80
},
{
"entropy": 0.4591064453125,
"epoch": 0.6428571428571429,
"grad_norm": 1.031285333594921,
"learning_rate": 9.91580876631443e-06,
"loss": 0.5395,
"mean_token_accuracy": 0.8231914453208447,
"num_tokens": 69611653.0,
"step": 81
},
{
"entropy": 0.4618072509765625,
"epoch": 0.6507936507936508,
"grad_norm": 1.2191568231885717,
"learning_rate": 9.91176390581602e-06,
"loss": 0.5609,
"mean_token_accuracy": 0.8178833266720176,
"num_tokens": 70496952.0,
"step": 82
},
{
"entropy": 0.4582672119140625,
"epoch": 0.6587301587301587,
"grad_norm": 1.098798237502869,
"learning_rate": 9.907625010724999e-06,
"loss": 0.5426,
"mean_token_accuracy": 0.8210734003223479,
"num_tokens": 71343921.0,
"step": 83
},
{
"entropy": 0.45758056640625,
"epoch": 0.6666666666666666,
"grad_norm": 1.0293307451137137,
"learning_rate": 9.903392160279564e-06,
"loss": 0.5547,
"mean_token_accuracy": 0.8177660717628896,
"num_tokens": 72240608.0,
"step": 84
},
{
"entropy": 0.4568939208984375,
"epoch": 0.6746031746031746,
"grad_norm": 1.1591213276198025,
"learning_rate": 9.899065435516661e-06,
"loss": 0.5419,
"mean_token_accuracy": 0.8213843265548348,
"num_tokens": 73118452.0,
"step": 85
},
{
"entropy": 0.45391845703125,
"epoch": 0.6825396825396826,
"grad_norm": 1.0203407509108033,
"learning_rate": 9.894644919270448e-06,
"loss": 0.5482,
"mean_token_accuracy": 0.8173369145952165,
"num_tokens": 73991069.0,
"step": 86
},
{
"entropy": 0.4616546630859375,
"epoch": 0.6904761904761905,
"grad_norm": 1.2761061587844562,
"learning_rate": 9.890130696170691e-06,
"loss": 0.5398,
"mean_token_accuracy": 0.8226454192772508,
"num_tokens": 74839901.0,
"step": 87
},
{
"entropy": 0.4576416015625,
"epoch": 0.6984126984126984,
"grad_norm": 1.0577284033847283,
"learning_rate": 9.885522852641156e-06,
"loss": 0.5527,
"mean_token_accuracy": 0.8187054474838078,
"num_tokens": 75749725.0,
"step": 88
},
{
"entropy": 0.465087890625,
"epoch": 0.7063492063492064,
"grad_norm": 1.1084468078084884,
"learning_rate": 9.880821476897948e-06,
"loss": 0.5456,
"mean_token_accuracy": 0.8205522131174803,
"num_tokens": 76593690.0,
"step": 89
},
{
"entropy": 0.466217041015625,
"epoch": 0.7142857142857143,
"grad_norm": 1.1432698793337261,
"learning_rate": 9.87602665894783e-06,
"loss": 0.5352,
"mean_token_accuracy": 0.8226723484694958,
"num_tokens": 77431030.0,
"step": 90
},
{
"entropy": 0.4627227783203125,
"epoch": 0.7222222222222222,
"grad_norm": 1.2237349934344866,
"learning_rate": 9.871138490586489e-06,
"loss": 0.54,
"mean_token_accuracy": 0.8220892632380128,
"num_tokens": 78278605.0,
"step": 91
},
{
"entropy": 0.457794189453125,
"epoch": 0.7301587301587301,
"grad_norm": 0.9735987604661573,
"learning_rate": 9.866157065396784e-06,
"loss": 0.5336,
"mean_token_accuracy": 0.8227689885534346,
"num_tokens": 79154216.0,
"step": 92
},
{
"entropy": 0.4553680419921875,
"epoch": 0.7380952380952381,
"grad_norm": 1.2060097781830723,
"learning_rate": 9.861082478746962e-06,
"loss": 0.5453,
"mean_token_accuracy": 0.8193097808398306,
"num_tokens": 80039204.0,
"step": 93
},
{
"entropy": 0.452606201171875,
"epoch": 0.746031746031746,
"grad_norm": 1.219116442820296,
"learning_rate": 9.855914827788814e-06,
"loss": 0.537,
"mean_token_accuracy": 0.8233561674132943,
"num_tokens": 80910348.0,
"step": 94
},
{
"entropy": 0.454315185546875,
"epoch": 0.753968253968254,
"grad_norm": 1.1176715433890458,
"learning_rate": 9.850654211455837e-06,
"loss": 0.5371,
"mean_token_accuracy": 0.8221336985006928,
"num_tokens": 81765325.0,
"step": 95
},
{
"entropy": 0.4631195068359375,
"epoch": 0.7619047619047619,
"grad_norm": 0.9620936989799753,
"learning_rate": 9.84530073046132e-06,
"loss": 0.5288,
"mean_token_accuracy": 0.8240236868150532,
"num_tokens": 82611125.0,
"step": 96
},
{
"entropy": 0.454254150390625,
"epoch": 0.7698412698412699,
"grad_norm": 1.0990268102179221,
"learning_rate": 9.83985448729643e-06,
"loss": 0.5376,
"mean_token_accuracy": 0.8205642709508538,
"num_tokens": 83488021.0,
"step": 97
},
{
"entropy": 0.45806884765625,
"epoch": 0.7777777777777778,
"grad_norm": 1.1036853861455271,
"learning_rate": 9.83431558622824e-06,
"loss": 0.533,
"mean_token_accuracy": 0.8217868432402611,
"num_tokens": 84321775.0,
"step": 98
},
{
"entropy": 0.4533843994140625,
"epoch": 0.7857142857142857,
"grad_norm": 1.1621319158175323,
"learning_rate": 9.828684133297738e-06,
"loss": 0.5223,
"mean_token_accuracy": 0.8267802041955292,
"num_tokens": 85167087.0,
"step": 99
},
{
"entropy": 0.453704833984375,
"epoch": 0.7936507936507936,
"grad_norm": 1.01710948689366,
"learning_rate": 9.822960236317804e-06,
"loss": 0.5324,
"mean_token_accuracy": 0.824018832296133,
"num_tokens": 86009383.0,
"step": 100
},
{
"entropy": 0.4491729736328125,
"epoch": 0.8015873015873016,
"grad_norm": 1.136196189273348,
"learning_rate": 9.817144004871127e-06,
"loss": 0.5249,
"mean_token_accuracy": 0.8261763895861804,
"num_tokens": 86862628.0,
"step": 101
},
{
"entropy": 0.447357177734375,
"epoch": 0.8095238095238095,
"grad_norm": 1.1653574778770674,
"learning_rate": 9.811235550308127e-06,
"loss": 0.5298,
"mean_token_accuracy": 0.8248972818255424,
"num_tokens": 87713395.0,
"step": 102
},
{
"entropy": 0.4496612548828125,
"epoch": 0.8174603174603174,
"grad_norm": 1.0197520810298888,
"learning_rate": 9.805234985744804e-06,
"loss": 0.5374,
"mean_token_accuracy": 0.8215886438265443,
"num_tokens": 88602545.0,
"step": 103
},
{
"entropy": 0.45355224609375,
"epoch": 0.8253968253968254,
"grad_norm": 1.0746283719190985,
"learning_rate": 9.799142426060595e-06,
"loss": 0.5211,
"mean_token_accuracy": 0.8281928705982864,
"num_tokens": 89444255.0,
"step": 104
},
{
"entropy": 0.4513702392578125,
"epoch": 0.8333333333333334,
"grad_norm": 0.9405502354816935,
"learning_rate": 9.792957987896154e-06,
"loss": 0.5164,
"mean_token_accuracy": 0.8303409847430885,
"num_tokens": 90312774.0,
"step": 105
},
{
"entropy": 0.4513397216796875,
"epoch": 0.8412698412698413,
"grad_norm": 0.9828002484074289,
"learning_rate": 9.786681789651134e-06,
"loss": 0.5167,
"mean_token_accuracy": 0.8268536222167313,
"num_tokens": 91177447.0,
"step": 106
},
{
"entropy": 0.452178955078125,
"epoch": 0.8492063492063492,
"grad_norm": 1.0029391226175481,
"learning_rate": 9.780313951481904e-06,
"loss": 0.5159,
"mean_token_accuracy": 0.8277767463587224,
"num_tokens": 92026164.0,
"step": 107
},
{
"entropy": 0.448455810546875,
"epoch": 0.8571428571428571,
"grad_norm": 1.0111683546148944,
"learning_rate": 9.773854595299269e-06,
"loss": 0.5158,
"mean_token_accuracy": 0.8292862558737397,
"num_tokens": 92891631.0,
"step": 108
},
{
"entropy": 0.4544219970703125,
"epoch": 0.8650793650793651,
"grad_norm": 1.0693593999923847,
"learning_rate": 9.767303844766118e-06,
"loss": 0.5228,
"mean_token_accuracy": 0.8271551127545536,
"num_tokens": 93760535.0,
"step": 109
},
{
"entropy": 0.4507293701171875,
"epoch": 0.873015873015873,
"grad_norm": 1.078077119226931,
"learning_rate": 9.760661825295068e-06,
"loss": 0.5188,
"mean_token_accuracy": 0.8268710542470217,
"num_tokens": 94599260.0,
"step": 110
},
{
"entropy": 0.45849609375,
"epoch": 0.8809523809523809,
"grad_norm": 1.1500541245910931,
"learning_rate": 9.753928664046055e-06,
"loss": 0.5188,
"mean_token_accuracy": 0.8258566916920245,
"num_tokens": 95425799.0,
"step": 111
},
{
"entropy": 0.455718994140625,
"epoch": 0.8888888888888888,
"grad_norm": 0.9411323026364743,
"learning_rate": 9.747104489923907e-06,
"loss": 0.5223,
"mean_token_accuracy": 0.8278644122183323,
"num_tokens": 96260271.0,
"step": 112
},
{
"entropy": 0.44830322265625,
"epoch": 0.8968253968253969,
"grad_norm": 1.0496976356343377,
"learning_rate": 9.740189433575873e-06,
"loss": 0.5209,
"mean_token_accuracy": 0.8264745082706213,
"num_tokens": 97135925.0,
"step": 113
},
{
"entropy": 0.4494781494140625,
"epoch": 0.9047619047619048,
"grad_norm": 1.1407666858849659,
"learning_rate": 9.733183627389117e-06,
"loss": 0.5223,
"mean_token_accuracy": 0.8253828585147858,
"num_tokens": 98011108.0,
"step": 114
},
{
"entropy": 0.4512481689453125,
"epoch": 0.9126984126984127,
"grad_norm": 1.042415494174418,
"learning_rate": 9.726087205488192e-06,
"loss": 0.5122,
"mean_token_accuracy": 0.827279772143811,
"num_tokens": 98873029.0,
"step": 115
},
{
"entropy": 0.4514007568359375,
"epoch": 0.9206349206349206,
"grad_norm": 1.2092755396739658,
"learning_rate": 9.718900303732465e-06,
"loss": 0.5205,
"mean_token_accuracy": 0.8262831498868763,
"num_tokens": 99734864.0,
"step": 116
},
{
"entropy": 0.4513702392578125,
"epoch": 0.9285714285714286,
"grad_norm": 1.1637681555356034,
"learning_rate": 9.711623059713522e-06,
"loss": 0.525,
"mean_token_accuracy": 0.826652648858726,
"num_tokens": 100606926.0,
"step": 117
},
{
"entropy": 0.45599365234375,
"epoch": 0.9365079365079365,
"grad_norm": 1.0209942008459238,
"learning_rate": 9.70425561275253e-06,
"loss": 0.5082,
"mean_token_accuracy": 0.8288828083314002,
"num_tokens": 101447599.0,
"step": 118
},
{
"entropy": 0.4554595947265625,
"epoch": 0.9444444444444444,
"grad_norm": 1.0671983733664188,
"learning_rate": 9.696798103897567e-06,
"loss": 0.5105,
"mean_token_accuracy": 0.8294458598829806,
"num_tokens": 102275358.0,
"step": 119
},
{
"entropy": 0.459686279296875,
"epoch": 0.9523809523809523,
"grad_norm": 1.043670325725254,
"learning_rate": 9.689250675920932e-06,
"loss": 0.5091,
"mean_token_accuracy": 0.8285032915882766,
"num_tokens": 103113293.0,
"step": 120
},
{
"entropy": 0.4480743408203125,
"epoch": 0.9603174603174603,
"grad_norm": 0.9316811204172604,
"learning_rate": 9.6816134733164e-06,
"loss": 0.5139,
"mean_token_accuracy": 0.8276259712874889,
"num_tokens": 104000550.0,
"step": 121
},
{
"entropy": 0.448577880859375,
"epoch": 0.9682539682539683,
"grad_norm": 1.0919546140808916,
"learning_rate": 9.67388664229646e-06,
"loss": 0.5134,
"mean_token_accuracy": 0.8282599356025457,
"num_tokens": 104859286.0,
"step": 122
},
{
"entropy": 0.448150634765625,
"epoch": 0.9761904761904762,
"grad_norm": 0.999620140999962,
"learning_rate": 9.66607033078952e-06,
"loss": 0.5131,
"mean_token_accuracy": 0.827931288164109,
"num_tokens": 105729675.0,
"step": 123
},
{
"entropy": 0.4438629150390625,
"epoch": 0.9841269841269841,
"grad_norm": 0.9685899298092844,
"learning_rate": 9.658164688437073e-06,
"loss": 0.5098,
"mean_token_accuracy": 0.8288385523483157,
"num_tokens": 106603968.0,
"step": 124
},
{
"entropy": 0.44073486328125,
"epoch": 0.9920634920634921,
"grad_norm": 0.9358281617577068,
"learning_rate": 9.65016986659082e-06,
"loss": 0.5053,
"mean_token_accuracy": 0.8307707700878382,
"num_tokens": 107495007.0,
"step": 125
},
{
"entropy": 0.4453887939453125,
"epoch": 1.0,
"grad_norm": 1.0968390964445407,
"learning_rate": 9.642086018309798e-06,
"loss": 0.5189,
"mean_token_accuracy": 0.8269711588509381,
"num_tokens": 108364335.0,
"step": 126
},
{
"entropy": 0.4580535888671875,
"epoch": 1.007936507936508,
"grad_norm": 1.0332257753415273,
"learning_rate": 9.63391329835742e-06,
"loss": 0.4924,
"mean_token_accuracy": 0.8341724565252662,
"num_tokens": 109202665.0,
"step": 127
},
{
"entropy": 0.443359375,
"epoch": 1.0158730158730158,
"grad_norm": 0.9791939186386783,
"learning_rate": 9.625651863198538e-06,
"loss": 0.4937,
"mean_token_accuracy": 0.8334903731010854,
"num_tokens": 110061605.0,
"step": 128
},
{
"entropy": 0.4462127685546875,
"epoch": 1.0238095238095237,
"grad_norm": 0.9767524502632213,
"learning_rate": 9.617301870996432e-06,
"loss": 0.4907,
"mean_token_accuracy": 0.8331266730092466,
"num_tokens": 110924491.0,
"step": 129
},
{
"entropy": 0.4455108642578125,
"epoch": 1.0317460317460316,
"grad_norm": 1.0445136711209406,
"learning_rate": 9.608863481609784e-06,
"loss": 0.4924,
"mean_token_accuracy": 0.8337863022461534,
"num_tokens": 111773832.0,
"step": 130
},
{
"entropy": 0.4438018798828125,
"epoch": 1.0396825396825398,
"grad_norm": 0.9414725400905224,
"learning_rate": 9.600336856589622e-06,
"loss": 0.4856,
"mean_token_accuracy": 0.835617205593735,
"num_tokens": 112637470.0,
"step": 131
},
{
"entropy": 0.44439697265625,
"epoch": 1.0476190476190477,
"grad_norm": 0.9341197433359306,
"learning_rate": 9.591722159176229e-06,
"loss": 0.4942,
"mean_token_accuracy": 0.8329057167284191,
"num_tokens": 113501590.0,
"step": 132
},
{
"entropy": 0.4480133056640625,
"epoch": 1.0555555555555556,
"grad_norm": 0.9687991138519816,
"learning_rate": 9.583019554296004e-06,
"loss": 0.4921,
"mean_token_accuracy": 0.8349612141028047,
"num_tokens": 114360533.0,
"step": 133
},
{
"entropy": 0.44232177734375,
"epoch": 1.0634920634920635,
"grad_norm": 0.9629260310473902,
"learning_rate": 9.574229208558322e-06,
"loss": 0.4899,
"mean_token_accuracy": 0.8340120441280305,
"num_tokens": 115231953.0,
"step": 134
},
{
"entropy": 0.443511962890625,
"epoch": 1.0714285714285714,
"grad_norm": 0.904231182313968,
"learning_rate": 9.565351290252339e-06,
"loss": 0.4865,
"mean_token_accuracy": 0.8350210129283369,
"num_tokens": 116092221.0,
"step": 135
},
{
"entropy": 0.443817138671875,
"epoch": 1.0793650793650793,
"grad_norm": 1.0016105467556813,
"learning_rate": 9.556385969343756e-06,
"loss": 0.493,
"mean_token_accuracy": 0.8340729284100235,
"num_tokens": 116984739.0,
"step": 136
},
{
"entropy": 0.441619873046875,
"epoch": 1.0873015873015872,
"grad_norm": 1.0079401042157452,
"learning_rate": 9.547333417471589e-06,
"loss": 0.4921,
"mean_token_accuracy": 0.8338921638205647,
"num_tokens": 117852991.0,
"step": 137
},
{
"entropy": 0.449127197265625,
"epoch": 1.0952380952380953,
"grad_norm": 1.0727711874297776,
"learning_rate": 9.538193807944864e-06,
"loss": 0.49,
"mean_token_accuracy": 0.8347254949621856,
"num_tokens": 118696927.0,
"step": 138
},
{
"entropy": 0.4441070556640625,
"epoch": 1.1031746031746033,
"grad_norm": 1.1054420578884305,
"learning_rate": 9.528967315739308e-06,
"loss": 0.4899,
"mean_token_accuracy": 0.8341114274226129,
"num_tokens": 119569613.0,
"step": 139
},
{
"entropy": 0.4412689208984375,
"epoch": 1.1111111111111112,
"grad_norm": 1.0649123433307148,
"learning_rate": 9.519654117493996e-06,
"loss": 0.4942,
"mean_token_accuracy": 0.8335490431636572,
"num_tokens": 120447089.0,
"step": 140
},
{
"entropy": 0.441131591796875,
"epoch": 1.119047619047619,
"grad_norm": 0.9658018391507793,
"learning_rate": 9.510254391507971e-06,
"loss": 0.4839,
"mean_token_accuracy": 0.8368041082285345,
"num_tokens": 121314886.0,
"step": 141
},
{
"entropy": 0.44622802734375,
"epoch": 1.126984126984127,
"grad_norm": 0.9540717616355562,
"learning_rate": 9.500768317736832e-06,
"loss": 0.4797,
"mean_token_accuracy": 0.837146339006722,
"num_tokens": 122167617.0,
"step": 142
},
{
"entropy": 0.44293212890625,
"epoch": 1.1349206349206349,
"grad_norm": 1.0354981305656152,
"learning_rate": 9.49119607778928e-06,
"loss": 0.4849,
"mean_token_accuracy": 0.8341447049751878,
"num_tokens": 123013840.0,
"step": 143
},
{
"entropy": 0.4395904541015625,
"epoch": 1.1428571428571428,
"grad_norm": 0.9923916024668512,
"learning_rate": 9.481537854923654e-06,
"loss": 0.477,
"mean_token_accuracy": 0.8377352114766836,
"num_tokens": 123876490.0,
"step": 144
},
{
"entropy": 0.4472503662109375,
"epoch": 1.1507936507936507,
"grad_norm": 1.129639029346026,
"learning_rate": 9.471793834044416e-06,
"loss": 0.4853,
"mean_token_accuracy": 0.8350173779763281,
"num_tokens": 124713314.0,
"step": 145
},
{
"entropy": 0.44598388671875,
"epoch": 1.1587301587301586,
"grad_norm": 1.186594461732162,
"learning_rate": 9.461964201698604e-06,
"loss": 0.4939,
"mean_token_accuracy": 0.8313662535510957,
"num_tokens": 125564746.0,
"step": 146
},
{
"entropy": 0.43914794921875,
"epoch": 1.1666666666666667,
"grad_norm": 0.9201175898857195,
"learning_rate": 9.452049146072278e-06,
"loss": 0.4803,
"mean_token_accuracy": 0.8369712042622268,
"num_tokens": 126415997.0,
"step": 147
},
{
"entropy": 0.445343017578125,
"epoch": 1.1746031746031746,
"grad_norm": 0.9688073354920573,
"learning_rate": 9.442048856986899e-06,
"loss": 0.4914,
"mean_token_accuracy": 0.8343577086925507,
"num_tokens": 127285760.0,
"step": 148
},
{
"entropy": 0.4484710693359375,
"epoch": 1.1825396825396826,
"grad_norm": 1.0757588730703025,
"learning_rate": 9.431963525895709e-06,
"loss": 0.4946,
"mean_token_accuracy": 0.8332444536499679,
"num_tokens": 128153685.0,
"step": 149
},
{
"entropy": 0.45037841796875,
"epoch": 1.1904761904761905,
"grad_norm": 1.0256311878794404,
"learning_rate": 9.421793345880055e-06,
"loss": 0.4789,
"mean_token_accuracy": 0.8380363639444113,
"num_tokens": 128990738.0,
"step": 150
},
{
"entropy": 0.44927978515625,
"epoch": 1.1984126984126984,
"grad_norm": 1.0956116916076417,
"learning_rate": 9.4115385116457e-06,
"loss": 0.4919,
"mean_token_accuracy": 0.8342617130838335,
"num_tokens": 129848900.0,
"step": 151
},
{
"entropy": 0.448333740234375,
"epoch": 1.2063492063492063,
"grad_norm": 1.0166065436311602,
"learning_rate": 9.401199219519088e-06,
"loss": 0.4878,
"mean_token_accuracy": 0.8344792602583766,
"num_tokens": 130724709.0,
"step": 152
},
{
"entropy": 0.44781494140625,
"epoch": 1.2142857142857142,
"grad_norm": 0.9085771039074888,
"learning_rate": 9.390775667443602e-06,
"loss": 0.4761,
"mean_token_accuracy": 0.8378241760656238,
"num_tokens": 131582811.0,
"step": 153
},
{
"entropy": 0.4468536376953125,
"epoch": 1.2222222222222223,
"grad_norm": 1.0868732902444567,
"learning_rate": 9.380268054975745e-06,
"loss": 0.4835,
"mean_token_accuracy": 0.8363062706775963,
"num_tokens": 132429743.0,
"step": 154
},
{
"entropy": 0.4462738037109375,
"epoch": 1.2301587301587302,
"grad_norm": 1.0628171497183283,
"learning_rate": 9.36967658328135e-06,
"loss": 0.4854,
"mean_token_accuracy": 0.8348783804103732,
"num_tokens": 133291943.0,
"step": 155
},
{
"entropy": 0.44970703125,
"epoch": 1.2380952380952381,
"grad_norm": 0.9804992859024957,
"learning_rate": 9.359001455131713e-06,
"loss": 0.4815,
"mean_token_accuracy": 0.8365443642251194,
"num_tokens": 134149814.0,
"step": 156
},
{
"entropy": 0.44989013671875,
"epoch": 1.246031746031746,
"grad_norm": 1.1446452708449568,
"learning_rate": 9.34824287489971e-06,
"loss": 0.4728,
"mean_token_accuracy": 0.839170094113797,
"num_tokens": 134989406.0,
"step": 157
},
{
"entropy": 0.4441986083984375,
"epoch": 1.253968253968254,
"grad_norm": 1.003349637480706,
"learning_rate": 9.337401048555892e-06,
"loss": 0.4688,
"mean_token_accuracy": 0.8404755499213934,
"num_tokens": 135832642.0,
"step": 158
},
{
"entropy": 0.4422149658203125,
"epoch": 1.2619047619047619,
"grad_norm": 1.0839125563857472,
"learning_rate": 9.326476183664535e-06,
"loss": 0.4797,
"mean_token_accuracy": 0.837718007620424,
"num_tokens": 136724748.0,
"step": 159
},
{
"entropy": 0.449066162109375,
"epoch": 1.2698412698412698,
"grad_norm": 1.0019921797525027,
"learning_rate": 9.315468489379668e-06,
"loss": 0.4788,
"mean_token_accuracy": 0.8362822770141065,
"num_tokens": 137570382.0,
"step": 160
},
{
"entropy": 0.4457855224609375,
"epoch": 1.2777777777777777,
"grad_norm": 0.9885822793795929,
"learning_rate": 9.304378176441076e-06,
"loss": 0.4779,
"mean_token_accuracy": 0.8382827825844288,
"num_tokens": 138431194.0,
"step": 161
},
{
"entropy": 0.4445953369140625,
"epoch": 1.2857142857142856,
"grad_norm": 1.040992724515245,
"learning_rate": 9.29320545717025e-06,
"loss": 0.4673,
"mean_token_accuracy": 0.840416397433728,
"num_tokens": 139287890.0,
"step": 162
},
{
"entropy": 0.44342041015625,
"epoch": 1.2936507936507937,
"grad_norm": 0.9829545106372576,
"learning_rate": 9.281950545466336e-06,
"loss": 0.4814,
"mean_token_accuracy": 0.8361613317392766,
"num_tokens": 140160179.0,
"step": 163
},
{
"entropy": 0.44000244140625,
"epoch": 1.3015873015873016,
"grad_norm": 1.0385970457713267,
"learning_rate": 9.27061365680204e-06,
"loss": 0.4803,
"mean_token_accuracy": 0.8369025052525103,
"num_tokens": 141002875.0,
"step": 164
},
{
"entropy": 0.4385528564453125,
"epoch": 1.3095238095238095,
"grad_norm": 0.9812301520220873,
"learning_rate": 9.25919500821949e-06,
"loss": 0.471,
"mean_token_accuracy": 0.838864213321358,
"num_tokens": 141858286.0,
"step": 165
},
{
"entropy": 0.4442901611328125,
"epoch": 1.3174603174603174,
"grad_norm": 1.0236518502998646,
"learning_rate": 9.247694818326092e-06,
"loss": 0.4711,
"mean_token_accuracy": 0.8398910835385323,
"num_tokens": 142698339.0,
"step": 166
},
{
"entropy": 0.4426116943359375,
"epoch": 1.3253968253968254,
"grad_norm": 1.1382281143976174,
"learning_rate": 9.236113307290345e-06,
"loss": 0.4742,
"mean_token_accuracy": 0.837109467945993,
"num_tokens": 143561112.0,
"step": 167
},
{
"entropy": 0.442718505859375,
"epoch": 1.3333333333333333,
"grad_norm": 0.9746191934401784,
"learning_rate": 9.224450696837617e-06,
"loss": 0.4756,
"mean_token_accuracy": 0.8384752809070051,
"num_tokens": 144390223.0,
"step": 168
},
{
"entropy": 0.4402008056640625,
"epoch": 1.3412698412698414,
"grad_norm": 0.9749642677850219,
"learning_rate": 9.212707210245908e-06,
"loss": 0.4881,
"mean_token_accuracy": 0.8348734346218407,
"num_tokens": 145276276.0,
"step": 169
},
{
"entropy": 0.4441070556640625,
"epoch": 1.3492063492063493,
"grad_norm": 1.0438747532350088,
"learning_rate": 9.200883072341573e-06,
"loss": 0.4761,
"mean_token_accuracy": 0.8384365830570459,
"num_tokens": 146148957.0,
"step": 170
},
{
"entropy": 0.4490203857421875,
"epoch": 1.3571428571428572,
"grad_norm": 0.9016207370694161,
"learning_rate": 9.188978509495022e-06,
"loss": 0.475,
"mean_token_accuracy": 0.8379638059996068,
"num_tokens": 146999892.0,
"step": 171
},
{
"entropy": 0.4430694580078125,
"epoch": 1.3650793650793651,
"grad_norm": 0.8815407085280926,
"learning_rate": 9.176993749616374e-06,
"loss": 0.4768,
"mean_token_accuracy": 0.8367542624473572,
"num_tokens": 147888947.0,
"step": 172
},
{
"entropy": 0.451995849609375,
"epoch": 1.373015873015873,
"grad_norm": 0.9246528792063293,
"learning_rate": 9.164929022151106e-06,
"loss": 0.4871,
"mean_token_accuracy": 0.8344444935210049,
"num_tokens": 148771994.0,
"step": 173
},
{
"entropy": 0.444549560546875,
"epoch": 1.380952380952381,
"grad_norm": 0.9018527258286749,
"learning_rate": 9.15278455807566e-06,
"loss": 0.4715,
"mean_token_accuracy": 0.8390994230285287,
"num_tokens": 149626353.0,
"step": 174
},
{
"entropy": 0.4414825439453125,
"epoch": 1.3888888888888888,
"grad_norm": 0.8544852116993605,
"learning_rate": 9.140560589893012e-06,
"loss": 0.4697,
"mean_token_accuracy": 0.8393202098086476,
"num_tokens": 150484433.0,
"step": 175
},
{
"entropy": 0.4454803466796875,
"epoch": 1.3968253968253967,
"grad_norm": 1.0071897404357584,
"learning_rate": 9.128257351628224e-06,
"loss": 0.473,
"mean_token_accuracy": 0.8398340521380305,
"num_tokens": 151351171.0,
"step": 176
},
{
"entropy": 0.440704345703125,
"epoch": 1.4047619047619047,
"grad_norm": 1.0906938840190845,
"learning_rate": 9.115875078823975e-06,
"loss": 0.4829,
"mean_token_accuracy": 0.8349933759309351,
"num_tokens": 152198704.0,
"step": 177
},
{
"entropy": 0.4438629150390625,
"epoch": 1.4126984126984126,
"grad_norm": 0.9630152565863848,
"learning_rate": 9.103414008536029e-06,
"loss": 0.4762,
"mean_token_accuracy": 0.8377989139407873,
"num_tokens": 153027562.0,
"step": 178
},
{
"entropy": 0.4464569091796875,
"epoch": 1.4206349206349207,
"grad_norm": 0.968332662888831,
"learning_rate": 9.09087437932872e-06,
"loss": 0.47,
"mean_token_accuracy": 0.8383941231295466,
"num_tokens": 153863890.0,
"step": 179
},
{
"entropy": 0.4459228515625,
"epoch": 1.4285714285714286,
"grad_norm": 0.8682147767187481,
"learning_rate": 9.07825643127037e-06,
"loss": 0.477,
"mean_token_accuracy": 0.8366480157710612,
"num_tokens": 154707913.0,
"step": 180
},
{
"entropy": 0.4376068115234375,
"epoch": 1.4365079365079365,
"grad_norm": 0.897993109890026,
"learning_rate": 9.065560405928699e-06,
"loss": 0.4756,
"mean_token_accuracy": 0.8380651730112731,
"num_tokens": 155590050.0,
"step": 181
},
{
"entropy": 0.4394073486328125,
"epoch": 1.4444444444444444,
"grad_norm": 0.9388047872340883,
"learning_rate": 9.0527865463662e-06,
"loss": 0.4709,
"mean_token_accuracy": 0.8392384983599186,
"num_tokens": 156449879.0,
"step": 182
},
{
"entropy": 0.440673828125,
"epoch": 1.4523809523809523,
"grad_norm": 0.8193746423443552,
"learning_rate": 9.039935097135479e-06,
"loss": 0.4584,
"mean_token_accuracy": 0.8437643311917782,
"num_tokens": 157304143.0,
"step": 183
},
{
"entropy": 0.43817138671875,
"epoch": 1.4603174603174602,
"grad_norm": 0.9374511059068703,
"learning_rate": 9.027006304274584e-06,
"loss": 0.4748,
"mean_token_accuracy": 0.8367259805090725,
"num_tokens": 158177988.0,
"step": 184
},
{
"entropy": 0.4360198974609375,
"epoch": 1.4682539682539684,
"grad_norm": 0.8212121210922411,
"learning_rate": 9.014000415302286e-06,
"loss": 0.4783,
"mean_token_accuracy": 0.8371384800411761,
"num_tokens": 159060066.0,
"step": 185
},
{
"entropy": 0.441680908203125,
"epoch": 1.4761904761904763,
"grad_norm": 0.8806995545843207,
"learning_rate": 9.000917679213344e-06,
"loss": 0.474,
"mean_token_accuracy": 0.8378347246907651,
"num_tokens": 159942986.0,
"step": 186
},
{
"entropy": 0.4442596435546875,
"epoch": 1.4841269841269842,
"grad_norm": 0.8463270672966281,
"learning_rate": 8.987758346473739e-06,
"loss": 0.4649,
"mean_token_accuracy": 0.8411796907894313,
"num_tokens": 160782816.0,
"step": 187
},
{
"entropy": 0.4425201416015625,
"epoch": 1.492063492063492,
"grad_norm": 0.839482572154392,
"learning_rate": 8.974522669015872e-06,
"loss": 0.4672,
"mean_token_accuracy": 0.8414199482649565,
"num_tokens": 161643512.0,
"step": 188
},
{
"entropy": 0.436920166015625,
"epoch": 1.5,
"grad_norm": 0.9422232012137579,
"learning_rate": 8.961210900233757e-06,
"loss": 0.4593,
"mean_token_accuracy": 0.8416055347770452,
"num_tokens": 162503001.0,
"step": 189
},
{
"entropy": 0.4347076416015625,
"epoch": 1.507936507936508,
"grad_norm": 0.9312174445116989,
"learning_rate": 8.947823294978147e-06,
"loss": 0.4741,
"mean_token_accuracy": 0.8390083778649569,
"num_tokens": 163388010.0,
"step": 190
},
{
"entropy": 0.4320220947265625,
"epoch": 1.5158730158730158,
"grad_norm": 0.8634591037958418,
"learning_rate": 8.934360109551671e-06,
"loss": 0.4694,
"mean_token_accuracy": 0.8393782819621265,
"num_tokens": 164270399.0,
"step": 191
},
{
"entropy": 0.4329681396484375,
"epoch": 1.5238095238095237,
"grad_norm": 1.0268418396952028,
"learning_rate": 8.920821601703927e-06,
"loss": 0.4657,
"mean_token_accuracy": 0.8410811661742628,
"num_tokens": 165155523.0,
"step": 192
},
{
"entropy": 0.4373931884765625,
"epoch": 1.5317460317460316,
"grad_norm": 0.911296138695116,
"learning_rate": 8.907208030626538e-06,
"loss": 0.4647,
"mean_token_accuracy": 0.8417128617875278,
"num_tokens": 166004219.0,
"step": 193
},
{
"entropy": 0.436920166015625,
"epoch": 1.5396825396825395,
"grad_norm": 0.8615585964723216,
"learning_rate": 8.8935196569482e-06,
"loss": 0.4659,
"mean_token_accuracy": 0.841303990688175,
"num_tokens": 166887486.0,
"step": 194
},
{
"entropy": 0.4330291748046875,
"epoch": 1.5476190476190477,
"grad_norm": 0.9022520563994237,
"learning_rate": 8.879756742729683e-06,
"loss": 0.4642,
"mean_token_accuracy": 0.842128555290401,
"num_tokens": 167743608.0,
"step": 195
},
{
"entropy": 0.4404449462890625,
"epoch": 1.5555555555555556,
"grad_norm": 0.8427585056849268,
"learning_rate": 8.865919551458823e-06,
"loss": 0.4638,
"mean_token_accuracy": 0.8412150857038796,
"num_tokens": 168602032.0,
"step": 196
},
{
"entropy": 0.43634033203125,
"epoch": 1.5634920634920635,
"grad_norm": 0.9473332832713499,
"learning_rate": 8.852008348045468e-06,
"loss": 0.4713,
"mean_token_accuracy": 0.8384702135808766,
"num_tokens": 169469975.0,
"step": 197
},
{
"entropy": 0.4295654296875,
"epoch": 1.5714285714285714,
"grad_norm": 0.8265269594435529,
"learning_rate": 8.838023398816417e-06,
"loss": 0.471,
"mean_token_accuracy": 0.8378414455801249,
"num_tokens": 170343282.0,
"step": 198
},
{
"entropy": 0.4375,
"epoch": 1.5793650793650795,
"grad_norm": 0.9350879056767756,
"learning_rate": 8.823964971510313e-06,
"loss": 0.4701,
"mean_token_accuracy": 0.8392301532439888,
"num_tokens": 171227432.0,
"step": 199
},
{
"entropy": 0.4355621337890625,
"epoch": 1.5873015873015874,
"grad_norm": 0.8262956094897539,
"learning_rate": 8.809833335272517e-06,
"loss": 0.4531,
"mean_token_accuracy": 0.8436351302079856,
"num_tokens": 172096305.0,
"step": 200
},
{
"entropy": 0.4359893798828125,
"epoch": 1.5952380952380953,
"grad_norm": 0.8393643465691598,
"learning_rate": 8.795628760649965e-06,
"loss": 0.4552,
"mean_token_accuracy": 0.8432473209686577,
"num_tokens": 172910673.0,
"step": 201
},
{
"entropy": 0.4252777099609375,
"epoch": 1.6031746031746033,
"grad_norm": 0.930458721360079,
"learning_rate": 8.781351519585978e-06,
"loss": 0.4602,
"mean_token_accuracy": 0.8418687861412764,
"num_tokens": 173775762.0,
"step": 202
},
{
"entropy": 0.4301910400390625,
"epoch": 1.6111111111111112,
"grad_norm": 0.9255152673550228,
"learning_rate": 8.767001885415055e-06,
"loss": 0.4658,
"mean_token_accuracy": 0.8412896669469774,
"num_tokens": 174651858.0,
"step": 203
},
{
"entropy": 0.4319915771484375,
"epoch": 1.619047619047619,
"grad_norm": 0.8156780804217264,
"learning_rate": 8.752580132857652e-06,
"loss": 0.4576,
"mean_token_accuracy": 0.8430444840341806,
"num_tokens": 175519282.0,
"step": 204
},
{
"entropy": 0.4349212646484375,
"epoch": 1.626984126984127,
"grad_norm": 0.8770655718885645,
"learning_rate": 8.73808653801491e-06,
"loss": 0.4714,
"mean_token_accuracy": 0.8400326487608254,
"num_tokens": 176387199.0,
"step": 205
},
{
"entropy": 0.430511474609375,
"epoch": 1.6349206349206349,
"grad_norm": 0.8757592238026767,
"learning_rate": 8.723521378363378e-06,
"loss": 0.4681,
"mean_token_accuracy": 0.8415880398824811,
"num_tokens": 177264131.0,
"step": 206
},
{
"entropy": 0.43365478515625,
"epoch": 1.6428571428571428,
"grad_norm": 0.89706430686004,
"learning_rate": 8.70888493274969e-06,
"loss": 0.4581,
"mean_token_accuracy": 0.8423688313923776,
"num_tokens": 178114003.0,
"step": 207
},
{
"entropy": 0.4362945556640625,
"epoch": 1.6507936507936507,
"grad_norm": 1.245121139741542,
"learning_rate": 8.694177481385244e-06,
"loss": 0.4681,
"mean_token_accuracy": 0.8389896345324814,
"num_tokens": 178950487.0,
"step": 208
},
{
"entropy": 0.4297637939453125,
"epoch": 1.6587301587301586,
"grad_norm": 0.9621053522750438,
"learning_rate": 8.679399305840815e-06,
"loss": 0.4694,
"mean_token_accuracy": 0.83825440146029,
"num_tokens": 179833212.0,
"step": 209
},
{
"entropy": 0.4279327392578125,
"epoch": 1.6666666666666665,
"grad_norm": 0.8737950990378116,
"learning_rate": 8.664550689041187e-06,
"loss": 0.461,
"mean_token_accuracy": 0.8423066223040223,
"num_tokens": 180712234.0,
"step": 210
},
{
"entropy": 0.4318084716796875,
"epoch": 1.6746031746031746,
"grad_norm": 1.000096359581219,
"learning_rate": 8.649631915259716e-06,
"loss": 0.4741,
"mean_token_accuracy": 0.8375975685194135,
"num_tokens": 181566490.0,
"step": 211
},
{
"entropy": 0.4336090087890625,
"epoch": 1.6825396825396826,
"grad_norm": 0.8934119220152827,
"learning_rate": 8.634643270112903e-06,
"loss": 0.4667,
"mean_token_accuracy": 0.8412727518007159,
"num_tokens": 182413254.0,
"step": 212
},
{
"entropy": 0.430084228515625,
"epoch": 1.6904761904761905,
"grad_norm": 0.8156026818943841,
"learning_rate": 8.61958504055492e-06,
"loss": 0.4599,
"mean_token_accuracy": 0.8421376254409552,
"num_tokens": 183269538.0,
"step": 213
},
{
"entropy": 0.4370574951171875,
"epoch": 1.6984126984126984,
"grad_norm": 1.0433955766227752,
"learning_rate": 8.604457514872115e-06,
"loss": 0.4577,
"mean_token_accuracy": 0.8438415261916816,
"num_tokens": 184109496.0,
"step": 214
},
{
"entropy": 0.4349365234375,
"epoch": 1.7063492063492065,
"grad_norm": 0.8795834565304798,
"learning_rate": 8.589260982677496e-06,
"loss": 0.4716,
"mean_token_accuracy": 0.8374428367242217,
"num_tokens": 184968366.0,
"step": 215
},
{
"entropy": 0.42926025390625,
"epoch": 1.7142857142857144,
"grad_norm": 0.8234311825574274,
"learning_rate": 8.573995734905185e-06,
"loss": 0.4689,
"mean_token_accuracy": 0.8390569076873362,
"num_tokens": 185857166.0,
"step": 216
},
{
"entropy": 0.436279296875,
"epoch": 1.7222222222222223,
"grad_norm": 0.9720711352685596,
"learning_rate": 8.558662063804843e-06,
"loss": 0.452,
"mean_token_accuracy": 0.8439166625030339,
"num_tokens": 186684767.0,
"step": 217
},
{
"entropy": 0.4309844970703125,
"epoch": 1.7301587301587302,
"grad_norm": 0.8753161905032254,
"learning_rate": 8.543260262936087e-06,
"loss": 0.4545,
"mean_token_accuracy": 0.843706154730171,
"num_tokens": 187534641.0,
"step": 218
},
{
"entropy": 0.429901123046875,
"epoch": 1.7380952380952381,
"grad_norm": 0.8611756266061616,
"learning_rate": 8.527790627162858e-06,
"loss": 0.4594,
"mean_token_accuracy": 0.8414032305590808,
"num_tokens": 188403747.0,
"step": 219
},
{
"entropy": 0.4284210205078125,
"epoch": 1.746031746031746,
"grad_norm": 0.917314816658313,
"learning_rate": 8.512253452647783e-06,
"loss": 0.4636,
"mean_token_accuracy": 0.8410017411224544,
"num_tokens": 189286051.0,
"step": 220
},
{
"entropy": 0.435394287109375,
"epoch": 1.753968253968254,
"grad_norm": 0.8160829606015351,
"learning_rate": 8.496649036846502e-06,
"loss": 0.4556,
"mean_token_accuracy": 0.8419742425903678,
"num_tokens": 190135846.0,
"step": 221
},
{
"entropy": 0.4247589111328125,
"epoch": 1.7619047619047619,
"grad_norm": 1.0175216766708233,
"learning_rate": 8.480977678501974e-06,
"loss": 0.4658,
"mean_token_accuracy": 0.8410613937303424,
"num_tokens": 191023956.0,
"step": 222
},
{
"entropy": 0.43572998046875,
"epoch": 1.7698412698412698,
"grad_norm": 0.9507033398860977,
"learning_rate": 8.465239677638755e-06,
"loss": 0.4554,
"mean_token_accuracy": 0.8437660122290254,
"num_tokens": 191865715.0,
"step": 223
},
{
"entropy": 0.4373016357421875,
"epoch": 1.7777777777777777,
"grad_norm": 0.8180069161681153,
"learning_rate": 8.449435335557264e-06,
"loss": 0.4575,
"mean_token_accuracy": 0.8432631348259747,
"num_tokens": 192687536.0,
"step": 224
},
{
"entropy": 0.4383087158203125,
"epoch": 1.7857142857142856,
"grad_norm": 0.8838273166712945,
"learning_rate": 8.433564954828e-06,
"loss": 0.4526,
"mean_token_accuracy": 0.8442786163650453,
"num_tokens": 193514317.0,
"step": 225
},
{
"entropy": 0.42999267578125,
"epoch": 1.7936507936507935,
"grad_norm": 0.9410341071186311,
"learning_rate": 8.417628839285757e-06,
"loss": 0.4581,
"mean_token_accuracy": 0.8429269646294415,
"num_tokens": 194368425.0,
"step": 226
},
{
"entropy": 0.4307098388671875,
"epoch": 1.8015873015873016,
"grad_norm": 0.90812536954616,
"learning_rate": 8.401627294023815e-06,
"loss": 0.4577,
"mean_token_accuracy": 0.8424977059476078,
"num_tokens": 195229420.0,
"step": 227
},
{
"entropy": 0.4311065673828125,
"epoch": 1.8095238095238095,
"grad_norm": 0.8936972613642765,
"learning_rate": 8.385560625388081e-06,
"loss": 0.4613,
"mean_token_accuracy": 0.8418103088624775,
"num_tokens": 196086060.0,
"step": 228
},
{
"entropy": 0.4331512451171875,
"epoch": 1.8174603174603174,
"grad_norm": 0.9023364391946196,
"learning_rate": 8.369429140971239e-06,
"loss": 0.4587,
"mean_token_accuracy": 0.840968404430896,
"num_tokens": 196949752.0,
"step": 229
},
{
"entropy": 0.431732177734375,
"epoch": 1.8253968253968254,
"grad_norm": 0.9269255151577744,
"learning_rate": 8.353233149606859e-06,
"loss": 0.4564,
"mean_token_accuracy": 0.8422707901336253,
"num_tokens": 197787383.0,
"step": 230
},
{
"entropy": 0.4332275390625,
"epoch": 1.8333333333333335,
"grad_norm": 0.8746862673486592,
"learning_rate": 8.336972961363472e-06,
"loss": 0.4641,
"mean_token_accuracy": 0.8415999473072588,
"num_tokens": 198640204.0,
"step": 231
},
{
"entropy": 0.4254608154296875,
"epoch": 1.8412698412698414,
"grad_norm": 0.8419998918591282,
"learning_rate": 8.320648887538657e-06,
"loss": 0.4628,
"mean_token_accuracy": 0.8425387698225677,
"num_tokens": 199534945.0,
"step": 232
},
{
"entropy": 0.4349517822265625,
"epoch": 1.8492063492063493,
"grad_norm": 0.945589758024129,
"learning_rate": 8.304261240653054e-06,
"loss": 0.4546,
"mean_token_accuracy": 0.8429999812506139,
"num_tokens": 200401566.0,
"step": 233
},
{
"entropy": 0.4324798583984375,
"epoch": 1.8571428571428572,
"grad_norm": 0.9389620288256866,
"learning_rate": 8.287810334444406e-06,
"loss": 0.4616,
"mean_token_accuracy": 0.8408999373205006,
"num_tokens": 201286569.0,
"step": 234
},
{
"entropy": 0.4327392578125,
"epoch": 1.8650793650793651,
"grad_norm": 0.9056957266265069,
"learning_rate": 8.271296483861532e-06,
"loss": 0.4555,
"mean_token_accuracy": 0.8440436110831797,
"num_tokens": 202148785.0,
"step": 235
},
{
"entropy": 0.43682861328125,
"epoch": 1.873015873015873,
"grad_norm": 0.9007501274176329,
"learning_rate": 8.254720005058317e-06,
"loss": 0.4511,
"mean_token_accuracy": 0.8437027987092733,
"num_tokens": 202969412.0,
"step": 236
},
{
"entropy": 0.429046630859375,
"epoch": 1.880952380952381,
"grad_norm": 0.7886955269176177,
"learning_rate": 8.238081215387639e-06,
"loss": 0.4572,
"mean_token_accuracy": 0.8425727025605738,
"num_tokens": 203845826.0,
"step": 237
},
{
"entropy": 0.4304656982421875,
"epoch": 1.8888888888888888,
"grad_norm": 0.9902829953426554,
"learning_rate": 8.221380433395308e-06,
"loss": 0.4522,
"mean_token_accuracy": 0.8438800727017224,
"num_tokens": 204713067.0,
"step": 238
},
{
"entropy": 0.4382476806640625,
"epoch": 1.8968253968253967,
"grad_norm": 0.8783861526345048,
"learning_rate": 8.204617978813963e-06,
"loss": 0.4544,
"mean_token_accuracy": 0.8443545303307474,
"num_tokens": 205549482.0,
"step": 239
},
{
"entropy": 0.4334259033203125,
"epoch": 1.9047619047619047,
"grad_norm": 0.7800627411645534,
"learning_rate": 8.187794172556947e-06,
"loss": 0.4535,
"mean_token_accuracy": 0.8426107591949403,
"num_tokens": 206394578.0,
"step": 240
},
{
"entropy": 0.4372100830078125,
"epoch": 1.9126984126984126,
"grad_norm": 0.8924141210495853,
"learning_rate": 8.170909336712171e-06,
"loss": 0.4593,
"mean_token_accuracy": 0.8427824974060059,
"num_tokens": 207233636.0,
"step": 241
},
{
"entropy": 0.4353790283203125,
"epoch": 1.9206349206349205,
"grad_norm": 0.8893426872353928,
"learning_rate": 8.153963794535945e-06,
"loss": 0.4604,
"mean_token_accuracy": 0.841770654078573,
"num_tokens": 208074376.0,
"step": 242
},
{
"entropy": 0.4335174560546875,
"epoch": 1.9285714285714286,
"grad_norm": 0.8143885506939128,
"learning_rate": 8.136957870446779e-06,
"loss": 0.4591,
"mean_token_accuracy": 0.8414175752550364,
"num_tokens": 208947370.0,
"step": 243
},
{
"entropy": 0.4306640625,
"epoch": 1.9365079365079365,
"grad_norm": 0.8217558583786552,
"learning_rate": 8.119891890019187e-06,
"loss": 0.4502,
"mean_token_accuracy": 0.8447873778641224,
"num_tokens": 209798547.0,
"step": 244
},
{
"entropy": 0.4336090087890625,
"epoch": 1.9444444444444444,
"grad_norm": 0.8345483891742207,
"learning_rate": 8.102766179977452e-06,
"loss": 0.4548,
"mean_token_accuracy": 0.843397512100637,
"num_tokens": 210661829.0,
"step": 245
},
{
"entropy": 0.4267578125,
"epoch": 1.9523809523809523,
"grad_norm": 0.886458439838755,
"learning_rate": 8.085581068189358e-06,
"loss": 0.4546,
"mean_token_accuracy": 0.8432880756445229,
"num_tokens": 211549046.0,
"step": 246
},
{
"entropy": 0.4273834228515625,
"epoch": 1.9603174603174605,
"grad_norm": 0.7893301359285466,
"learning_rate": 8.068336883659926e-06,
"loss": 0.4483,
"mean_token_accuracy": 0.8453035233542323,
"num_tokens": 212447521.0,
"step": 247
},
{
"entropy": 0.4306793212890625,
"epoch": 1.9682539682539684,
"grad_norm": 0.891429474690652,
"learning_rate": 8.051033956525113e-06,
"loss": 0.4539,
"mean_token_accuracy": 0.8432926838286221,
"num_tokens": 213310334.0,
"step": 248
},
{
"entropy": 0.429534912109375,
"epoch": 1.9761904761904763,
"grad_norm": 0.8247760750659134,
"learning_rate": 8.033672618045485e-06,
"loss": 0.4524,
"mean_token_accuracy": 0.8450775747187436,
"num_tokens": 214169043.0,
"step": 249
},
{
"entropy": 0.4324188232421875,
"epoch": 1.9841269841269842,
"grad_norm": 0.8524339615157,
"learning_rate": 8.016253200599885e-06,
"loss": 0.4519,
"mean_token_accuracy": 0.8445535181090236,
"num_tokens": 215005057.0,
"step": 250
},
{
"entropy": 0.4263763427734375,
"epoch": 1.992063492063492,
"grad_norm": 0.8331975898868739,
"learning_rate": 7.998776037679061e-06,
"loss": 0.4437,
"mean_token_accuracy": 0.8456794614903629,
"num_tokens": 215869766.0,
"step": 251
},
{
"entropy": 0.4291229248046875,
"epoch": 2.0,
"grad_norm": 0.8613130972882047,
"learning_rate": 7.981241463879284e-06,
"loss": 0.4466,
"mean_token_accuracy": 0.8456757622770965,
"num_tokens": 216731206.0,
"step": 252
},
{
"entropy": 0.428619384765625,
"epoch": 2.007936507936508,
"grad_norm": 0.9277446577089026,
"learning_rate": 7.963649814895945e-06,
"loss": 0.4256,
"mean_token_accuracy": 0.8530098241753876,
"num_tokens": 217589905.0,
"step": 253
},
{
"entropy": 0.4280853271484375,
"epoch": 2.015873015873016,
"grad_norm": 0.8708275069644504,
"learning_rate": 7.94600142751713e-06,
"loss": 0.432,
"mean_token_accuracy": 0.8501051301136613,
"num_tokens": 218446876.0,
"step": 254
},
{
"entropy": 0.42718505859375,
"epoch": 2.0238095238095237,
"grad_norm": 0.8842468147508419,
"learning_rate": 7.92829663961716e-06,
"loss": 0.433,
"mean_token_accuracy": 0.850572609808296,
"num_tokens": 219322571.0,
"step": 255
},
{
"entropy": 0.42889404296875,
"epoch": 2.0317460317460316,
"grad_norm": 0.8439073466722959,
"learning_rate": 7.910535790150135e-06,
"loss": 0.4291,
"mean_token_accuracy": 0.8493411005474627,
"num_tokens": 220180160.0,
"step": 256
},
{
"entropy": 0.42620849609375,
"epoch": 2.0396825396825395,
"grad_norm": 0.8757701106721189,
"learning_rate": 7.892719219143446e-06,
"loss": 0.42,
"mean_token_accuracy": 0.8547583618201315,
"num_tokens": 221016578.0,
"step": 257
},
{
"entropy": 0.42437744140625,
"epoch": 2.0476190476190474,
"grad_norm": 0.8882389150609792,
"learning_rate": 7.874847267691254e-06,
"loss": 0.4293,
"mean_token_accuracy": 0.8507325639948249,
"num_tokens": 221871968.0,
"step": 258
},
{
"entropy": 0.4220123291015625,
"epoch": 2.0555555555555554,
"grad_norm": 0.8012689613491503,
"learning_rate": 7.856920277947969e-06,
"loss": 0.4236,
"mean_token_accuracy": 0.8522419198416173,
"num_tokens": 222738323.0,
"step": 259
},
{
"entropy": 0.4245758056640625,
"epoch": 2.0634920634920633,
"grad_norm": 0.9448934032497721,
"learning_rate": 7.83893859312169e-06,
"loss": 0.4286,
"mean_token_accuracy": 0.8518815254792571,
"num_tokens": 223584134.0,
"step": 260
},
{
"entropy": 0.4188232421875,
"epoch": 2.0714285714285716,
"grad_norm": 0.8498468200899314,
"learning_rate": 7.820902557467648e-06,
"loss": 0.4256,
"mean_token_accuracy": 0.8522023572586477,
"num_tokens": 224461654.0,
"step": 261
},
{
"entropy": 0.42340087890625,
"epoch": 2.0793650793650795,
"grad_norm": 0.9494122603581977,
"learning_rate": 7.80281251628161e-06,
"loss": 0.4325,
"mean_token_accuracy": 0.8496407098136842,
"num_tokens": 225327562.0,
"step": 262
},
{
"entropy": 0.4233551025390625,
"epoch": 2.0873015873015874,
"grad_norm": 0.773955396676882,
"learning_rate": 7.784668815893256e-06,
"loss": 0.4201,
"mean_token_accuracy": 0.853766305372119,
"num_tokens": 226189031.0,
"step": 263
},
{
"entropy": 0.4216156005859375,
"epoch": 2.0952380952380953,
"grad_norm": 0.9147295809460214,
"learning_rate": 7.766471803659571e-06,
"loss": 0.4396,
"mean_token_accuracy": 0.8481186041608453,
"num_tokens": 227062309.0,
"step": 264
},
{
"entropy": 0.4239349365234375,
"epoch": 2.1031746031746033,
"grad_norm": 0.9057979347358639,
"learning_rate": 7.748221827958174e-06,
"loss": 0.4297,
"mean_token_accuracy": 0.8508882015012205,
"num_tokens": 227920598.0,
"step": 265
},
{
"entropy": 0.422332763671875,
"epoch": 2.111111111111111,
"grad_norm": 0.7698507478470203,
"learning_rate": 7.729919238180663e-06,
"loss": 0.4239,
"mean_token_accuracy": 0.8522637677378953,
"num_tokens": 228773818.0,
"step": 266
},
{
"entropy": 0.4227447509765625,
"epoch": 2.119047619047619,
"grad_norm": 0.7899935011725722,
"learning_rate": 7.711564384725916e-06,
"loss": 0.4215,
"mean_token_accuracy": 0.8535870416089892,
"num_tokens": 229627711.0,
"step": 267
},
{
"entropy": 0.4207000732421875,
"epoch": 2.126984126984127,
"grad_norm": 0.7934125500230887,
"learning_rate": 7.693157618993392e-06,
"loss": 0.4334,
"mean_token_accuracy": 0.8498726398684084,
"num_tokens": 230489032.0,
"step": 268
},
{
"entropy": 0.421234130859375,
"epoch": 2.134920634920635,
"grad_norm": 0.7955872905920104,
"learning_rate": 7.674699293376397e-06,
"loss": 0.4349,
"mean_token_accuracy": 0.8490500543266535,
"num_tokens": 231339019.0,
"step": 269
},
{
"entropy": 0.4244232177734375,
"epoch": 2.142857142857143,
"grad_norm": 0.7854549442643708,
"learning_rate": 7.656189761255333e-06,
"loss": 0.4319,
"mean_token_accuracy": 0.8492707693949342,
"num_tokens": 232199672.0,
"step": 270
},
{
"entropy": 0.4268646240234375,
"epoch": 2.1507936507936507,
"grad_norm": 0.7878191667376515,
"learning_rate": 7.63762937699095e-06,
"loss": 0.4309,
"mean_token_accuracy": 0.8514748462475836,
"num_tokens": 233076007.0,
"step": 271
},
{
"entropy": 0.4263763427734375,
"epoch": 2.1587301587301586,
"grad_norm": 0.7996554554456847,
"learning_rate": 7.619018495917543e-06,
"loss": 0.4302,
"mean_token_accuracy": 0.8500847779214382,
"num_tokens": 233942156.0,
"step": 272
},
{
"entropy": 0.4241943359375,
"epoch": 2.1666666666666665,
"grad_norm": 0.7525601660809861,
"learning_rate": 7.600357474336157e-06,
"loss": 0.432,
"mean_token_accuracy": 0.8499450846575201,
"num_tokens": 234844668.0,
"step": 273
},
{
"entropy": 0.4309234619140625,
"epoch": 2.1746031746031744,
"grad_norm": 0.8012481468665732,
"learning_rate": 7.581646669507768e-06,
"loss": 0.4329,
"mean_token_accuracy": 0.8488554251380265,
"num_tokens": 235697877.0,
"step": 274
},
{
"entropy": 0.425933837890625,
"epoch": 2.1825396825396823,
"grad_norm": 0.7849735766550899,
"learning_rate": 7.56288643964644e-06,
"loss": 0.4253,
"mean_token_accuracy": 0.851461592130363,
"num_tokens": 236586699.0,
"step": 275
},
{
"entropy": 0.4276123046875,
"epoch": 2.1904761904761907,
"grad_norm": 0.8396810265614048,
"learning_rate": 7.544077143912467e-06,
"loss": 0.425,
"mean_token_accuracy": 0.8501534420065582,
"num_tokens": 237426378.0,
"step": 276
},
{
"entropy": 0.4206085205078125,
"epoch": 2.1984126984126986,
"grad_norm": 0.8220018076708352,
"learning_rate": 7.525219142405501e-06,
"loss": 0.4272,
"mean_token_accuracy": 0.8498959382995963,
"num_tokens": 238297987.0,
"step": 277
},
{
"entropy": 0.4322357177734375,
"epoch": 2.2063492063492065,
"grad_norm": 0.8197639207257438,
"learning_rate": 7.506312796157649e-06,
"loss": 0.4381,
"mean_token_accuracy": 0.8488305411301553,
"num_tokens": 239171101.0,
"step": 278
},
{
"entropy": 0.4268646240234375,
"epoch": 2.2142857142857144,
"grad_norm": 0.783646321002473,
"learning_rate": 7.487358467126573e-06,
"loss": 0.4242,
"mean_token_accuracy": 0.8518037595786154,
"num_tokens": 240034337.0,
"step": 279
},
{
"entropy": 0.43035888671875,
"epoch": 2.2222222222222223,
"grad_norm": 0.8546249612241452,
"learning_rate": 7.468356518188551e-06,
"loss": 0.4174,
"mean_token_accuracy": 0.8534762058407068,
"num_tokens": 240860927.0,
"step": 280
},
{
"entropy": 0.42437744140625,
"epoch": 2.2301587301587302,
"grad_norm": 0.87740351405863,
"learning_rate": 7.449307313131533e-06,
"loss": 0.4296,
"mean_token_accuracy": 0.8500415538437665,
"num_tokens": 241739076.0,
"step": 281
},
{
"entropy": 0.4241180419921875,
"epoch": 2.238095238095238,
"grad_norm": 0.8620781094998687,
"learning_rate": 7.4302112166481814e-06,
"loss": 0.4152,
"mean_token_accuracy": 0.8549392893910408,
"num_tokens": 242574011.0,
"step": 282
},
{
"entropy": 0.4252471923828125,
"epoch": 2.246031746031746,
"grad_norm": 0.8379775455346818,
"learning_rate": 7.411068594328876e-06,
"loss": 0.4292,
"mean_token_accuracy": 0.8494298844598234,
"num_tokens": 243458878.0,
"step": 283
},
{
"entropy": 0.4227294921875,
"epoch": 2.253968253968254,
"grad_norm": 0.825758180012109,
"learning_rate": 7.391879812654727e-06,
"loss": 0.4257,
"mean_token_accuracy": 0.852616976480931,
"num_tokens": 244313964.0,
"step": 284
},
{
"entropy": 0.42498779296875,
"epoch": 2.261904761904762,
"grad_norm": 0.7941347937575597,
"learning_rate": 7.37264523899056e-06,
"loss": 0.4204,
"mean_token_accuracy": 0.8534517176449299,
"num_tokens": 245200322.0,
"step": 285
},
{
"entropy": 0.4271087646484375,
"epoch": 2.2698412698412698,
"grad_norm": 0.8928939293234606,
"learning_rate": 7.353365241577869e-06,
"loss": 0.4274,
"mean_token_accuracy": 0.8513154000975192,
"num_tokens": 246083539.0,
"step": 286
},
{
"entropy": 0.427947998046875,
"epoch": 2.2777777777777777,
"grad_norm": 0.8392795753081728,
"learning_rate": 7.3340401895277816e-06,
"loss": 0.4276,
"mean_token_accuracy": 0.8511864547617733,
"num_tokens": 246933619.0,
"step": 287
},
{
"entropy": 0.4322662353515625,
"epoch": 2.2857142857142856,
"grad_norm": 0.8013508005420562,
"learning_rate": 7.314670452813982e-06,
"loss": 0.4188,
"mean_token_accuracy": 0.8539650039747357,
"num_tokens": 247765672.0,
"step": 288
},
{
"entropy": 0.4230804443359375,
"epoch": 2.2936507936507935,
"grad_norm": 0.7925852081903219,
"learning_rate": 7.295256402265636e-06,
"loss": 0.4208,
"mean_token_accuracy": 0.8516722363419831,
"num_tokens": 248628378.0,
"step": 289
},
{
"entropy": 0.4259033203125,
"epoch": 2.3015873015873014,
"grad_norm": 0.881087099852364,
"learning_rate": 7.275798409560282e-06,
"loss": 0.4286,
"mean_token_accuracy": 0.8508175020106137,
"num_tokens": 249501143.0,
"step": 290
},
{
"entropy": 0.42486572265625,
"epoch": 2.3095238095238093,
"grad_norm": 0.8044072073317771,
"learning_rate": 7.256296847216727e-06,
"loss": 0.4208,
"mean_token_accuracy": 0.8538436009548604,
"num_tokens": 250356099.0,
"step": 291
},
{
"entropy": 0.42413330078125,
"epoch": 2.317460317460317,
"grad_norm": 0.8884297194353945,
"learning_rate": 7.236752088587905e-06,
"loss": 0.4278,
"mean_token_accuracy": 0.850508657284081,
"num_tokens": 251219125.0,
"step": 292
},
{
"entropy": 0.4213409423828125,
"epoch": 2.3253968253968256,
"grad_norm": 0.8328136207372636,
"learning_rate": 7.217164507853734e-06,
"loss": 0.423,
"mean_token_accuracy": 0.8522739242762327,
"num_tokens": 252080434.0,
"step": 293
},
{
"entropy": 0.426666259765625,
"epoch": 2.3333333333333335,
"grad_norm": 0.8427474548537396,
"learning_rate": 7.197534480013951e-06,
"loss": 0.4203,
"mean_token_accuracy": 0.85275460453704,
"num_tokens": 252923218.0,
"step": 294
},
{
"entropy": 0.430084228515625,
"epoch": 2.3412698412698414,
"grad_norm": 0.9004833679442211,
"learning_rate": 7.177862380880935e-06,
"loss": 0.4218,
"mean_token_accuracy": 0.8528542476706207,
"num_tokens": 253761289.0,
"step": 295
},
{
"entropy": 0.4216156005859375,
"epoch": 2.3492063492063493,
"grad_norm": 0.8913589598940626,
"learning_rate": 7.158148587072509e-06,
"loss": 0.425,
"mean_token_accuracy": 0.8505891724489629,
"num_tokens": 254643716.0,
"step": 296
},
{
"entropy": 0.426910400390625,
"epoch": 2.357142857142857,
"grad_norm": 0.7894369274614703,
"learning_rate": 7.138393476004725e-06,
"loss": 0.425,
"mean_token_accuracy": 0.8516062931157649,
"num_tokens": 255486573.0,
"step": 297
},
{
"entropy": 0.426300048828125,
"epoch": 2.365079365079365,
"grad_norm": 0.8212693828322741,
"learning_rate": 7.118597425884659e-06,
"loss": 0.4154,
"mean_token_accuracy": 0.8540734858252108,
"num_tokens": 256345326.0,
"step": 298
},
{
"entropy": 0.4244842529296875,
"epoch": 2.373015873015873,
"grad_norm": 0.8408664594175462,
"learning_rate": 7.098760815703139e-06,
"loss": 0.4159,
"mean_token_accuracy": 0.8559228433296084,
"num_tokens": 257185152.0,
"step": 299
},
{
"entropy": 0.42059326171875,
"epoch": 2.380952380952381,
"grad_norm": 0.8309509931885469,
"learning_rate": 7.078884025227519e-06,
"loss": 0.4215,
"mean_token_accuracy": 0.8527105739340186,
"num_tokens": 258067282.0,
"step": 300
},
{
"entropy": 0.421661376953125,
"epoch": 2.388888888888889,
"grad_norm": 0.7954323327920004,
"learning_rate": 7.058967434994388e-06,
"loss": 0.4251,
"mean_token_accuracy": 0.8514108480885625,
"num_tokens": 258944016.0,
"step": 301
},
{
"entropy": 0.425048828125,
"epoch": 2.3968253968253967,
"grad_norm": 0.7636778328503014,
"learning_rate": 7.0390114263022955e-06,
"loss": 0.4198,
"mean_token_accuracy": 0.8537601926364005,
"num_tokens": 259808268.0,
"step": 302
},
{
"entropy": 0.41961669921875,
"epoch": 2.4047619047619047,
"grad_norm": 0.8444532257440839,
"learning_rate": 7.019016381204448e-06,
"loss": 0.4264,
"mean_token_accuracy": 0.8519964478909969,
"num_tokens": 260684292.0,
"step": 303
},
{
"entropy": 0.4252777099609375,
"epoch": 2.4126984126984126,
"grad_norm": 0.8265870173926899,
"learning_rate": 6.998982682501394e-06,
"loss": 0.4233,
"mean_token_accuracy": 0.8529867087490857,
"num_tokens": 261555918.0,
"step": 304
},
{
"entropy": 0.4233245849609375,
"epoch": 2.4206349206349205,
"grad_norm": 0.8766807638096971,
"learning_rate": 6.978910713733696e-06,
"loss": 0.4207,
"mean_token_accuracy": 0.8529614573344588,
"num_tokens": 262425946.0,
"step": 305
},
{
"entropy": 0.4260406494140625,
"epoch": 2.4285714285714284,
"grad_norm": 0.8180287946820591,
"learning_rate": 6.958800859174591e-06,
"loss": 0.4155,
"mean_token_accuracy": 0.8538580327294767,
"num_tokens": 263268966.0,
"step": 306
},
{
"entropy": 0.420166015625,
"epoch": 2.4365079365079367,
"grad_norm": 0.8038512105532972,
"learning_rate": 6.938653503822628e-06,
"loss": 0.4193,
"mean_token_accuracy": 0.8529025730676949,
"num_tokens": 264137961.0,
"step": 307
},
{
"entropy": 0.4186859130859375,
"epoch": 2.4444444444444446,
"grad_norm": 0.8356237787218255,
"learning_rate": 6.9184690333942995e-06,
"loss": 0.4179,
"mean_token_accuracy": 0.8538753935135901,
"num_tokens": 264995910.0,
"step": 308
},
{
"entropy": 0.4160308837890625,
"epoch": 2.4523809523809526,
"grad_norm": 0.8358036143672558,
"learning_rate": 6.898247834316662e-06,
"loss": 0.4147,
"mean_token_accuracy": 0.8543582037091255,
"num_tokens": 265867518.0,
"step": 309
},
{
"entropy": 0.41815185546875,
"epoch": 2.4603174603174605,
"grad_norm": 0.9260389067513531,
"learning_rate": 6.877990293719928e-06,
"loss": 0.4211,
"mean_token_accuracy": 0.8540931805036962,
"num_tokens": 266730039.0,
"step": 310
},
{
"entropy": 0.4172515869140625,
"epoch": 2.4682539682539684,
"grad_norm": 0.7930039952405856,
"learning_rate": 6.857696799430064e-06,
"loss": 0.4248,
"mean_token_accuracy": 0.8519657654687762,
"num_tokens": 267605673.0,
"step": 311
},
{
"entropy": 0.4198455810546875,
"epoch": 2.4761904761904763,
"grad_norm": 0.8779922529454903,
"learning_rate": 6.83736773996136e-06,
"loss": 0.4276,
"mean_token_accuracy": 0.852175232488662,
"num_tokens": 268470812.0,
"step": 312
},
{
"entropy": 0.418670654296875,
"epoch": 2.484126984126984,
"grad_norm": 0.7739740399164128,
"learning_rate": 6.817003504508993e-06,
"loss": 0.4145,
"mean_token_accuracy": 0.853930065408349,
"num_tokens": 269329768.0,
"step": 313
},
{
"entropy": 0.4190521240234375,
"epoch": 2.492063492063492,
"grad_norm": 0.7927430903268082,
"learning_rate": 6.796604482941578e-06,
"loss": 0.4238,
"mean_token_accuracy": 0.8510767961852252,
"num_tokens": 270192672.0,
"step": 314
},
{
"entropy": 0.4205474853515625,
"epoch": 2.5,
"grad_norm": 0.7677286448168184,
"learning_rate": 6.7761710657936995e-06,
"loss": 0.4282,
"mean_token_accuracy": 0.8515617684461176,
"num_tokens": 271053623.0,
"step": 315
},
{
"entropy": 0.415618896484375,
"epoch": 2.507936507936508,
"grad_norm": 0.7893175807304748,
"learning_rate": 6.75570364425844e-06,
"loss": 0.4215,
"mean_token_accuracy": 0.8526675584726036,
"num_tokens": 271921985.0,
"step": 316
},
{
"entropy": 0.4283447265625,
"epoch": 2.515873015873016,
"grad_norm": 0.8617893689163498,
"learning_rate": 6.735202610179886e-06,
"loss": 0.4235,
"mean_token_accuracy": 0.8520378330722451,
"num_tokens": 272757706.0,
"step": 317
},
{
"entropy": 0.42413330078125,
"epoch": 2.5238095238095237,
"grad_norm": 0.76248538584374,
"learning_rate": 6.714668356045629e-06,
"loss": 0.4155,
"mean_token_accuracy": 0.8540036669000983,
"num_tokens": 273603268.0,
"step": 318
},
{
"entropy": 0.421356201171875,
"epoch": 2.5317460317460316,
"grad_norm": 1.1471382166034823,
"learning_rate": 6.694101274979253e-06,
"loss": 0.4182,
"mean_token_accuracy": 0.8544265124946833,
"num_tokens": 274458735.0,
"step": 319
},
{
"entropy": 0.419586181640625,
"epoch": 2.5396825396825395,
"grad_norm": 0.8503843517257628,
"learning_rate": 6.673501760732805e-06,
"loss": 0.4188,
"mean_token_accuracy": 0.851504479534924,
"num_tokens": 275320028.0,
"step": 320
},
{
"entropy": 0.41754150390625,
"epoch": 2.5476190476190474,
"grad_norm": 0.7742097684397823,
"learning_rate": 6.652870207679253e-06,
"loss": 0.4154,
"mean_token_accuracy": 0.8555147871375084,
"num_tokens": 276151262.0,
"step": 321
},
{
"entropy": 0.415802001953125,
"epoch": 2.5555555555555554,
"grad_norm": 0.7996726962055972,
"learning_rate": 6.632207010804949e-06,
"loss": 0.4175,
"mean_token_accuracy": 0.8534226748161018,
"num_tokens": 276997327.0,
"step": 322
},
{
"entropy": 0.420318603515625,
"epoch": 2.5634920634920633,
"grad_norm": 0.8023983937223226,
"learning_rate": 6.611512565702053e-06,
"loss": 0.4226,
"mean_token_accuracy": 0.8535379455424845,
"num_tokens": 277849848.0,
"step": 323
},
{
"entropy": 0.4129180908203125,
"epoch": 2.571428571428571,
"grad_norm": 0.794860570280225,
"learning_rate": 6.590787268560967e-06,
"loss": 0.4126,
"mean_token_accuracy": 0.8558539836667478,
"num_tokens": 278726761.0,
"step": 324
},
{
"entropy": 0.417694091796875,
"epoch": 2.5793650793650795,
"grad_norm": 0.8947468548309203,
"learning_rate": 6.570031516162746e-06,
"loss": 0.4161,
"mean_token_accuracy": 0.8547689928673208,
"num_tokens": 279572082.0,
"step": 325
},
{
"entropy": 0.4159698486328125,
"epoch": 2.5873015873015874,
"grad_norm": 0.7955201654992391,
"learning_rate": 6.549245705871507e-06,
"loss": 0.4146,
"mean_token_accuracy": 0.854183979332447,
"num_tokens": 280414468.0,
"step": 326
},
{
"entropy": 0.4205780029296875,
"epoch": 2.5952380952380953,
"grad_norm": 0.8138022818439977,
"learning_rate": 6.528430235626819e-06,
"loss": 0.4216,
"mean_token_accuracy": 0.8531410917639732,
"num_tokens": 281237288.0,
"step": 327
},
{
"entropy": 0.4152374267578125,
"epoch": 2.6031746031746033,
"grad_norm": 0.8114079031107396,
"learning_rate": 6.5075855039360805e-06,
"loss": 0.4092,
"mean_token_accuracy": 0.8578996560536325,
"num_tokens": 282118057.0,
"step": 328
},
{
"entropy": 0.409637451171875,
"epoch": 2.611111111111111,
"grad_norm": 0.8647529166774726,
"learning_rate": 6.486711909866895e-06,
"loss": 0.4248,
"mean_token_accuracy": 0.8518201056867838,
"num_tokens": 283028330.0,
"step": 329
},
{
"entropy": 0.4239501953125,
"epoch": 2.619047619047619,
"grad_norm": 0.7331498819381451,
"learning_rate": 6.465809853039431e-06,
"loss": 0.4172,
"mean_token_accuracy": 0.8533883499912918,
"num_tokens": 283866607.0,
"step": 330
},
{
"entropy": 0.425384521484375,
"epoch": 2.626984126984127,
"grad_norm": 0.9242263399118948,
"learning_rate": 6.444879733618766e-06,
"loss": 0.4229,
"mean_token_accuracy": 0.852979929652065,
"num_tokens": 284705319.0,
"step": 331
},
{
"entropy": 0.419525146484375,
"epoch": 2.634920634920635,
"grad_norm": 0.8158292669223365,
"learning_rate": 6.423921952307237e-06,
"loss": 0.4338,
"mean_token_accuracy": 0.8505428163334727,
"num_tokens": 285598883.0,
"step": 332
},
{
"entropy": 0.422210693359375,
"epoch": 2.642857142857143,
"grad_norm": 0.8529287289999934,
"learning_rate": 6.4029369103367545e-06,
"loss": 0.4199,
"mean_token_accuracy": 0.8537574140354991,
"num_tokens": 286461446.0,
"step": 333
},
{
"entropy": 0.4251708984375,
"epoch": 2.6507936507936507,
"grad_norm": 0.8196864990296487,
"learning_rate": 6.381925009461128e-06,
"loss": 0.4171,
"mean_token_accuracy": 0.8536815252155066,
"num_tokens": 287308399.0,
"step": 334
},
{
"entropy": 0.4163818359375,
"epoch": 2.6587301587301586,
"grad_norm": 0.7820718545979705,
"learning_rate": 6.3608866519483825e-06,
"loss": 0.4198,
"mean_token_accuracy": 0.8528619990684092,
"num_tokens": 288187832.0,
"step": 335
},
{
"entropy": 0.4176025390625,
"epoch": 2.6666666666666665,
"grad_norm": 0.796216651976639,
"learning_rate": 6.339822240573041e-06,
"loss": 0.4169,
"mean_token_accuracy": 0.8543005757965147,
"num_tokens": 289047051.0,
"step": 336
},
{
"entropy": 0.421844482421875,
"epoch": 2.674603174603175,
"grad_norm": 0.8463751671443359,
"learning_rate": 6.3187321786084236e-06,
"loss": 0.423,
"mean_token_accuracy": 0.852651288267225,
"num_tokens": 289920851.0,
"step": 337
},
{
"entropy": 0.418731689453125,
"epoch": 2.682539682539683,
"grad_norm": 0.8240504405278195,
"learning_rate": 6.297616869818926e-06,
"loss": 0.4069,
"mean_token_accuracy": 0.8571417732164264,
"num_tokens": 290766931.0,
"step": 338
},
{
"entropy": 0.427032470703125,
"epoch": 2.6904761904761907,
"grad_norm": 0.8185363544673269,
"learning_rate": 6.276476718452289e-06,
"loss": 0.4155,
"mean_token_accuracy": 0.853483980987221,
"num_tokens": 291599836.0,
"step": 339
},
{
"entropy": 0.417877197265625,
"epoch": 2.6984126984126986,
"grad_norm": 0.837427213509895,
"learning_rate": 6.2553121292318595e-06,
"loss": 0.4211,
"mean_token_accuracy": 0.8524906514212489,
"num_tokens": 292454972.0,
"step": 340
},
{
"entropy": 0.42510986328125,
"epoch": 2.7063492063492065,
"grad_norm": 0.8135990341026819,
"learning_rate": 6.23412350734884e-06,
"loss": 0.4166,
"mean_token_accuracy": 0.852956528775394,
"num_tokens": 293307675.0,
"step": 341
},
{
"entropy": 0.4229583740234375,
"epoch": 2.7142857142857144,
"grad_norm": 0.7369881660528143,
"learning_rate": 6.2129112584545325e-06,
"loss": 0.4144,
"mean_token_accuracy": 0.8540790337137878,
"num_tokens": 294149752.0,
"step": 342
},
{
"entropy": 0.4259033203125,
"epoch": 2.7222222222222223,
"grad_norm": 0.8315573451881167,
"learning_rate": 6.191675788652574e-06,
"loss": 0.4017,
"mean_token_accuracy": 0.8583689746446908,
"num_tokens": 294975614.0,
"step": 343
},
{
"entropy": 0.416900634765625,
"epoch": 2.7301587301587302,
"grad_norm": 0.8638440384540704,
"learning_rate": 6.170417504491157e-06,
"loss": 0.4147,
"mean_token_accuracy": 0.854499620385468,
"num_tokens": 295846874.0,
"step": 344
},
{
"entropy": 0.4163665771484375,
"epoch": 2.738095238095238,
"grad_norm": 0.8116865889754844,
"learning_rate": 6.149136812955256e-06,
"loss": 0.4166,
"mean_token_accuracy": 0.8544518309645355,
"num_tokens": 296730922.0,
"step": 345
},
{
"entropy": 0.41357421875,
"epoch": 2.746031746031746,
"grad_norm": 0.7806791564546498,
"learning_rate": 6.1278341214588255e-06,
"loss": 0.4101,
"mean_token_accuracy": 0.8577063884586096,
"num_tokens": 297610941.0,
"step": 346
},
{
"entropy": 0.4183197021484375,
"epoch": 2.753968253968254,
"grad_norm": 0.8686079824008746,
"learning_rate": 6.106509837837004e-06,
"loss": 0.412,
"mean_token_accuracy": 0.8529722727835178,
"num_tokens": 298464464.0,
"step": 347
},
{
"entropy": 0.4134063720703125,
"epoch": 2.761904761904762,
"grad_norm": 0.8287811327498212,
"learning_rate": 6.0851643703383066e-06,
"loss": 0.407,
"mean_token_accuracy": 0.8568426473066211,
"num_tokens": 299315956.0,
"step": 348
},
{
"entropy": 0.420440673828125,
"epoch": 2.7698412698412698,
"grad_norm": 0.7606321520792506,
"learning_rate": 6.063798127616811e-06,
"loss": 0.4129,
"mean_token_accuracy": 0.8552384455688298,
"num_tokens": 300162540.0,
"step": 349
},
{
"entropy": 0.422515869140625,
"epoch": 2.7777777777777777,
"grad_norm": 0.7256068297614475,
"learning_rate": 6.042411518724327e-06,
"loss": 0.41,
"mean_token_accuracy": 0.8559038788080215,
"num_tokens": 301009855.0,
"step": 350
},
{
"entropy": 0.42303466796875,
"epoch": 2.7857142857142856,
"grad_norm": 0.7634303802543713,
"learning_rate": 6.021004953102576e-06,
"loss": 0.4039,
"mean_token_accuracy": 0.8571093692444265,
"num_tokens": 301852351.0,
"step": 351
},
{
"entropy": 0.4193878173828125,
"epoch": 2.7936507936507935,
"grad_norm": 0.7645192522691564,
"learning_rate": 5.999578840575342e-06,
"loss": 0.4046,
"mean_token_accuracy": 0.8566430397331715,
"num_tokens": 302702189.0,
"step": 352
},
{
"entropy": 0.4232940673828125,
"epoch": 2.8015873015873014,
"grad_norm": 0.794739166753094,
"learning_rate": 5.978133591340633e-06,
"loss": 0.4091,
"mean_token_accuracy": 0.8565059076063335,
"num_tokens": 303546117.0,
"step": 353
},
{
"entropy": 0.4161529541015625,
"epoch": 2.8095238095238093,
"grad_norm": 0.8805512544331933,
"learning_rate": 5.956669615962821e-06,
"loss": 0.413,
"mean_token_accuracy": 0.8556133066304028,
"num_tokens": 304424136.0,
"step": 354
},
{
"entropy": 0.41705322265625,
"epoch": 2.817460317460317,
"grad_norm": 0.7877254936944273,
"learning_rate": 5.935187325364791e-06,
"loss": 0.42,
"mean_token_accuracy": 0.8545625568367541,
"num_tokens": 305299176.0,
"step": 355
},
{
"entropy": 0.4157562255859375,
"epoch": 2.825396825396825,
"grad_norm": 0.8062504809460449,
"learning_rate": 5.913687130820064e-06,
"loss": 0.4104,
"mean_token_accuracy": 0.8556024674326181,
"num_tokens": 306180918.0,
"step": 356
},
{
"entropy": 0.41650390625,
"epoch": 2.8333333333333335,
"grad_norm": 0.7092100136349762,
"learning_rate": 5.892169443944929e-06,
"loss": 0.4151,
"mean_token_accuracy": 0.8552160942927003,
"num_tokens": 307053855.0,
"step": 357
},
{
"entropy": 0.4198150634765625,
"epoch": 2.8412698412698414,
"grad_norm": 0.8020397673815377,
"learning_rate": 5.870634676690564e-06,
"loss": 0.414,
"mean_token_accuracy": 0.8550357166677713,
"num_tokens": 307908233.0,
"step": 358
},
{
"entropy": 0.419891357421875,
"epoch": 2.8492063492063493,
"grad_norm": 0.8184927667236647,
"learning_rate": 5.8490832413351465e-06,
"loss": 0.406,
"mean_token_accuracy": 0.8566894140094519,
"num_tokens": 308765267.0,
"step": 359
},
{
"entropy": 0.4268646240234375,
"epoch": 2.857142857142857,
"grad_norm": 0.7696962615494287,
"learning_rate": 5.827515550475955e-06,
"loss": 0.4112,
"mean_token_accuracy": 0.8539468543604016,
"num_tokens": 309586897.0,
"step": 360
},
{
"entropy": 0.4179840087890625,
"epoch": 2.865079365079365,
"grad_norm": 0.7851245206394726,
"learning_rate": 5.805932017021486e-06,
"loss": 0.4116,
"mean_token_accuracy": 0.8549962108954787,
"num_tokens": 310440896.0,
"step": 361
},
{
"entropy": 0.4180450439453125,
"epoch": 2.873015873015873,
"grad_norm": 0.7806163849576252,
"learning_rate": 5.784333054183533e-06,
"loss": 0.4069,
"mean_token_accuracy": 0.8565008505247533,
"num_tokens": 311297562.0,
"step": 362
},
{
"entropy": 0.4127197265625,
"epoch": 2.880952380952381,
"grad_norm": 0.754556378509014,
"learning_rate": 5.762719075469277e-06,
"loss": 0.4155,
"mean_token_accuracy": 0.8560635317116976,
"num_tokens": 312189513.0,
"step": 363
},
{
"entropy": 0.41400146484375,
"epoch": 2.888888888888889,
"grad_norm": 0.833131948010438,
"learning_rate": 5.741090494673386e-06,
"loss": 0.4098,
"mean_token_accuracy": 0.8566766679286957,
"num_tokens": 313055977.0,
"step": 364
},
{
"entropy": 0.4152984619140625,
"epoch": 2.8968253968253967,
"grad_norm": 0.8035675568742273,
"learning_rate": 5.719447725870071e-06,
"loss": 0.417,
"mean_token_accuracy": 0.8535463376902044,
"num_tokens": 313934488.0,
"step": 365
},
{
"entropy": 0.41644287109375,
"epoch": 2.9047619047619047,
"grad_norm": 0.809344160354769,
"learning_rate": 5.697791183405174e-06,
"loss": 0.4123,
"mean_token_accuracy": 0.8555832463316619,
"num_tokens": 314782888.0,
"step": 366
},
{
"entropy": 0.4123077392578125,
"epoch": 2.9126984126984126,
"grad_norm": 0.7542255175949691,
"learning_rate": 5.67612128188823e-06,
"loss": 0.4042,
"mean_token_accuracy": 0.8586938725784421,
"num_tokens": 315667111.0,
"step": 367
},
{
"entropy": 0.418426513671875,
"epoch": 2.9206349206349205,
"grad_norm": 0.7325186075881142,
"learning_rate": 5.654438436184531e-06,
"loss": 0.41,
"mean_token_accuracy": 0.8550154692493379,
"num_tokens": 316519645.0,
"step": 368
},
{
"entropy": 0.4178619384765625,
"epoch": 2.928571428571429,
"grad_norm": 0.764847574977915,
"learning_rate": 5.6327430614071794e-06,
"loss": 0.409,
"mean_token_accuracy": 0.8574743596836925,
"num_tokens": 317376914.0,
"step": 369
},
{
"entropy": 0.4179229736328125,
"epoch": 2.9365079365079367,
"grad_norm": 0.7979585773178869,
"learning_rate": 5.611035572909147e-06,
"loss": 0.4116,
"mean_token_accuracy": 0.8546005864627659,
"num_tokens": 318210944.0,
"step": 370
},
{
"entropy": 0.411712646484375,
"epoch": 2.9444444444444446,
"grad_norm": 0.7465872378991787,
"learning_rate": 5.589316386275318e-06,
"loss": 0.4127,
"mean_token_accuracy": 0.8551405002363026,
"num_tokens": 319072977.0,
"step": 371
},
{
"entropy": 0.413848876953125,
"epoch": 2.9523809523809526,
"grad_norm": 0.7670391280421824,
"learning_rate": 5.567585917314535e-06,
"loss": 0.4085,
"mean_token_accuracy": 0.8564633526839316,
"num_tokens": 319936836.0,
"step": 372
},
{
"entropy": 0.41168212890625,
"epoch": 2.9603174603174605,
"grad_norm": 0.8099483587987164,
"learning_rate": 5.545844582051641e-06,
"loss": 0.4053,
"mean_token_accuracy": 0.8578686797991395,
"num_tokens": 320807541.0,
"step": 373
},
{
"entropy": 0.4144439697265625,
"epoch": 2.9682539682539684,
"grad_norm": 0.8134336846221772,
"learning_rate": 5.524092796719507e-06,
"loss": 0.4096,
"mean_token_accuracy": 0.8564304136671126,
"num_tokens": 321676330.0,
"step": 374
},
{
"entropy": 0.41241455078125,
"epoch": 2.9761904761904763,
"grad_norm": 0.7501989763747119,
"learning_rate": 5.502330977751072e-06,
"loss": 0.4012,
"mean_token_accuracy": 0.8606314528733492,
"num_tokens": 322526195.0,
"step": 375
},
{
"entropy": 0.4184417724609375,
"epoch": 2.984126984126984,
"grad_norm": 0.8084127993444857,
"learning_rate": 5.4805595417713634e-06,
"loss": 0.4129,
"mean_token_accuracy": 0.854987567756325,
"num_tokens": 323373321.0,
"step": 376
},
{
"entropy": 0.413482666015625,
"epoch": 2.992063492063492,
"grad_norm": 0.8921476455980862,
"learning_rate": 5.458778905589528e-06,
"loss": 0.4048,
"mean_token_accuracy": 0.8568636071868241,
"num_tokens": 324241487.0,
"step": 377
},
{
"entropy": 0.413818359375,
"epoch": 3.0,
"grad_norm": 0.7212429646275152,
"learning_rate": 5.436989486190846e-06,
"loss": 0.4132,
"mean_token_accuracy": 0.8552796910516918,
"num_tokens": 325114310.0,
"step": 378
},
{
"entropy": 0.41851806640625,
"epoch": 3.007936507936508,
"grad_norm": 0.8309684413468622,
"learning_rate": 5.415191700728749e-06,
"loss": 0.3803,
"mean_token_accuracy": 0.8651906503364444,
"num_tokens": 325956929.0,
"step": 379
},
{
"entropy": 0.415771484375,
"epoch": 3.015873015873016,
"grad_norm": 0.875627308634879,
"learning_rate": 5.393385966516838e-06,
"loss": 0.3949,
"mean_token_accuracy": 0.8609235784970224,
"num_tokens": 326825247.0,
"step": 380
},
{
"entropy": 0.4132232666015625,
"epoch": 3.0238095238095237,
"grad_norm": 0.7660187349203336,
"learning_rate": 5.371572701020891e-06,
"loss": 0.3843,
"mean_token_accuracy": 0.865902341902256,
"num_tokens": 327664768.0,
"step": 381
},
{
"entropy": 0.4121246337890625,
"epoch": 3.0317460317460316,
"grad_norm": 0.876168435455912,
"learning_rate": 5.349752321850866e-06,
"loss": 0.3891,
"mean_token_accuracy": 0.8622553567402065,
"num_tokens": 328521474.0,
"step": 382
},
{
"entropy": 0.4065399169921875,
"epoch": 3.0396825396825395,
"grad_norm": 0.8349975533992948,
"learning_rate": 5.327925246752917e-06,
"loss": 0.3871,
"mean_token_accuracy": 0.8634061855264008,
"num_tokens": 329375199.0,
"step": 383
},
{
"entropy": 0.41015625,
"epoch": 3.0476190476190474,
"grad_norm": 0.7379532101462078,
"learning_rate": 5.306091893601384e-06,
"loss": 0.3854,
"mean_token_accuracy": 0.8652450819499791,
"num_tokens": 330238541.0,
"step": 384
},
{
"entropy": 0.4120330810546875,
"epoch": 3.0555555555555554,
"grad_norm": 0.7995763856973052,
"learning_rate": 5.284252680390803e-06,
"loss": 0.3919,
"mean_token_accuracy": 0.8609937699511647,
"num_tokens": 331111401.0,
"step": 385
},
{
"entropy": 0.412261962890625,
"epoch": 3.0634920634920633,
"grad_norm": 0.7968948726175992,
"learning_rate": 5.2624080252279006e-06,
"loss": 0.3891,
"mean_token_accuracy": 0.8621770567260683,
"num_tokens": 331970501.0,
"step": 386
},
{
"entropy": 0.412689208984375,
"epoch": 3.0714285714285716,
"grad_norm": 0.7567371102755298,
"learning_rate": 5.240558346323582e-06,
"loss": 0.388,
"mean_token_accuracy": 0.8623263705521822,
"num_tokens": 332839444.0,
"step": 387
},
{
"entropy": 0.4113616943359375,
"epoch": 3.0793650793650795,
"grad_norm": 0.8253393750303774,
"learning_rate": 5.218704061984938e-06,
"loss": 0.3805,
"mean_token_accuracy": 0.8644652073271573,
"num_tokens": 333694157.0,
"step": 388
},
{
"entropy": 0.4080963134765625,
"epoch": 3.0873015873015874,
"grad_norm": 0.8150357016122449,
"learning_rate": 5.196845590607225e-06,
"loss": 0.3778,
"mean_token_accuracy": 0.8659757277928293,
"num_tokens": 334553848.0,
"step": 389
},
{
"entropy": 0.411529541015625,
"epoch": 3.0952380952380953,
"grad_norm": 0.810657705832448,
"learning_rate": 5.174983350665861e-06,
"loss": 0.3837,
"mean_token_accuracy": 0.862535847350955,
"num_tokens": 335414382.0,
"step": 390
},
{
"entropy": 0.4155731201171875,
"epoch": 3.1031746031746033,
"grad_norm": 0.7578031832169086,
"learning_rate": 5.153117760708411e-06,
"loss": 0.388,
"mean_token_accuracy": 0.8647559527307749,
"num_tokens": 336270013.0,
"step": 391
},
{
"entropy": 0.4129638671875,
"epoch": 3.111111111111111,
"grad_norm": 0.7853251945488365,
"learning_rate": 5.131249239346574e-06,
"loss": 0.3874,
"mean_token_accuracy": 0.8632673225365579,
"num_tokens": 337153945.0,
"step": 392
},
{
"entropy": 0.41741943359375,
"epoch": 3.119047619047619,
"grad_norm": 0.8526319008895792,
"learning_rate": 5.109378205248177e-06,
"loss": 0.3813,
"mean_token_accuracy": 0.8653798257000744,
"num_tokens": 337986623.0,
"step": 393
},
{
"entropy": 0.41943359375,
"epoch": 3.126984126984127,
"grad_norm": 0.7777939267453691,
"learning_rate": 5.087505077129144e-06,
"loss": 0.3847,
"mean_token_accuracy": 0.8638610797934234,
"num_tokens": 338820053.0,
"step": 394
},
{
"entropy": 0.4080352783203125,
"epoch": 3.134920634920635,
"grad_norm": 0.8050487820823641,
"learning_rate": 5.065630273745495e-06,
"loss": 0.391,
"mean_token_accuracy": 0.8619571630842984,
"num_tokens": 339709184.0,
"step": 395
},
{
"entropy": 0.4158172607421875,
"epoch": 3.142857142857143,
"grad_norm": 0.788047324475754,
"learning_rate": 5.043754213885319e-06,
"loss": 0.3806,
"mean_token_accuracy": 0.8652480882592499,
"num_tokens": 340560422.0,
"step": 396
},
{
"entropy": 0.4126739501953125,
"epoch": 3.1507936507936507,
"grad_norm": 0.7895581256609918,
"learning_rate": 5.021877316360759e-06,
"loss": 0.3857,
"mean_token_accuracy": 0.8641035025939345,
"num_tokens": 341427547.0,
"step": 397
},
{
"entropy": 0.4117431640625,
"epoch": 3.1587301587301586,
"grad_norm": 0.7885237864621762,
"learning_rate": 5e-06,
"loss": 0.3809,
"mean_token_accuracy": 0.8656587679870427,
"num_tokens": 342297536.0,
"step": 398
},
{
"entropy": 0.42266845703125,
"epoch": 3.1666666666666665,
"grad_norm": 0.7347090872403708,
"learning_rate": 4.978122683639241e-06,
"loss": 0.3797,
"mean_token_accuracy": 0.8654965776950121,
"num_tokens": 343108738.0,
"step": 399
},
{
"entropy": 0.41461181640625,
"epoch": 3.1746031746031744,
"grad_norm": 0.7919607175007582,
"learning_rate": 4.956245786114683e-06,
"loss": 0.3805,
"mean_token_accuracy": 0.8652189085260034,
"num_tokens": 343963294.0,
"step": 400
},
{
"entropy": 0.4164276123046875,
"epoch": 3.1825396825396823,
"grad_norm": 0.7982890026738051,
"learning_rate": 4.934369726254506e-06,
"loss": 0.3839,
"mean_token_accuracy": 0.8629119992256165,
"num_tokens": 344812036.0,
"step": 401
},
{
"entropy": 0.4126739501953125,
"epoch": 3.1904761904761907,
"grad_norm": 0.7923716094274679,
"learning_rate": 4.9124949228708566e-06,
"loss": 0.385,
"mean_token_accuracy": 0.8645118903368711,
"num_tokens": 345643006.0,
"step": 402
},
{
"entropy": 0.41131591796875,
"epoch": 3.1984126984126986,
"grad_norm": 0.8224201397236163,
"learning_rate": 4.890621794751825e-06,
"loss": 0.3781,
"mean_token_accuracy": 0.8670813706703484,
"num_tokens": 346526093.0,
"step": 403
},
{
"entropy": 0.4088134765625,
"epoch": 3.2063492063492065,
"grad_norm": 0.748278286726127,
"learning_rate": 4.8687507606534274e-06,
"loss": 0.3869,
"mean_token_accuracy": 0.8644262808375061,
"num_tokens": 347429334.0,
"step": 404
},
{
"entropy": 0.4080810546875,
"epoch": 3.2142857142857144,
"grad_norm": 0.7655034553814312,
"learning_rate": 4.8468822392915925e-06,
"loss": 0.3879,
"mean_token_accuracy": 0.86290636472404,
"num_tokens": 348300345.0,
"step": 405
},
{
"entropy": 0.41357421875,
"epoch": 3.2222222222222223,
"grad_norm": 0.8371130749450066,
"learning_rate": 4.82501664933414e-06,
"loss": 0.3895,
"mean_token_accuracy": 0.8630081634037197,
"num_tokens": 349174728.0,
"step": 406
},
{
"entropy": 0.414794921875,
"epoch": 3.2301587301587302,
"grad_norm": 0.7863015466902707,
"learning_rate": 4.803154409392776e-06,
"loss": 0.3827,
"mean_token_accuracy": 0.864469132386148,
"num_tokens": 350019662.0,
"step": 407
},
{
"entropy": 0.411102294921875,
"epoch": 3.238095238095238,
"grad_norm": 0.7465717328173177,
"learning_rate": 4.781295938015063e-06,
"loss": 0.3831,
"mean_token_accuracy": 0.8645726442337036,
"num_tokens": 350867252.0,
"step": 408
},
{
"entropy": 0.41094970703125,
"epoch": 3.246031746031746,
"grad_norm": 0.7655319214119843,
"learning_rate": 4.759441653676419e-06,
"loss": 0.3788,
"mean_token_accuracy": 0.8645767103880644,
"num_tokens": 351713015.0,
"step": 409
},
{
"entropy": 0.4076080322265625,
"epoch": 3.253968253968254,
"grad_norm": 0.7922787284316188,
"learning_rate": 4.737591974772102e-06,
"loss": 0.383,
"mean_token_accuracy": 0.8641512831673026,
"num_tokens": 352612442.0,
"step": 410
},
{
"entropy": 0.41400146484375,
"epoch": 3.261904761904762,
"grad_norm": 0.7897648138284737,
"learning_rate": 4.715747319609199e-06,
"loss": 0.3808,
"mean_token_accuracy": 0.8653241745196283,
"num_tokens": 353475199.0,
"step": 411
},
{
"entropy": 0.4135589599609375,
"epoch": 3.2698412698412698,
"grad_norm": 0.7900462489736286,
"learning_rate": 4.693908106398617e-06,
"loss": 0.3805,
"mean_token_accuracy": 0.8655834072269499,
"num_tokens": 354336321.0,
"step": 412
},
{
"entropy": 0.413421630859375,
"epoch": 3.2777777777777777,
"grad_norm": 0.8361561340006152,
"learning_rate": 4.6720747532470845e-06,
"loss": 0.3909,
"mean_token_accuracy": 0.8628287450410426,
"num_tokens": 355204238.0,
"step": 413
},
{
"entropy": 0.41632080078125,
"epoch": 3.2857142857142856,
"grad_norm": 0.7192804861618904,
"learning_rate": 4.650247678149135e-06,
"loss": 0.3822,
"mean_token_accuracy": 0.8637247635051608,
"num_tokens": 356039432.0,
"step": 414
},
{
"entropy": 0.417022705078125,
"epoch": 3.2936507936507935,
"grad_norm": 0.7884051950710803,
"learning_rate": 4.628427298979111e-06,
"loss": 0.3834,
"mean_token_accuracy": 0.8649981459602714,
"num_tokens": 356872476.0,
"step": 415
},
{
"entropy": 0.4183197021484375,
"epoch": 3.3015873015873014,
"grad_norm": 0.7660377464634954,
"learning_rate": 4.606614033483164e-06,
"loss": 0.3733,
"mean_token_accuracy": 0.8667362979613245,
"num_tokens": 357702880.0,
"step": 416
},
{
"entropy": 0.4095458984375,
"epoch": 3.3095238095238093,
"grad_norm": 1.502576186555255,
"learning_rate": 4.5848082992712516e-06,
"loss": 0.3851,
"mean_token_accuracy": 0.8638884532265365,
"num_tokens": 358593212.0,
"step": 417
},
{
"entropy": 0.411895751953125,
"epoch": 3.317460317460317,
"grad_norm": 0.8516989355397148,
"learning_rate": 4.563010513809156e-06,
"loss": 0.374,
"mean_token_accuracy": 0.8689160253852606,
"num_tokens": 359427196.0,
"step": 418
},
{
"entropy": 0.415771484375,
"epoch": 3.3253968253968256,
"grad_norm": 0.8333941577643297,
"learning_rate": 4.541221094410473e-06,
"loss": 0.3886,
"mean_token_accuracy": 0.8632900207303464,
"num_tokens": 360297417.0,
"step": 419
},
{
"entropy": 0.4151153564453125,
"epoch": 3.3333333333333335,
"grad_norm": 0.9185443535766896,
"learning_rate": 4.519440458228638e-06,
"loss": 0.3929,
"mean_token_accuracy": 0.8609873973764479,
"num_tokens": 361158991.0,
"step": 420
},
{
"entropy": 0.407745361328125,
"epoch": 3.3412698412698414,
"grad_norm": 0.7923581076176655,
"learning_rate": 4.497669022248931e-06,
"loss": 0.3768,
"mean_token_accuracy": 0.86583196464926,
"num_tokens": 362026434.0,
"step": 421
},
{
"entropy": 0.4108734130859375,
"epoch": 3.3492063492063493,
"grad_norm": 0.8878219569934822,
"learning_rate": 4.475907203280494e-06,
"loss": 0.3874,
"mean_token_accuracy": 0.862205957993865,
"num_tokens": 362894932.0,
"step": 422
},
{
"entropy": 0.4105682373046875,
"epoch": 3.357142857142857,
"grad_norm": 0.8849923219918312,
"learning_rate": 4.45415541794836e-06,
"loss": 0.3861,
"mean_token_accuracy": 0.8655079673044384,
"num_tokens": 363774304.0,
"step": 423
},
{
"entropy": 0.415740966796875,
"epoch": 3.365079365079365,
"grad_norm": 0.7854653600781764,
"learning_rate": 4.432414082685466e-06,
"loss": 0.3759,
"mean_token_accuracy": 0.8663219287991524,
"num_tokens": 364624495.0,
"step": 424
},
{
"entropy": 0.4079132080078125,
"epoch": 3.373015873015873,
"grad_norm": 0.7932914584936744,
"learning_rate": 4.410683613724684e-06,
"loss": 0.3827,
"mean_token_accuracy": 0.8652459299191833,
"num_tokens": 365522341.0,
"step": 425
},
{
"entropy": 0.4139251708984375,
"epoch": 3.380952380952381,
"grad_norm": 0.8623623236366137,
"learning_rate": 4.388964427090855e-06,
"loss": 0.3818,
"mean_token_accuracy": 0.8645171159878373,
"num_tokens": 366384352.0,
"step": 426
},
{
"entropy": 0.406494140625,
"epoch": 3.388888888888889,
"grad_norm": 0.8905833082641034,
"learning_rate": 4.367256938592822e-06,
"loss": 0.3883,
"mean_token_accuracy": 0.8634491441771388,
"num_tokens": 367231565.0,
"step": 427
},
{
"entropy": 0.409393310546875,
"epoch": 3.3968253968253967,
"grad_norm": 0.8362209111342414,
"learning_rate": 4.345561563815471e-06,
"loss": 0.3722,
"mean_token_accuracy": 0.8681328790262341,
"num_tokens": 368105765.0,
"step": 428
},
{
"entropy": 0.4090118408203125,
"epoch": 3.4047619047619047,
"grad_norm": 0.7875786753075623,
"learning_rate": 4.323878718111771e-06,
"loss": 0.3815,
"mean_token_accuracy": 0.8649689424782991,
"num_tokens": 368976566.0,
"step": 429
},
{
"entropy": 0.4091339111328125,
"epoch": 3.4126984126984126,
"grad_norm": 0.8242239527343677,
"learning_rate": 4.302208816594829e-06,
"loss": 0.3775,
"mean_token_accuracy": 0.8660351554863155,
"num_tokens": 369847661.0,
"step": 430
},
{
"entropy": 0.4122161865234375,
"epoch": 3.4206349206349205,
"grad_norm": 0.8253724820932592,
"learning_rate": 4.280552274129932e-06,
"loss": 0.3832,
"mean_token_accuracy": 0.8640619218349457,
"num_tokens": 370716519.0,
"step": 431
},
{
"entropy": 0.4095916748046875,
"epoch": 3.4285714285714284,
"grad_norm": 0.7522197274851726,
"learning_rate": 4.258909505326617e-06,
"loss": 0.3747,
"mean_token_accuracy": 0.8674253430217505,
"num_tokens": 371548994.0,
"step": 432
},
{
"entropy": 0.4106597900390625,
"epoch": 3.4365079365079367,
"grad_norm": 0.7580899790250368,
"learning_rate": 4.237280924530723e-06,
"loss": 0.3731,
"mean_token_accuracy": 0.8671010048128664,
"num_tokens": 372388644.0,
"step": 433
},
{
"entropy": 0.4091644287109375,
"epoch": 3.4444444444444446,
"grad_norm": 0.7966902079320883,
"learning_rate": 4.215666945816469e-06,
"loss": 0.3824,
"mean_token_accuracy": 0.8636285294778645,
"num_tokens": 373216255.0,
"step": 434
},
{
"entropy": 0.4067840576171875,
"epoch": 3.4523809523809526,
"grad_norm": 0.7259667199168536,
"learning_rate": 4.194067982978516e-06,
"loss": 0.3744,
"mean_token_accuracy": 0.8671418204903603,
"num_tokens": 374091119.0,
"step": 435
},
{
"entropy": 0.4103546142578125,
"epoch": 3.4603174603174605,
"grad_norm": 0.7168906340219251,
"learning_rate": 4.172484449524047e-06,
"loss": 0.3806,
"mean_token_accuracy": 0.8649011980742216,
"num_tokens": 374964989.0,
"step": 436
},
{
"entropy": 0.4099578857421875,
"epoch": 3.4682539682539684,
"grad_norm": 0.7226054435885343,
"learning_rate": 4.150916758664857e-06,
"loss": 0.3743,
"mean_token_accuracy": 0.86769935535267,
"num_tokens": 375849312.0,
"step": 437
},
{
"entropy": 0.4129791259765625,
"epoch": 3.4761904761904763,
"grad_norm": 0.7254203102984561,
"learning_rate": 4.129365323309436e-06,
"loss": 0.3886,
"mean_token_accuracy": 0.8636708622798324,
"num_tokens": 376711755.0,
"step": 438
},
{
"entropy": 0.4055328369140625,
"epoch": 3.484126984126984,
"grad_norm": 0.7258886081350053,
"learning_rate": 4.107830556055072e-06,
"loss": 0.377,
"mean_token_accuracy": 0.865284236613661,
"num_tokens": 377601829.0,
"step": 439
},
{
"entropy": 0.4116363525390625,
"epoch": 3.492063492063492,
"grad_norm": 0.7385419803324385,
"learning_rate": 4.086312869179938e-06,
"loss": 0.3811,
"mean_token_accuracy": 0.8655071114189923,
"num_tokens": 378449007.0,
"step": 440
},
{
"entropy": 0.409393310546875,
"epoch": 3.5,
"grad_norm": 0.7975589365886925,
"learning_rate": 4.06481267463521e-06,
"loss": 0.3746,
"mean_token_accuracy": 0.867948766797781,
"num_tokens": 379309394.0,
"step": 441
},
{
"entropy": 0.4044036865234375,
"epoch": 3.507936507936508,
"grad_norm": 0.7484471478807218,
"learning_rate": 4.04333038403718e-06,
"loss": 0.3755,
"mean_token_accuracy": 0.8677190546877682,
"num_tokens": 380174726.0,
"step": 442
},
{
"entropy": 0.406707763671875,
"epoch": 3.515873015873016,
"grad_norm": 0.7052307989651647,
"learning_rate": 4.021866408659368e-06,
"loss": 0.3766,
"mean_token_accuracy": 0.8668166692368686,
"num_tokens": 381047802.0,
"step": 443
},
{
"entropy": 0.4095001220703125,
"epoch": 3.5238095238095237,
"grad_norm": 0.7205324024463486,
"learning_rate": 4.000421159424658e-06,
"loss": 0.3782,
"mean_token_accuracy": 0.8670969372615218,
"num_tokens": 381900519.0,
"step": 444
},
{
"entropy": 0.40911865234375,
"epoch": 3.5317460317460316,
"grad_norm": 0.6751185639712526,
"learning_rate": 3.978995046897425e-06,
"loss": 0.3811,
"mean_token_accuracy": 0.8652258133515716,
"num_tokens": 382738529.0,
"step": 445
},
{
"entropy": 0.4097747802734375,
"epoch": 3.5396825396825395,
"grad_norm": 0.8003232347426622,
"learning_rate": 3.957588481275674e-06,
"loss": 0.3813,
"mean_token_accuracy": 0.8646343694999814,
"num_tokens": 383603819.0,
"step": 446
},
{
"entropy": 0.4104156494140625,
"epoch": 3.5476190476190474,
"grad_norm": 0.7612125218536709,
"learning_rate": 3.9362018723831915e-06,
"loss": 0.3834,
"mean_token_accuracy": 0.8642422612756491,
"num_tokens": 384466493.0,
"step": 447
},
{
"entropy": 0.40789794921875,
"epoch": 3.5555555555555554,
"grad_norm": 0.7301586930422078,
"learning_rate": 3.914835629661695e-06,
"loss": 0.3691,
"mean_token_accuracy": 0.8685760577209294,
"num_tokens": 385303493.0,
"step": 448
},
{
"entropy": 0.4090576171875,
"epoch": 3.5634920634920633,
"grad_norm": 0.8028620237168601,
"learning_rate": 3.893490162162997e-06,
"loss": 0.3772,
"mean_token_accuracy": 0.8661560285836458,
"num_tokens": 386139059.0,
"step": 449
},
{
"entropy": 0.4043426513671875,
"epoch": 3.571428571428571,
"grad_norm": 0.8038126363701456,
"learning_rate": 3.872165878541175e-06,
"loss": 0.3819,
"mean_token_accuracy": 0.8657438950613141,
"num_tokens": 387035788.0,
"step": 450
},
{
"entropy": 0.4053955078125,
"epoch": 3.5793650793650795,
"grad_norm": 0.7971696805205959,
"learning_rate": 3.850863187044745e-06,
"loss": 0.3783,
"mean_token_accuracy": 0.8658295255154371,
"num_tokens": 387893370.0,
"step": 451
},
{
"entropy": 0.4118804931640625,
"epoch": 3.5873015873015874,
"grad_norm": 0.7382565780061542,
"learning_rate": 3.829582495508844e-06,
"loss": 0.3774,
"mean_token_accuracy": 0.8669222141616046,
"num_tokens": 388743564.0,
"step": 452
},
{
"entropy": 0.4039764404296875,
"epoch": 3.5952380952380953,
"grad_norm": 0.762934354969153,
"learning_rate": 3.808324211347429e-06,
"loss": 0.3846,
"mean_token_accuracy": 0.8625594675540924,
"num_tokens": 389636739.0,
"step": 453
},
{
"entropy": 0.4062957763671875,
"epoch": 3.6031746031746033,
"grad_norm": 0.7869576244237314,
"learning_rate": 3.7870887415454687e-06,
"loss": 0.3772,
"mean_token_accuracy": 0.8657813919708133,
"num_tokens": 390510641.0,
"step": 454
},
{
"entropy": 0.4098968505859375,
"epoch": 3.611111111111111,
"grad_norm": 0.7351333487875162,
"learning_rate": 3.7658764926511613e-06,
"loss": 0.3659,
"mean_token_accuracy": 0.868850149679929,
"num_tokens": 391336036.0,
"step": 455
},
{
"entropy": 0.40362548828125,
"epoch": 3.619047619047619,
"grad_norm": 0.734375379903563,
"learning_rate": 3.7446878707681413e-06,
"loss": 0.373,
"mean_token_accuracy": 0.8676298609934747,
"num_tokens": 392197671.0,
"step": 456
},
{
"entropy": 0.4044189453125,
"epoch": 3.626984126984127,
"grad_norm": 0.8070921691017913,
"learning_rate": 3.7235232815477123e-06,
"loss": 0.3723,
"mean_token_accuracy": 0.8680153395980597,
"num_tokens": 393048877.0,
"step": 457
},
{
"entropy": 0.4051971435546875,
"epoch": 3.634920634920635,
"grad_norm": 0.7382078569033306,
"learning_rate": 3.7023831301810765e-06,
"loss": 0.3807,
"mean_token_accuracy": 0.8656099583022296,
"num_tokens": 393913386.0,
"step": 458
},
{
"entropy": 0.412567138671875,
"epoch": 3.642857142857143,
"grad_norm": 0.7500851651198606,
"learning_rate": 3.6812678213915777e-06,
"loss": 0.3753,
"mean_token_accuracy": 0.866928874514997,
"num_tokens": 394741069.0,
"step": 459
},
{
"entropy": 0.4068450927734375,
"epoch": 3.6507936507936507,
"grad_norm": 0.7014301015459137,
"learning_rate": 3.6601777594269605e-06,
"loss": 0.3716,
"mean_token_accuracy": 0.8681608587503433,
"num_tokens": 395587154.0,
"step": 460
},
{
"entropy": 0.4112701416015625,
"epoch": 3.6587301587301586,
"grad_norm": 0.7453844502592517,
"learning_rate": 3.6391133480516196e-06,
"loss": 0.37,
"mean_token_accuracy": 0.8680603308603168,
"num_tokens": 396429106.0,
"step": 461
},
{
"entropy": 0.40789794921875,
"epoch": 3.6666666666666665,
"grad_norm": 0.6714935595868241,
"learning_rate": 3.618074990538873e-06,
"loss": 0.3764,
"mean_token_accuracy": 0.8662193124182522,
"num_tokens": 397306106.0,
"step": 462
},
{
"entropy": 0.404541015625,
"epoch": 3.674603174603175,
"grad_norm": 0.7264996158823219,
"learning_rate": 3.5970630896632485e-06,
"loss": 0.3679,
"mean_token_accuracy": 0.8692012121900916,
"num_tokens": 398186044.0,
"step": 463
},
{
"entropy": 0.4099273681640625,
"epoch": 3.682539682539683,
"grad_norm": 0.6812527161900478,
"learning_rate": 3.5760780476927637e-06,
"loss": 0.3783,
"mean_token_accuracy": 0.866292960010469,
"num_tokens": 399059367.0,
"step": 464
},
{
"entropy": 0.4114532470703125,
"epoch": 3.6904761904761907,
"grad_norm": 0.7396202860678279,
"learning_rate": 3.5551202663812344e-06,
"loss": 0.3671,
"mean_token_accuracy": 0.8694238997995853,
"num_tokens": 399921480.0,
"step": 465
},
{
"entropy": 0.41082763671875,
"epoch": 3.6984126984126986,
"grad_norm": 0.7505615061152298,
"learning_rate": 3.534190146960571e-06,
"loss": 0.3738,
"mean_token_accuracy": 0.8666451787576079,
"num_tokens": 400768716.0,
"step": 466
},
{
"entropy": 0.407745361328125,
"epoch": 3.7063492063492065,
"grad_norm": 0.7716901466331328,
"learning_rate": 3.5132880901331067e-06,
"loss": 0.3836,
"mean_token_accuracy": 0.8653549118898809,
"num_tokens": 401643010.0,
"step": 467
},
{
"entropy": 0.411224365234375,
"epoch": 3.7142857142857144,
"grad_norm": 0.7425696552972633,
"learning_rate": 3.492414496063921e-06,
"loss": 0.3699,
"mean_token_accuracy": 0.8682533628307283,
"num_tokens": 402482222.0,
"step": 468
},
{
"entropy": 0.4105377197265625,
"epoch": 3.7222222222222223,
"grad_norm": 0.7549782563712677,
"learning_rate": 3.4715697643731828e-06,
"loss": 0.374,
"mean_token_accuracy": 0.8664119308814406,
"num_tokens": 403330184.0,
"step": 469
},
{
"entropy": 0.4114990234375,
"epoch": 3.7301587301587302,
"grad_norm": 0.6882214771400156,
"learning_rate": 3.4507542941284933e-06,
"loss": 0.3772,
"mean_token_accuracy": 0.8662938089109957,
"num_tokens": 404170985.0,
"step": 470
},
{
"entropy": 0.4065399169921875,
"epoch": 3.738095238095238,
"grad_norm": 0.7790709755576551,
"learning_rate": 3.4299684838372547e-06,
"loss": 0.3702,
"mean_token_accuracy": 0.8684107572771609,
"num_tokens": 405023111.0,
"step": 471
},
{
"entropy": 0.4109344482421875,
"epoch": 3.746031746031746,
"grad_norm": 0.8999085919269414,
"learning_rate": 3.4092127314390354e-06,
"loss": 0.3733,
"mean_token_accuracy": 0.8679695804603398,
"num_tokens": 405909984.0,
"step": 472
},
{
"entropy": 0.410552978515625,
"epoch": 3.753968253968254,
"grad_norm": 0.7852309398323011,
"learning_rate": 3.388487434297949e-06,
"loss": 0.3726,
"mean_token_accuracy": 0.868429503403604,
"num_tokens": 406762973.0,
"step": 473
},
{
"entropy": 0.4063873291015625,
"epoch": 3.761904761904762,
"grad_norm": 0.7497329657708961,
"learning_rate": 3.3677929891950527e-06,
"loss": 0.3675,
"mean_token_accuracy": 0.8680104180239141,
"num_tokens": 407632013.0,
"step": 474
},
{
"entropy": 0.4088287353515625,
"epoch": 3.7698412698412698,
"grad_norm": 0.9959878902569155,
"learning_rate": 3.347129792320748e-06,
"loss": 0.3803,
"mean_token_accuracy": 0.8661512886174023,
"num_tokens": 408479038.0,
"step": 475
},
{
"entropy": 0.400726318359375,
"epoch": 3.7777777777777777,
"grad_norm": 0.7083662937552805,
"learning_rate": 3.3264982392671973e-06,
"loss": 0.3707,
"mean_token_accuracy": 0.8689758381806314,
"num_tokens": 409381423.0,
"step": 476
},
{
"entropy": 0.4104156494140625,
"epoch": 3.7857142857142856,
"grad_norm": 0.7381897728110274,
"learning_rate": 3.3058987250207476e-06,
"loss": 0.3677,
"mean_token_accuracy": 0.869597565382719,
"num_tokens": 410237991.0,
"step": 477
},
{
"entropy": 0.4114227294921875,
"epoch": 3.7936507936507935,
"grad_norm": 0.7658185988150478,
"learning_rate": 3.285331643954372e-06,
"loss": 0.3779,
"mean_token_accuracy": 0.8658644729293883,
"num_tokens": 411113505.0,
"step": 478
},
{
"entropy": 0.4087371826171875,
"epoch": 3.8015873015873014,
"grad_norm": 0.6773211053527161,
"learning_rate": 3.2647973898201157e-06,
"loss": 0.3692,
"mean_token_accuracy": 0.8686946122907102,
"num_tokens": 411973035.0,
"step": 479
},
{
"entropy": 0.41082763671875,
"epoch": 3.8095238095238093,
"grad_norm": 0.6795058958160979,
"learning_rate": 3.244296355741561e-06,
"loss": 0.3792,
"mean_token_accuracy": 0.864706945605576,
"num_tokens": 412804401.0,
"step": 480
},
{
"entropy": 0.411712646484375,
"epoch": 3.817460317460317,
"grad_norm": 0.7172393778753827,
"learning_rate": 3.2238289342063013e-06,
"loss": 0.3741,
"mean_token_accuracy": 0.8672458734363317,
"num_tokens": 413645879.0,
"step": 481
},
{
"entropy": 0.40325927734375,
"epoch": 3.825396825396825,
"grad_norm": 0.7137250398041949,
"learning_rate": 3.203395517058423e-06,
"loss": 0.3815,
"mean_token_accuracy": 0.8636117246933281,
"num_tokens": 414526310.0,
"step": 482
},
{
"entropy": 0.40850830078125,
"epoch": 3.8333333333333335,
"grad_norm": 0.7060925413908444,
"learning_rate": 3.1829964954910076e-06,
"loss": 0.3744,
"mean_token_accuracy": 0.8668604497797787,
"num_tokens": 415386155.0,
"step": 483
},
{
"entropy": 0.4058837890625,
"epoch": 3.8412698412698414,
"grad_norm": 0.7071475776511494,
"learning_rate": 3.1626322600386418e-06,
"loss": 0.369,
"mean_token_accuracy": 0.8673932519741356,
"num_tokens": 416255235.0,
"step": 484
},
{
"entropy": 0.4070587158203125,
"epoch": 3.8492063492063493,
"grad_norm": 0.716470253050545,
"learning_rate": 3.1423032005699377e-06,
"loss": 0.3776,
"mean_token_accuracy": 0.8670607698149979,
"num_tokens": 417123335.0,
"step": 485
},
{
"entropy": 0.4124298095703125,
"epoch": 3.857142857142857,
"grad_norm": 0.7331008247958041,
"learning_rate": 3.122009706280072e-06,
"loss": 0.3725,
"mean_token_accuracy": 0.8690087418071926,
"num_tokens": 417961712.0,
"step": 486
},
{
"entropy": 0.407073974609375,
"epoch": 3.865079365079365,
"grad_norm": 0.7215581902390997,
"learning_rate": 3.1017521656833384e-06,
"loss": 0.3738,
"mean_token_accuracy": 0.8678515437059104,
"num_tokens": 418833226.0,
"step": 487
},
{
"entropy": 0.4102020263671875,
"epoch": 3.873015873015873,
"grad_norm": 0.6750907563285418,
"learning_rate": 3.0815309666057013e-06,
"loss": 0.3798,
"mean_token_accuracy": 0.8680808427743614,
"num_tokens": 419693605.0,
"step": 488
},
{
"entropy": 0.4107666015625,
"epoch": 3.880952380952381,
"grad_norm": 0.7253529027679292,
"learning_rate": 3.061346496177374e-06,
"loss": 0.3765,
"mean_token_accuracy": 0.8668225076980889,
"num_tokens": 420533126.0,
"step": 489
},
{
"entropy": 0.403076171875,
"epoch": 3.888888888888889,
"grad_norm": 0.774847139075562,
"learning_rate": 3.0411991408254116e-06,
"loss": 0.3734,
"mean_token_accuracy": 0.8675551642663777,
"num_tokens": 421408158.0,
"step": 490
},
{
"entropy": 0.407684326171875,
"epoch": 3.8968253968253967,
"grad_norm": 0.7430473963918918,
"learning_rate": 3.0210892862663043e-06,
"loss": 0.3669,
"mean_token_accuracy": 0.8696022001095116,
"num_tokens": 422285612.0,
"step": 491
},
{
"entropy": 0.40545654296875,
"epoch": 3.9047619047619047,
"grad_norm": 0.7359917176077896,
"learning_rate": 3.001017317498607e-06,
"loss": 0.3654,
"mean_token_accuracy": 0.8683894919231534,
"num_tokens": 423145278.0,
"step": 492
},
{
"entropy": 0.4095001220703125,
"epoch": 3.9126984126984126,
"grad_norm": 0.6995695516835777,
"learning_rate": 2.9809836187955532e-06,
"loss": 0.3759,
"mean_token_accuracy": 0.8677502269856632,
"num_tokens": 423982368.0,
"step": 493
},
{
"entropy": 0.4052734375,
"epoch": 3.9206349206349205,
"grad_norm": 0.7865083776711185,
"learning_rate": 2.960988573697705e-06,
"loss": 0.3769,
"mean_token_accuracy": 0.8668651487678289,
"num_tokens": 424855995.0,
"step": 494
},
{
"entropy": 0.4091949462890625,
"epoch": 3.928571428571429,
"grad_norm": 0.7069460296016946,
"learning_rate": 2.941032565005613e-06,
"loss": 0.3734,
"mean_token_accuracy": 0.8668679501861334,
"num_tokens": 425714116.0,
"step": 495
},
{
"entropy": 0.4059906005859375,
"epoch": 3.9365079365079367,
"grad_norm": 0.7074879125016784,
"learning_rate": 2.9211159747724813e-06,
"loss": 0.3702,
"mean_token_accuracy": 0.86768330167979,
"num_tokens": 426587317.0,
"step": 496
},
{
"entropy": 0.4085845947265625,
"epoch": 3.9444444444444446,
"grad_norm": 0.6887885189112549,
"learning_rate": 2.90123918429686e-06,
"loss": 0.3693,
"mean_token_accuracy": 0.8686679415404797,
"num_tokens": 427459701.0,
"step": 497
},
{
"entropy": 0.4090728759765625,
"epoch": 3.9523809523809526,
"grad_norm": 0.7038820990298879,
"learning_rate": 2.881402574115344e-06,
"loss": 0.3693,
"mean_token_accuracy": 0.869040944147855,
"num_tokens": 428310157.0,
"step": 498
},
{
"entropy": 0.406158447265625,
"epoch": 3.9603174603174605,
"grad_norm": 0.7223999433933582,
"learning_rate": 2.8616065239952763e-06,
"loss": 0.3706,
"mean_token_accuracy": 0.8684421242214739,
"num_tokens": 429155049.0,
"step": 499
},
{
"entropy": 0.4044036865234375,
"epoch": 3.9682539682539684,
"grad_norm": 0.7016426279147887,
"learning_rate": 2.841851412927495e-06,
"loss": 0.3706,
"mean_token_accuracy": 0.8662050706334412,
"num_tokens": 430044250.0,
"step": 500
},
{
"entropy": 0.405242919921875,
"epoch": 3.9761904761904763,
"grad_norm": 0.6864573418269565,
"learning_rate": 2.822137619119065e-06,
"loss": 0.365,
"mean_token_accuracy": 0.8711186717264354,
"num_tokens": 430927931.0,
"step": 501
},
{
"entropy": 0.40313720703125,
"epoch": 3.984126984126984,
"grad_norm": 0.7183772106536512,
"learning_rate": 2.8024655199860495e-06,
"loss": 0.3682,
"mean_token_accuracy": 0.8691108208149672,
"num_tokens": 431777495.0,
"step": 502
},
{
"entropy": 0.4088134765625,
"epoch": 3.992063492063492,
"grad_norm": 0.7147302557020478,
"learning_rate": 2.7828354921462668e-06,
"loss": 0.3622,
"mean_token_accuracy": 0.8704049359075725,
"num_tokens": 432616684.0,
"step": 503
},
{
"entropy": 0.40557861328125,
"epoch": 4.0,
"grad_norm": 0.724778260633041,
"learning_rate": 2.7632479114120963e-06,
"loss": 0.367,
"mean_token_accuracy": 0.8679725076071918,
"num_tokens": 433464885.0,
"step": 504
},
{
"entropy": 0.3993377685546875,
"epoch": 4.007936507936508,
"grad_norm": 0.7300051389709427,
"learning_rate": 2.7437031527832747e-06,
"loss": 0.3473,
"mean_token_accuracy": 0.8751431121490896,
"num_tokens": 434354281.0,
"step": 505
},
{
"entropy": 0.40277099609375,
"epoch": 4.015873015873016,
"grad_norm": 0.7518820580172925,
"learning_rate": 2.72420159043972e-06,
"loss": 0.351,
"mean_token_accuracy": 0.8758351663127542,
"num_tokens": 435254037.0,
"step": 506
},
{
"entropy": 0.40679931640625,
"epoch": 4.023809523809524,
"grad_norm": 0.75823317605801,
"learning_rate": 2.704743597734365e-06,
"loss": 0.3449,
"mean_token_accuracy": 0.8767664707265794,
"num_tokens": 436096499.0,
"step": 507
},
{
"entropy": 0.4034576416015625,
"epoch": 4.031746031746032,
"grad_norm": 0.781839052148236,
"learning_rate": 2.685329547186018e-06,
"loss": 0.349,
"mean_token_accuracy": 0.8767024255357683,
"num_tokens": 436936261.0,
"step": 508
},
{
"entropy": 0.3980255126953125,
"epoch": 4.0396825396825395,
"grad_norm": 0.8874815588264233,
"learning_rate": 2.665959810472219e-06,
"loss": 0.3457,
"mean_token_accuracy": 0.8768184627406299,
"num_tokens": 437789126.0,
"step": 509
},
{
"entropy": 0.400390625,
"epoch": 4.0476190476190474,
"grad_norm": 0.8438997097584569,
"learning_rate": 2.6466347584221314e-06,
"loss": 0.3488,
"mean_token_accuracy": 0.8754238770343363,
"num_tokens": 438642725.0,
"step": 510
},
{
"entropy": 0.399810791015625,
"epoch": 4.055555555555555,
"grad_norm": 0.7914559178878162,
"learning_rate": 2.6273547610094408e-06,
"loss": 0.3568,
"mean_token_accuracy": 0.8729163003154099,
"num_tokens": 439509332.0,
"step": 511
},
{
"entropy": 0.4043426513671875,
"epoch": 4.063492063492063,
"grad_norm": 0.8018090810161909,
"learning_rate": 2.608120187345273e-06,
"loss": 0.3589,
"mean_token_accuracy": 0.8719246378168464,
"num_tokens": 440358824.0,
"step": 512
},
{
"entropy": 0.40130615234375,
"epoch": 4.071428571428571,
"grad_norm": 0.717052204902591,
"learning_rate": 2.588931405671127e-06,
"loss": 0.347,
"mean_token_accuracy": 0.876106639392674,
"num_tokens": 441231571.0,
"step": 513
},
{
"entropy": 0.40350341796875,
"epoch": 4.079365079365079,
"grad_norm": 0.7833684120919279,
"learning_rate": 2.5697887833518215e-06,
"loss": 0.3481,
"mean_token_accuracy": 0.874689971562475,
"num_tokens": 442070234.0,
"step": 514
},
{
"entropy": 0.400054931640625,
"epoch": 4.087301587301587,
"grad_norm": 0.7624057899379104,
"learning_rate": 2.5506926868684683e-06,
"loss": 0.354,
"mean_token_accuracy": 0.8740922566503286,
"num_tokens": 442955553.0,
"step": 515
},
{
"entropy": 0.4012908935546875,
"epoch": 4.095238095238095,
"grad_norm": 0.7656669668346728,
"learning_rate": 2.5316434818114517e-06,
"loss": 0.3412,
"mean_token_accuracy": 0.8769300729036331,
"num_tokens": 443803905.0,
"step": 516
},
{
"entropy": 0.398406982421875,
"epoch": 4.103174603174603,
"grad_norm": 0.7384783736599843,
"learning_rate": 2.5126415328734275e-06,
"loss": 0.3549,
"mean_token_accuracy": 0.875963733997196,
"num_tokens": 444676270.0,
"step": 517
},
{
"entropy": 0.3984222412109375,
"epoch": 4.111111111111111,
"grad_norm": 0.763214655206926,
"learning_rate": 2.4936872038423516e-06,
"loss": 0.3527,
"mean_token_accuracy": 0.8742309152148664,
"num_tokens": 445551871.0,
"step": 518
},
{
"entropy": 0.4028167724609375,
"epoch": 4.119047619047619,
"grad_norm": 0.7566995169023817,
"learning_rate": 2.4747808575945006e-06,
"loss": 0.351,
"mean_token_accuracy": 0.8753285491839051,
"num_tokens": 446395494.0,
"step": 519
},
{
"entropy": 0.4046173095703125,
"epoch": 4.1269841269841265,
"grad_norm": 0.7704383123576651,
"learning_rate": 2.4559228560875336e-06,
"loss": 0.3489,
"mean_token_accuracy": 0.8744379128329456,
"num_tokens": 447255878.0,
"step": 520
},
{
"entropy": 0.4016876220703125,
"epoch": 4.134920634920635,
"grad_norm": 0.689868389625917,
"learning_rate": 2.4371135603535613e-06,
"loss": 0.3475,
"mean_token_accuracy": 0.8762855334207416,
"num_tokens": 448095199.0,
"step": 521
},
{
"entropy": 0.4026947021484375,
"epoch": 4.142857142857143,
"grad_norm": 0.7668680601519664,
"learning_rate": 2.4183533304922336e-06,
"loss": 0.3459,
"mean_token_accuracy": 0.8746473412029445,
"num_tokens": 448954987.0,
"step": 522
},
{
"entropy": 0.4035491943359375,
"epoch": 4.150793650793651,
"grad_norm": 0.72483989286729,
"learning_rate": 2.399642525663843e-06,
"loss": 0.3558,
"mean_token_accuracy": 0.8751249178312719,
"num_tokens": 449829424.0,
"step": 523
},
{
"entropy": 0.3994140625,
"epoch": 4.158730158730159,
"grad_norm": 0.706119007803981,
"learning_rate": 2.380981504082459e-06,
"loss": 0.349,
"mean_token_accuracy": 0.875414258800447,
"num_tokens": 450685443.0,
"step": 524
},
{
"entropy": 0.3993988037109375,
"epoch": 4.166666666666667,
"grad_norm": 0.7212946770565037,
"learning_rate": 2.3623706230090517e-06,
"loss": 0.3557,
"mean_token_accuracy": 0.8735111146233976,
"num_tokens": 451597512.0,
"step": 525
},
{
"entropy": 0.4042205810546875,
"epoch": 4.174603174603175,
"grad_norm": 0.7005628568870317,
"learning_rate": 2.3438102387446686e-06,
"loss": 0.3469,
"mean_token_accuracy": 0.8763788240030408,
"num_tokens": 452424123.0,
"step": 526
},
{
"entropy": 0.3973541259765625,
"epoch": 4.182539682539683,
"grad_norm": 0.7155495360306364,
"learning_rate": 2.325300706623607e-06,
"loss": 0.353,
"mean_token_accuracy": 0.8734072712250054,
"num_tokens": 453294509.0,
"step": 527
},
{
"entropy": 0.4000091552734375,
"epoch": 4.190476190476191,
"grad_norm": 0.7006937562520844,
"learning_rate": 2.3068423810066085e-06,
"loss": 0.3528,
"mean_token_accuracy": 0.8746827309951186,
"num_tokens": 454176550.0,
"step": 528
},
{
"entropy": 0.404052734375,
"epoch": 4.198412698412699,
"grad_norm": 0.6995025837491133,
"learning_rate": 2.288435615274085e-06,
"loss": 0.3579,
"mean_token_accuracy": 0.8727600080892444,
"num_tokens": 455027872.0,
"step": 529
},
{
"entropy": 0.4065704345703125,
"epoch": 4.2063492063492065,
"grad_norm": 0.7479029235127206,
"learning_rate": 2.2700807618193393e-06,
"loss": 0.3416,
"mean_token_accuracy": 0.8783422014676034,
"num_tokens": 455894126.0,
"step": 530
},
{
"entropy": 0.40460205078125,
"epoch": 4.214285714285714,
"grad_norm": 0.746817252036699,
"learning_rate": 2.251778172041828e-06,
"loss": 0.3455,
"mean_token_accuracy": 0.8771996637806296,
"num_tokens": 456741345.0,
"step": 531
},
{
"entropy": 0.402862548828125,
"epoch": 4.222222222222222,
"grad_norm": 0.6876078945117027,
"learning_rate": 2.2335281963404315e-06,
"loss": 0.3501,
"mean_token_accuracy": 0.8753550541587174,
"num_tokens": 457597774.0,
"step": 532
},
{
"entropy": 0.3993682861328125,
"epoch": 4.23015873015873,
"grad_norm": 0.7282590241527106,
"learning_rate": 2.2153311841067438e-06,
"loss": 0.3442,
"mean_token_accuracy": 0.8769136122427881,
"num_tokens": 458481487.0,
"step": 533
},
{
"entropy": 0.3993377685546875,
"epoch": 4.238095238095238,
"grad_norm": 0.7553930529448756,
"learning_rate": 2.1971874837183914e-06,
"loss": 0.3458,
"mean_token_accuracy": 0.8748537562787533,
"num_tokens": 459329943.0,
"step": 534
},
{
"entropy": 0.3927154541015625,
"epoch": 4.246031746031746,
"grad_norm": 0.7591301726961953,
"learning_rate": 2.179097442532352e-06,
"loss": 0.3394,
"mean_token_accuracy": 0.8797525470145047,
"num_tokens": 460196696.0,
"step": 535
},
{
"entropy": 0.39837646484375,
"epoch": 4.253968253968254,
"grad_norm": 0.7626069369315728,
"learning_rate": 2.1610614068783112e-06,
"loss": 0.361,
"mean_token_accuracy": 0.8727622926235199,
"num_tokens": 461069051.0,
"step": 536
},
{
"entropy": 0.403106689453125,
"epoch": 4.261904761904762,
"grad_norm": 0.7708798701412393,
"learning_rate": 2.143079722052034e-06,
"loss": 0.3479,
"mean_token_accuracy": 0.8751750965602696,
"num_tokens": 461920899.0,
"step": 537
},
{
"entropy": 0.400177001953125,
"epoch": 4.26984126984127,
"grad_norm": 0.7389095619225245,
"learning_rate": 2.125152732308747e-06,
"loss": 0.3459,
"mean_token_accuracy": 0.8786491984501481,
"num_tokens": 462797643.0,
"step": 538
},
{
"entropy": 0.4058380126953125,
"epoch": 4.277777777777778,
"grad_norm": 0.665980057345897,
"learning_rate": 2.1072807808565547e-06,
"loss": 0.3501,
"mean_token_accuracy": 0.8757951087318361,
"num_tokens": 463640936.0,
"step": 539
},
{
"entropy": 0.40155029296875,
"epoch": 4.285714285714286,
"grad_norm": 0.7005429583368125,
"learning_rate": 2.0894642098498656e-06,
"loss": 0.3587,
"mean_token_accuracy": 0.8732366347685456,
"num_tokens": 464513012.0,
"step": 540
},
{
"entropy": 0.4015350341796875,
"epoch": 4.2936507936507935,
"grad_norm": 0.6682325297585425,
"learning_rate": 2.0717033603828436e-06,
"loss": 0.3485,
"mean_token_accuracy": 0.8750377274118364,
"num_tokens": 465345613.0,
"step": 541
},
{
"entropy": 0.4040985107421875,
"epoch": 4.301587301587301,
"grad_norm": 0.7424671938333351,
"learning_rate": 2.0539985724828736e-06,
"loss": 0.3498,
"mean_token_accuracy": 0.875401156488806,
"num_tokens": 466181756.0,
"step": 542
},
{
"entropy": 0.401275634765625,
"epoch": 4.309523809523809,
"grad_norm": 0.695999191273483,
"learning_rate": 2.0363501851040573e-06,
"loss": 0.3436,
"mean_token_accuracy": 0.8764086258597672,
"num_tokens": 467035382.0,
"step": 543
},
{
"entropy": 0.40155029296875,
"epoch": 4.317460317460317,
"grad_norm": 0.6827039903530154,
"learning_rate": 2.0187585361207174e-06,
"loss": 0.3466,
"mean_token_accuracy": 0.8745089964941144,
"num_tokens": 467897340.0,
"step": 544
},
{
"entropy": 0.400177001953125,
"epoch": 4.325396825396825,
"grad_norm": 0.685211307868124,
"learning_rate": 2.001223962320941e-06,
"loss": 0.3517,
"mean_token_accuracy": 0.8753441325388849,
"num_tokens": 468764096.0,
"step": 545
},
{
"entropy": 0.405242919921875,
"epoch": 4.333333333333333,
"grad_norm": 0.6682700395214807,
"learning_rate": 1.9837467994001165e-06,
"loss": 0.3457,
"mean_token_accuracy": 0.8773820898495615,
"num_tokens": 469610106.0,
"step": 546
},
{
"entropy": 0.399566650390625,
"epoch": 4.341269841269841,
"grad_norm": 0.6719545574593448,
"learning_rate": 1.9663273819545157e-06,
"loss": 0.3396,
"mean_token_accuracy": 0.8774642567150295,
"num_tokens": 470468046.0,
"step": 547
},
{
"entropy": 0.40380859375,
"epoch": 4.349206349206349,
"grad_norm": 0.7001669509560304,
"learning_rate": 1.948966043474889e-06,
"loss": 0.3458,
"mean_token_accuracy": 0.8756930027157068,
"num_tokens": 471309098.0,
"step": 548
},
{
"entropy": 0.396209716796875,
"epoch": 4.357142857142857,
"grad_norm": 0.75965811702668,
"learning_rate": 1.931663116340074e-06,
"loss": 0.3455,
"mean_token_accuracy": 0.8765083705075085,
"num_tokens": 472145738.0,
"step": 549
},
{
"entropy": 0.396392822265625,
"epoch": 4.365079365079365,
"grad_norm": 0.6756191619675378,
"learning_rate": 1.914418931810643e-06,
"loss": 0.3512,
"mean_token_accuracy": 0.8744937106966972,
"num_tokens": 473047197.0,
"step": 550
},
{
"entropy": 0.401031494140625,
"epoch": 4.3730158730158735,
"grad_norm": 0.6965894626329614,
"learning_rate": 1.8972338200225509e-06,
"loss": 0.3421,
"mean_token_accuracy": 0.8775716116651893,
"num_tokens": 473907585.0,
"step": 551
},
{
"entropy": 0.4000091552734375,
"epoch": 4.380952380952381,
"grad_norm": 0.7076324681120165,
"learning_rate": 1.880108109980815e-06,
"loss": 0.3462,
"mean_token_accuracy": 0.8761595580726862,
"num_tokens": 474779332.0,
"step": 552
},
{
"entropy": 0.39825439453125,
"epoch": 4.388888888888889,
"grad_norm": 0.7050720543139621,
"learning_rate": 1.8630421295532252e-06,
"loss": 0.345,
"mean_token_accuracy": 0.8770742062479258,
"num_tokens": 475659187.0,
"step": 553
},
{
"entropy": 0.403778076171875,
"epoch": 4.396825396825397,
"grad_norm": 0.7340183406802493,
"learning_rate": 1.8460362054640573e-06,
"loss": 0.3478,
"mean_token_accuracy": 0.8751401146873832,
"num_tokens": 476487458.0,
"step": 554
},
{
"entropy": 0.39990234375,
"epoch": 4.404761904761905,
"grad_norm": 0.6861105491926857,
"learning_rate": 1.8290906632878297e-06,
"loss": 0.3431,
"mean_token_accuracy": 0.8780268509872258,
"num_tokens": 477345662.0,
"step": 555
},
{
"entropy": 0.3991241455078125,
"epoch": 4.412698412698413,
"grad_norm": 0.7296898602599676,
"learning_rate": 1.8122058274430542e-06,
"loss": 0.3411,
"mean_token_accuracy": 0.8761810320429504,
"num_tokens": 478205977.0,
"step": 556
},
{
"entropy": 0.4037933349609375,
"epoch": 4.420634920634921,
"grad_norm": 1.0332210701383924,
"learning_rate": 1.7953820211860395e-06,
"loss": 0.356,
"mean_token_accuracy": 0.8737587067298591,
"num_tokens": 479048650.0,
"step": 557
},
{
"entropy": 0.3999481201171875,
"epoch": 4.428571428571429,
"grad_norm": 0.7091178286840939,
"learning_rate": 1.7786195666046935e-06,
"loss": 0.343,
"mean_token_accuracy": 0.8771154009737074,
"num_tokens": 479895873.0,
"step": 558
},
{
"entropy": 0.4032745361328125,
"epoch": 4.436507936507937,
"grad_norm": 0.6733078832793936,
"learning_rate": 1.7619187846123624e-06,
"loss": 0.3457,
"mean_token_accuracy": 0.8771998826414347,
"num_tokens": 480755429.0,
"step": 559
},
{
"entropy": 0.4007568359375,
"epoch": 4.444444444444445,
"grad_norm": 0.8490823775032588,
"learning_rate": 1.7452799949416833e-06,
"loss": 0.3517,
"mean_token_accuracy": 0.8754395125433803,
"num_tokens": 481608352.0,
"step": 560
},
{
"entropy": 0.4008026123046875,
"epoch": 4.4523809523809526,
"grad_norm": 0.7225303298169462,
"learning_rate": 1.7287035161384673e-06,
"loss": 0.35,
"mean_token_accuracy": 0.8747482905164361,
"num_tokens": 482441149.0,
"step": 561
},
{
"entropy": 0.4021148681640625,
"epoch": 4.4603174603174605,
"grad_norm": 0.6624423396335506,
"learning_rate": 1.7121896655555958e-06,
"loss": 0.347,
"mean_token_accuracy": 0.8763077296316624,
"num_tokens": 483307531.0,
"step": 562
},
{
"entropy": 0.4007720947265625,
"epoch": 4.468253968253968,
"grad_norm": 0.6783795851745674,
"learning_rate": 1.695738759346947e-06,
"loss": 0.3516,
"mean_token_accuracy": 0.8752468260936439,
"num_tokens": 484156689.0,
"step": 563
},
{
"entropy": 0.3984375,
"epoch": 4.476190476190476,
"grad_norm": 0.7230409362049561,
"learning_rate": 1.6793511124613455e-06,
"loss": 0.3405,
"mean_token_accuracy": 0.8779969648458064,
"num_tokens": 485003773.0,
"step": 564
},
{
"entropy": 0.4019317626953125,
"epoch": 4.484126984126984,
"grad_norm": 0.6858561278935235,
"learning_rate": 1.6630270386365288e-06,
"loss": 0.3462,
"mean_token_accuracy": 0.8767383908852935,
"num_tokens": 485834271.0,
"step": 565
},
{
"entropy": 0.4033966064453125,
"epoch": 4.492063492063492,
"grad_norm": 0.7715463405263099,
"learning_rate": 1.6467668503931432e-06,
"loss": 0.3406,
"mean_token_accuracy": 0.8790650884620845,
"num_tokens": 486676541.0,
"step": 566
},
{
"entropy": 0.3995513916015625,
"epoch": 4.5,
"grad_norm": 0.7299031695508553,
"learning_rate": 1.6305708590287616e-06,
"loss": 0.3413,
"mean_token_accuracy": 0.8776452434249222,
"num_tokens": 487533902.0,
"step": 567
},
{
"entropy": 0.39752197265625,
"epoch": 4.507936507936508,
"grad_norm": 0.7001696842835692,
"learning_rate": 1.6144393746119208e-06,
"loss": 0.3468,
"mean_token_accuracy": 0.8766471082344651,
"num_tokens": 488403340.0,
"step": 568
},
{
"entropy": 0.3946075439453125,
"epoch": 4.515873015873016,
"grad_norm": 0.6949363799298416,
"learning_rate": 1.5983727059761873e-06,
"loss": 0.3413,
"mean_token_accuracy": 0.8782787672244012,
"num_tokens": 489285650.0,
"step": 569
},
{
"entropy": 0.402496337890625,
"epoch": 4.523809523809524,
"grad_norm": 0.6662573552334149,
"learning_rate": 1.5823711607142428e-06,
"loss": 0.3448,
"mean_token_accuracy": 0.876647824421525,
"num_tokens": 490146251.0,
"step": 570
},
{
"entropy": 0.3963165283203125,
"epoch": 4.531746031746032,
"grad_norm": 0.6722490242200185,
"learning_rate": 1.5664350451720022e-06,
"loss": 0.3343,
"mean_token_accuracy": 0.8809215794317424,
"num_tokens": 490981639.0,
"step": 571
},
{
"entropy": 0.401947021484375,
"epoch": 4.5396825396825395,
"grad_norm": 0.7667827684007154,
"learning_rate": 1.5505646644427375e-06,
"loss": 0.3443,
"mean_token_accuracy": 0.8768278043717146,
"num_tokens": 491819855.0,
"step": 572
},
{
"entropy": 0.4046630859375,
"epoch": 4.5476190476190474,
"grad_norm": 0.7217844340085546,
"learning_rate": 1.5347603223612462e-06,
"loss": 0.3453,
"mean_token_accuracy": 0.8769222623668611,
"num_tokens": 492664773.0,
"step": 573
},
{
"entropy": 0.396392822265625,
"epoch": 4.555555555555555,
"grad_norm": 0.6828293087400851,
"learning_rate": 1.5190223214980286e-06,
"loss": 0.3425,
"mean_token_accuracy": 0.876984007190913,
"num_tokens": 493538855.0,
"step": 574
},
{
"entropy": 0.3953704833984375,
"epoch": 4.563492063492063,
"grad_norm": 0.6985094822557292,
"learning_rate": 1.5033509631534986e-06,
"loss": 0.3481,
"mean_token_accuracy": 0.8754701013676822,
"num_tokens": 494419834.0,
"step": 575
},
{
"entropy": 0.40057373046875,
"epoch": 4.571428571428571,
"grad_norm": 0.7055750733602428,
"learning_rate": 1.4877465473522178e-06,
"loss": 0.3449,
"mean_token_accuracy": 0.8770850743167102,
"num_tokens": 495279672.0,
"step": 576
},
{
"entropy": 0.3951416015625,
"epoch": 4.579365079365079,
"grad_norm": 0.6964133064600199,
"learning_rate": 1.4722093728371427e-06,
"loss": 0.3513,
"mean_token_accuracy": 0.874992523342371,
"num_tokens": 496156072.0,
"step": 577
},
{
"entropy": 0.40093994140625,
"epoch": 4.587301587301587,
"grad_norm": 0.6585867710192563,
"learning_rate": 1.4567397370639158e-06,
"loss": 0.3481,
"mean_token_accuracy": 0.8771389788016677,
"num_tokens": 497013976.0,
"step": 578
},
{
"entropy": 0.400543212890625,
"epoch": 4.595238095238095,
"grad_norm": 0.6695268179646108,
"learning_rate": 1.4413379361951596e-06,
"loss": 0.3424,
"mean_token_accuracy": 0.8771733501926064,
"num_tokens": 497869587.0,
"step": 579
},
{
"entropy": 0.4037933349609375,
"epoch": 4.603174603174603,
"grad_norm": 0.7406939877102566,
"learning_rate": 1.4260042650948187e-06,
"loss": 0.3427,
"mean_token_accuracy": 0.8756671342998743,
"num_tokens": 498692858.0,
"step": 580
},
{
"entropy": 0.3985595703125,
"epoch": 4.611111111111111,
"grad_norm": 0.7048045979371886,
"learning_rate": 1.4107390173225045e-06,
"loss": 0.3469,
"mean_token_accuracy": 0.8772797528654337,
"num_tokens": 499558825.0,
"step": 581
},
{
"entropy": 0.4054718017578125,
"epoch": 4.619047619047619,
"grad_norm": 0.6900885423977635,
"learning_rate": 1.395542485127886e-06,
"loss": 0.3408,
"mean_token_accuracy": 0.878491104580462,
"num_tokens": 500399881.0,
"step": 582
},
{
"entropy": 0.4000701904296875,
"epoch": 4.6269841269841265,
"grad_norm": 0.6551912767795579,
"learning_rate": 1.3804149594450816e-06,
"loss": 0.3402,
"mean_token_accuracy": 0.8797827651724219,
"num_tokens": 501277242.0,
"step": 583
},
{
"entropy": 0.392608642578125,
"epoch": 4.634920634920634,
"grad_norm": 0.6872125661896025,
"learning_rate": 1.365356729887099e-06,
"loss": 0.3415,
"mean_token_accuracy": 0.8778949431143701,
"num_tokens": 502175769.0,
"step": 584
},
{
"entropy": 0.4004364013671875,
"epoch": 4.642857142857143,
"grad_norm": 0.6906654407257142,
"learning_rate": 1.3503680847402868e-06,
"loss": 0.3375,
"mean_token_accuracy": 0.879074421711266,
"num_tokens": 503037907.0,
"step": 585
},
{
"entropy": 0.3996429443359375,
"epoch": 4.650793650793651,
"grad_norm": 0.7240342341344183,
"learning_rate": 1.3354493109588145e-06,
"loss": 0.343,
"mean_token_accuracy": 0.8791590658947825,
"num_tokens": 503882004.0,
"step": 586
},
{
"entropy": 0.3933563232421875,
"epoch": 4.658730158730159,
"grad_norm": 0.6759947923545749,
"learning_rate": 1.320600694159185e-06,
"loss": 0.3418,
"mean_token_accuracy": 0.8785993568599224,
"num_tokens": 504761280.0,
"step": 587
},
{
"entropy": 0.3975372314453125,
"epoch": 4.666666666666667,
"grad_norm": 0.6810332156222548,
"learning_rate": 1.3058225186147572e-06,
"loss": 0.3419,
"mean_token_accuracy": 0.8782242434099317,
"num_tokens": 505628924.0,
"step": 588
},
{
"entropy": 0.3953857421875,
"epoch": 4.674603174603175,
"grad_norm": 0.6952323670957825,
"learning_rate": 1.2911150672503098e-06,
"loss": 0.3349,
"mean_token_accuracy": 0.8792746933177114,
"num_tokens": 506483264.0,
"step": 589
},
{
"entropy": 0.400146484375,
"epoch": 4.682539682539683,
"grad_norm": 0.6615786248003038,
"learning_rate": 1.2764786216366236e-06,
"loss": 0.342,
"mean_token_accuracy": 0.8765605296939611,
"num_tokens": 507337582.0,
"step": 590
},
{
"entropy": 0.3936614990234375,
"epoch": 4.690476190476191,
"grad_norm": 0.6423682264116367,
"learning_rate": 1.2619134619850908e-06,
"loss": 0.3403,
"mean_token_accuracy": 0.8784746997989714,
"num_tokens": 508222543.0,
"step": 591
},
{
"entropy": 0.40179443359375,
"epoch": 4.698412698412699,
"grad_norm": 0.7179320235597545,
"learning_rate": 1.2474198671423493e-06,
"loss": 0.3439,
"mean_token_accuracy": 0.8781091058626771,
"num_tokens": 509077470.0,
"step": 592
},
{
"entropy": 0.4020538330078125,
"epoch": 4.7063492063492065,
"grad_norm": 0.6640568389501444,
"learning_rate": 1.2329981145849468e-06,
"loss": 0.345,
"mean_token_accuracy": 0.8776707421056926,
"num_tokens": 509934412.0,
"step": 593
},
{
"entropy": 0.3987274169921875,
"epoch": 4.714285714285714,
"grad_norm": 0.6641554979878878,
"learning_rate": 1.2186484804140242e-06,
"loss": 0.333,
"mean_token_accuracy": 0.8802338382229209,
"num_tokens": 510796655.0,
"step": 594
},
{
"entropy": 0.3936614990234375,
"epoch": 4.722222222222222,
"grad_norm": 0.7311800509678725,
"learning_rate": 1.2043712393500355e-06,
"loss": 0.3465,
"mean_token_accuracy": 0.876534974668175,
"num_tokens": 511666478.0,
"step": 595
},
{
"entropy": 0.3982086181640625,
"epoch": 4.73015873015873,
"grad_norm": 0.654011664415763,
"learning_rate": 1.1901666647274823e-06,
"loss": 0.336,
"mean_token_accuracy": 0.8799294792115688,
"num_tokens": 512547249.0,
"step": 596
},
{
"entropy": 0.4018096923828125,
"epoch": 4.738095238095238,
"grad_norm": 0.7906669325474568,
"learning_rate": 1.1760350284896876e-06,
"loss": 0.3423,
"mean_token_accuracy": 0.8780363285914063,
"num_tokens": 513406924.0,
"step": 597
},
{
"entropy": 0.394073486328125,
"epoch": 4.746031746031746,
"grad_norm": 0.6272944465027679,
"learning_rate": 1.1619766011835832e-06,
"loss": 0.3351,
"mean_token_accuracy": 0.8792783697135746,
"num_tokens": 514278305.0,
"step": 598
},
{
"entropy": 0.3991546630859375,
"epoch": 4.753968253968254,
"grad_norm": 0.7028608754920164,
"learning_rate": 1.1479916519545326e-06,
"loss": 0.3381,
"mean_token_accuracy": 0.8802824383601546,
"num_tokens": 515127083.0,
"step": 599
},
{
"entropy": 0.3970489501953125,
"epoch": 4.761904761904762,
"grad_norm": 0.7415718014919481,
"learning_rate": 1.1340804485411783e-06,
"loss": 0.3494,
"mean_token_accuracy": 0.8775360365398228,
"num_tokens": 515982781.0,
"step": 600
},
{
"entropy": 0.400482177734375,
"epoch": 4.76984126984127,
"grad_norm": 0.6506891630519459,
"learning_rate": 1.1202432572703176e-06,
"loss": 0.3348,
"mean_token_accuracy": 0.879584884736687,
"num_tokens": 516838578.0,
"step": 601
},
{
"entropy": 0.398193359375,
"epoch": 4.777777777777778,
"grad_norm": 0.6609399081506822,
"learning_rate": 1.1064803430518002e-06,
"loss": 0.3403,
"mean_token_accuracy": 0.8773757833987474,
"num_tokens": 517695973.0,
"step": 602
},
{
"entropy": 0.3961334228515625,
"epoch": 4.785714285714286,
"grad_norm": 0.6536270048247466,
"learning_rate": 1.0927919693734618e-06,
"loss": 0.3403,
"mean_token_accuracy": 0.8781040622852743,
"num_tokens": 518570319.0,
"step": 603
},
{
"entropy": 0.398956298828125,
"epoch": 4.7936507936507935,
"grad_norm": 0.6661437239536121,
"learning_rate": 1.0791783982960736e-06,
"loss": 0.3429,
"mean_token_accuracy": 0.8768606032244861,
"num_tokens": 519417148.0,
"step": 604
},
{
"entropy": 0.399383544921875,
"epoch": 4.801587301587301,
"grad_norm": 0.6697036401243884,
"learning_rate": 1.0656398904483312e-06,
"loss": 0.3459,
"mean_token_accuracy": 0.8781998874619603,
"num_tokens": 520284498.0,
"step": 605
},
{
"entropy": 0.3967742919921875,
"epoch": 4.809523809523809,
"grad_norm": 0.6448494446348442,
"learning_rate": 1.0521767050218562e-06,
"loss": 0.3453,
"mean_token_accuracy": 0.8755287849344313,
"num_tokens": 521161180.0,
"step": 606
},
{
"entropy": 0.4019927978515625,
"epoch": 4.817460317460317,
"grad_norm": 0.695391933649051,
"learning_rate": 1.0387890997662443e-06,
"loss": 0.3338,
"mean_token_accuracy": 0.8791229757480323,
"num_tokens": 522018351.0,
"step": 607
},
{
"entropy": 0.396759033203125,
"epoch": 4.825396825396825,
"grad_norm": 0.6885741043618135,
"learning_rate": 1.0254773309841277e-06,
"loss": 0.3452,
"mean_token_accuracy": 0.8766398807056248,
"num_tokens": 522908445.0,
"step": 608
},
{
"entropy": 0.4028167724609375,
"epoch": 4.833333333333333,
"grad_norm": 0.6824971060967869,
"learning_rate": 1.012241653526263e-06,
"loss": 0.3381,
"mean_token_accuracy": 0.8785937232896686,
"num_tokens": 523761885.0,
"step": 609
},
{
"entropy": 0.3957977294921875,
"epoch": 4.841269841269841,
"grad_norm": 0.6556820486631832,
"learning_rate": 9.990823207866578e-07,
"loss": 0.3431,
"mean_token_accuracy": 0.8786509921774268,
"num_tokens": 524634392.0,
"step": 610
},
{
"entropy": 0.3975067138671875,
"epoch": 4.849206349206349,
"grad_norm": 0.6634339315868009,
"learning_rate": 9.85999584697716e-07,
"loss": 0.3458,
"mean_token_accuracy": 0.8759105852805078,
"num_tokens": 525481711.0,
"step": 611
},
{
"entropy": 0.3944549560546875,
"epoch": 4.857142857142857,
"grad_norm": 0.6633851252480073,
"learning_rate": 9.729936957254165e-07,
"loss": 0.3348,
"mean_token_accuracy": 0.8805793649517,
"num_tokens": 526350562.0,
"step": 612
},
{
"entropy": 0.3972015380859375,
"epoch": 4.865079365079366,
"grad_norm": 0.6809718096324041,
"learning_rate": 9.600649028645215e-07,
"loss": 0.3411,
"mean_token_accuracy": 0.877831466961652,
"num_tokens": 527208722.0,
"step": 613
},
{
"entropy": 0.3993377685546875,
"epoch": 4.8730158730158735,
"grad_norm": 0.7413910925997623,
"learning_rate": 9.472134536338007e-07,
"loss": 0.3348,
"mean_token_accuracy": 0.8798928018659353,
"num_tokens": 528070537.0,
"step": 614
},
{
"entropy": 0.3999786376953125,
"epoch": 4.880952380952381,
"grad_norm": 0.6404430937655207,
"learning_rate": 9.344395940713009e-07,
"loss": 0.3482,
"mean_token_accuracy": 0.8766381270252168,
"num_tokens": 528925481.0,
"step": 615
},
{
"entropy": 0.4001312255859375,
"epoch": 4.888888888888889,
"grad_norm": 0.642942870387289,
"learning_rate": 9.217435687296305e-07,
"loss": 0.3388,
"mean_token_accuracy": 0.8791137794032693,
"num_tokens": 529762293.0,
"step": 616
},
{
"entropy": 0.3968963623046875,
"epoch": 4.896825396825397,
"grad_norm": 0.6411974157903991,
"learning_rate": 9.091256206712812e-07,
"loss": 0.3398,
"mean_token_accuracy": 0.8778149662539363,
"num_tokens": 530625692.0,
"step": 617
},
{
"entropy": 0.397918701171875,
"epoch": 4.904761904761905,
"grad_norm": 0.6795080852701798,
"learning_rate": 8.965859914639724e-07,
"loss": 0.3458,
"mean_token_accuracy": 0.8769173468463123,
"num_tokens": 531481363.0,
"step": 618
},
{
"entropy": 0.399322509765625,
"epoch": 4.912698412698413,
"grad_norm": 0.7268654576820524,
"learning_rate": 8.841249211760272e-07,
"loss": 0.3401,
"mean_token_accuracy": 0.8781247353181243,
"num_tokens": 532334443.0,
"step": 619
},
{
"entropy": 0.39544677734375,
"epoch": 4.920634920634921,
"grad_norm": 0.6991042720171687,
"learning_rate": 8.717426483717762e-07,
"loss": 0.3474,
"mean_token_accuracy": 0.8754720254801214,
"num_tokens": 533215439.0,
"step": 620
},
{
"entropy": 0.39813232421875,
"epoch": 4.928571428571429,
"grad_norm": 0.6420905529792567,
"learning_rate": 8.594394101069897e-07,
"loss": 0.3449,
"mean_token_accuracy": 0.876462968531996,
"num_tokens": 534086645.0,
"step": 621
},
{
"entropy": 0.3993988037109375,
"epoch": 4.936507936507937,
"grad_norm": 0.634959288865702,
"learning_rate": 8.472154419243411e-07,
"loss": 0.3422,
"mean_token_accuracy": 0.8784619648940861,
"num_tokens": 534968024.0,
"step": 622
},
{
"entropy": 0.3939361572265625,
"epoch": 4.944444444444445,
"grad_norm": 0.6525948648538159,
"learning_rate": 8.350709778488941e-07,
"loss": 0.3433,
"mean_token_accuracy": 0.878953296225518,
"num_tokens": 535858097.0,
"step": 623
},
{
"entropy": 0.3915557861328125,
"epoch": 4.9523809523809526,
"grad_norm": 0.6338378330558326,
"learning_rate": 8.230062503836278e-07,
"loss": 0.3403,
"mean_token_accuracy": 0.8782070642337203,
"num_tokens": 536754856.0,
"step": 624
},
{
"entropy": 0.3979949951171875,
"epoch": 4.9603174603174605,
"grad_norm": 0.6515049678243575,
"learning_rate": 8.110214905049802e-07,
"loss": 0.3447,
"mean_token_accuracy": 0.8780298097990453,
"num_tokens": 537612880.0,
"step": 625
},
{
"entropy": 0.39752197265625,
"epoch": 4.968253968253968,
"grad_norm": 0.6537688062696074,
"learning_rate": 7.991169276584281e-07,
"loss": 0.3383,
"mean_token_accuracy": 0.8791689327917993,
"num_tokens": 538459827.0,
"step": 626
},
{
"entropy": 0.4014129638671875,
"epoch": 4.976190476190476,
"grad_norm": 0.6761627263761556,
"learning_rate": 7.872927897540944e-07,
"loss": 0.3349,
"mean_token_accuracy": 0.8803192311897874,
"num_tokens": 539280570.0,
"step": 627
},
{
"entropy": 0.4014129638671875,
"epoch": 4.984126984126984,
"grad_norm": 0.6783420727462287,
"learning_rate": 7.75549303162384e-07,
"loss": 0.3441,
"mean_token_accuracy": 0.8776299306191504,
"num_tokens": 540103344.0,
"step": 628
},
{
"entropy": 0.3983917236328125,
"epoch": 4.992063492063492,
"grad_norm": 0.6003292581738014,
"learning_rate": 7.638866927096555e-07,
"loss": 0.3384,
"mean_token_accuracy": 0.8786575449630618,
"num_tokens": 540983421.0,
"step": 629
},
{
"entropy": 0.3915557861328125,
"epoch": 5.0,
"grad_norm": 0.6132257577458607,
"learning_rate": 7.523051816739074e-07,
"loss": 0.3407,
"mean_token_accuracy": 0.8787228460423648,
"num_tokens": 541874593.0,
"step": 630
},
{
"entropy": 0.3969573974609375,
"epoch": 5.007936507936508,
"grad_norm": 0.6874008020158511,
"learning_rate": 7.408049917805104e-07,
"loss": 0.3324,
"mean_token_accuracy": 0.8809576267376542,
"num_tokens": 542746900.0,
"step": 631
},
{
"entropy": 0.3990631103515625,
"epoch": 5.015873015873016,
"grad_norm": 0.6533341249855773,
"learning_rate": 7.293863431979619e-07,
"loss": 0.3325,
"mean_token_accuracy": 0.8817571788094938,
"num_tokens": 543607351.0,
"step": 632
},
{
"entropy": 0.4006195068359375,
"epoch": 5.023809523809524,
"grad_norm": 0.6743032453873651,
"learning_rate": 7.180494545336642e-07,
"loss": 0.3252,
"mean_token_accuracy": 0.8821277469396591,
"num_tokens": 544429814.0,
"step": 633
},
{
"entropy": 0.392242431640625,
"epoch": 5.031746031746032,
"grad_norm": 0.6491407357261534,
"learning_rate": 7.067945428297524e-07,
"loss": 0.329,
"mean_token_accuracy": 0.8841568692587316,
"num_tokens": 545293765.0,
"step": 634
},
{
"entropy": 0.392059326171875,
"epoch": 5.0396825396825395,
"grad_norm": 0.6509793206709636,
"learning_rate": 6.956218235589263e-07,
"loss": 0.327,
"mean_token_accuracy": 0.8831868241541088,
"num_tokens": 546166241.0,
"step": 635
},
{
"entropy": 0.394683837890625,
"epoch": 5.0476190476190474,
"grad_norm": 0.6965451021310177,
"learning_rate": 6.845315106203327e-07,
"loss": 0.3192,
"mean_token_accuracy": 0.8857454406097531,
"num_tokens": 547008292.0,
"step": 636
},
{
"entropy": 0.3927764892578125,
"epoch": 5.055555555555555,
"grad_norm": 0.7299760861326686,
"learning_rate": 6.735238163354669e-07,
"loss": 0.329,
"mean_token_accuracy": 0.8830623761750758,
"num_tokens": 547881345.0,
"step": 637
},
{
"entropy": 0.393829345703125,
"epoch": 5.063492063492063,
"grad_norm": 0.692591071370849,
"learning_rate": 6.625989514441089e-07,
"loss": 0.3263,
"mean_token_accuracy": 0.8835366195999086,
"num_tokens": 548753550.0,
"step": 638
},
{
"entropy": 0.3939666748046875,
"epoch": 5.071428571428571,
"grad_norm": 0.6832435052031313,
"learning_rate": 6.517571251002896e-07,
"loss": 0.3274,
"mean_token_accuracy": 0.8831272819079459,
"num_tokens": 549614624.0,
"step": 639
},
{
"entropy": 0.39398193359375,
"epoch": 5.079365079365079,
"grad_norm": 0.6839637384434816,
"learning_rate": 6.40998544868287e-07,
"loss": 0.3168,
"mean_token_accuracy": 0.8865264924243093,
"num_tokens": 550450538.0,
"step": 640
},
{
"entropy": 0.3957672119140625,
"epoch": 5.087301587301587,
"grad_norm": 0.6628392349359786,
"learning_rate": 6.3032341671865e-07,
"loss": 0.3251,
"mean_token_accuracy": 0.8840126856230199,
"num_tokens": 551311533.0,
"step": 641
},
{
"entropy": 0.3985137939453125,
"epoch": 5.095238095238095,
"grad_norm": 0.6552020896616739,
"learning_rate": 6.197319450242562e-07,
"loss": 0.3219,
"mean_token_accuracy": 0.8848442859016359,
"num_tokens": 552168363.0,
"step": 642
},
{
"entropy": 0.3944549560546875,
"epoch": 5.103174603174603,
"grad_norm": 0.6459530624061904,
"learning_rate": 6.092243325564007e-07,
"loss": 0.3254,
"mean_token_accuracy": 0.885173340793699,
"num_tokens": 553049237.0,
"step": 643
},
{
"entropy": 0.396759033203125,
"epoch": 5.111111111111111,
"grad_norm": 0.6485251490072942,
"learning_rate": 5.98800780480912e-07,
"loss": 0.3345,
"mean_token_accuracy": 0.8795099183917046,
"num_tokens": 553918684.0,
"step": 644
},
{
"entropy": 0.3992462158203125,
"epoch": 5.119047619047619,
"grad_norm": 0.682884688994534,
"learning_rate": 5.884614883543027e-07,
"loss": 0.3294,
"mean_token_accuracy": 0.8827598677016795,
"num_tokens": 554768021.0,
"step": 645
},
{
"entropy": 0.3957366943359375,
"epoch": 5.1269841269841265,
"grad_norm": 0.6622701991206025,
"learning_rate": 5.782066541199471e-07,
"loss": 0.3201,
"mean_token_accuracy": 0.8856850513257086,
"num_tokens": 555611362.0,
"step": 646
},
{
"entropy": 0.397430419921875,
"epoch": 5.134920634920635,
"grad_norm": 0.6494881440989898,
"learning_rate": 5.680364741042926e-07,
"loss": 0.3308,
"mean_token_accuracy": 0.8822413263842463,
"num_tokens": 556476117.0,
"step": 647
},
{
"entropy": 0.39849853515625,
"epoch": 5.142857142857143,
"grad_norm": 0.6399277286004064,
"learning_rate": 5.579511430131018e-07,
"loss": 0.3262,
"mean_token_accuracy": 0.8843574924394488,
"num_tokens": 557321161.0,
"step": 648
},
{
"entropy": 0.3938751220703125,
"epoch": 5.150793650793651,
"grad_norm": 0.6414200561803504,
"learning_rate": 5.479508539277229e-07,
"loss": 0.3262,
"mean_token_accuracy": 0.8831641948781908,
"num_tokens": 558195818.0,
"step": 649
},
{
"entropy": 0.3946533203125,
"epoch": 5.158730158730159,
"grad_norm": 0.6400500966934808,
"learning_rate": 5.380357983013962e-07,
"loss": 0.3247,
"mean_token_accuracy": 0.8846798562444746,
"num_tokens": 559060077.0,
"step": 650
},
{
"entropy": 0.3952484130859375,
"epoch": 5.166666666666667,
"grad_norm": 0.665030112678579,
"learning_rate": 5.282061659555854e-07,
"loss": 0.3306,
"mean_token_accuracy": 0.8817098373547196,
"num_tokens": 559919625.0,
"step": 651
},
{
"entropy": 0.3953704833984375,
"epoch": 5.174603174603175,
"grad_norm": 0.6442855912711039,
"learning_rate": 5.184621450763455e-07,
"loss": 0.3286,
"mean_token_accuracy": 0.8841330683790147,
"num_tokens": 560767619.0,
"step": 652
},
{
"entropy": 0.3937225341796875,
"epoch": 5.182539682539683,
"grad_norm": 0.711237148073599,
"learning_rate": 5.088039222107205e-07,
"loss": 0.3317,
"mean_token_accuracy": 0.8824787489138544,
"num_tokens": 561614252.0,
"step": 653
},
{
"entropy": 0.3915252685546875,
"epoch": 5.190476190476191,
"grad_norm": 0.664655406106165,
"learning_rate": 4.992316822631693e-07,
"loss": 0.3247,
"mean_token_accuracy": 0.883714787196368,
"num_tokens": 562479272.0,
"step": 654
},
{
"entropy": 0.3939361572265625,
"epoch": 5.198412698412699,
"grad_norm": 0.6423893808843507,
"learning_rate": 4.897456084920282e-07,
"loss": 0.3233,
"mean_token_accuracy": 0.8836758630350232,
"num_tokens": 563325903.0,
"step": 655
},
{
"entropy": 0.391815185546875,
"epoch": 5.2063492063492065,
"grad_norm": 0.609576579332021,
"learning_rate": 4.803458825060042e-07,
"loss": 0.3234,
"mean_token_accuracy": 0.8831925024278462,
"num_tokens": 564204898.0,
"step": 656
},
{
"entropy": 0.3961944580078125,
"epoch": 5.214285714285714,
"grad_norm": 0.6355056690611225,
"learning_rate": 4.710326842606927e-07,
"loss": 0.3209,
"mean_token_accuracy": 0.8843817953020334,
"num_tokens": 565054657.0,
"step": 657
},
{
"entropy": 0.3915863037109375,
"epoch": 5.222222222222222,
"grad_norm": 0.6715411599255142,
"learning_rate": 4.618061920551381e-07,
"loss": 0.3268,
"mean_token_accuracy": 0.8825240274891257,
"num_tokens": 565926348.0,
"step": 658
},
{
"entropy": 0.3952484130859375,
"epoch": 5.23015873015873,
"grad_norm": 0.6474113419989475,
"learning_rate": 4.526665825284132e-07,
"loss": 0.3344,
"mean_token_accuracy": 0.8826401890255511,
"num_tokens": 566799563.0,
"step": 659
},
{
"entropy": 0.393402099609375,
"epoch": 5.238095238095238,
"grad_norm": 0.6520320734154383,
"learning_rate": 4.4361403065624475e-07,
"loss": 0.3283,
"mean_token_accuracy": 0.881974630523473,
"num_tokens": 567686904.0,
"step": 660
},
{
"entropy": 0.3928680419921875,
"epoch": 5.246031746031746,
"grad_norm": 0.6254784285922858,
"learning_rate": 4.3464870974766314e-07,
"loss": 0.3299,
"mean_token_accuracy": 0.882287971675396,
"num_tokens": 568563817.0,
"step": 661
},
{
"entropy": 0.396759033203125,
"epoch": 5.253968253968254,
"grad_norm": 0.6524894750875436,
"learning_rate": 4.257707914416781e-07,
"loss": 0.319,
"mean_token_accuracy": 0.8853690237738192,
"num_tokens": 569412950.0,
"step": 662
},
{
"entropy": 0.390960693359375,
"epoch": 5.261904761904762,
"grad_norm": 0.6550596337749973,
"learning_rate": 4.169804457039972e-07,
"loss": 0.3281,
"mean_token_accuracy": 0.8837977671064436,
"num_tokens": 570290370.0,
"step": 663
},
{
"entropy": 0.39520263671875,
"epoch": 5.26984126984127,
"grad_norm": 0.6599640261368089,
"learning_rate": 4.082778408237731e-07,
"loss": 0.3312,
"mean_token_accuracy": 0.8819447602145374,
"num_tokens": 571139089.0,
"step": 664
},
{
"entropy": 0.3977203369140625,
"epoch": 5.277777777777778,
"grad_norm": 0.6325202825006885,
"learning_rate": 3.996631434103776e-07,
"loss": 0.3216,
"mean_token_accuracy": 0.8842552327550948,
"num_tokens": 571974486.0,
"step": 665
},
{
"entropy": 0.3957366943359375,
"epoch": 5.285714285714286,
"grad_norm": 0.629795019163263,
"learning_rate": 3.911365183902166e-07,
"loss": 0.3244,
"mean_token_accuracy": 0.883813981898129,
"num_tokens": 572833941.0,
"step": 666
},
{
"entropy": 0.393707275390625,
"epoch": 5.2936507936507935,
"grad_norm": 0.6615713948839467,
"learning_rate": 3.826981290035692e-07,
"loss": 0.3358,
"mean_token_accuracy": 0.8800787003710866,
"num_tokens": 573696025.0,
"step": 667
},
{
"entropy": 0.3954620361328125,
"epoch": 5.301587301587301,
"grad_norm": 0.6448584761089627,
"learning_rate": 3.7434813680146234e-07,
"loss": 0.3258,
"mean_token_accuracy": 0.8832776751369238,
"num_tokens": 574541353.0,
"step": 668
},
{
"entropy": 0.3994140625,
"epoch": 5.309523809523809,
"grad_norm": 0.6253174175232413,
"learning_rate": 3.6608670164258065e-07,
"loss": 0.328,
"mean_token_accuracy": 0.8827647585421801,
"num_tokens": 575378417.0,
"step": 669
},
{
"entropy": 0.397003173828125,
"epoch": 5.317460317460317,
"grad_norm": 0.6543163664275344,
"learning_rate": 3.5791398169020384e-07,
"loss": 0.3223,
"mean_token_accuracy": 0.8838555999100208,
"num_tokens": 576216763.0,
"step": 670
},
{
"entropy": 0.3980865478515625,
"epoch": 5.325396825396825,
"grad_norm": 0.6727975870724803,
"learning_rate": 3.4983013340918024e-07,
"loss": 0.3319,
"mean_token_accuracy": 0.881680119317025,
"num_tokens": 577068285.0,
"step": 671
},
{
"entropy": 0.397216796875,
"epoch": 5.333333333333333,
"grad_norm": 0.6908497952151735,
"learning_rate": 3.4183531156292913e-07,
"loss": 0.3199,
"mean_token_accuracy": 0.8852394479326904,
"num_tokens": 577910888.0,
"step": 672
},
{
"entropy": 0.3982391357421875,
"epoch": 5.341269841269841,
"grad_norm": 0.6271777851924285,
"learning_rate": 3.3392966921047984e-07,
"loss": 0.3323,
"mean_token_accuracy": 0.8813259471207857,
"num_tokens": 578760857.0,
"step": 673
},
{
"entropy": 0.39141845703125,
"epoch": 5.349206349206349,
"grad_norm": 0.6420853255417144,
"learning_rate": 3.261133577035408e-07,
"loss": 0.3276,
"mean_token_accuracy": 0.8815803048200905,
"num_tokens": 579639850.0,
"step": 674
},
{
"entropy": 0.39239501953125,
"epoch": 5.357142857142857,
"grad_norm": 0.6400029343318571,
"learning_rate": 3.1838652668360173e-07,
"loss": 0.3208,
"mean_token_accuracy": 0.8846969213336706,
"num_tokens": 580506527.0,
"step": 675
},
{
"entropy": 0.4005584716796875,
"epoch": 5.365079365079365,
"grad_norm": 0.6337302117382984,
"learning_rate": 3.1074932407906823e-07,
"loss": 0.3313,
"mean_token_accuracy": 0.8819033140316606,
"num_tokens": 581347475.0,
"step": 676
},
{
"entropy": 0.39678955078125,
"epoch": 5.3730158730158735,
"grad_norm": 0.6531503190443603,
"learning_rate": 3.0320189610243303e-07,
"loss": 0.3226,
"mean_token_accuracy": 0.8830904331989586,
"num_tokens": 582201245.0,
"step": 677
},
{
"entropy": 0.39251708984375,
"epoch": 5.380952380952381,
"grad_norm": 0.6298282608886412,
"learning_rate": 2.957443872474713e-07,
"loss": 0.3249,
"mean_token_accuracy": 0.8838722719810903,
"num_tokens": 583076088.0,
"step": 678
},
{
"entropy": 0.3914947509765625,
"epoch": 5.388888888888889,
"grad_norm": 0.6217255913819979,
"learning_rate": 2.883769402864789e-07,
"loss": 0.3235,
"mean_token_accuracy": 0.8837307607755065,
"num_tokens": 583938273.0,
"step": 679
},
{
"entropy": 0.3894500732421875,
"epoch": 5.396825396825397,
"grad_norm": 0.9295484786245685,
"learning_rate": 2.810996962675361e-07,
"loss": 0.3289,
"mean_token_accuracy": 0.8828169428743422,
"num_tokens": 584828853.0,
"step": 680
},
{
"entropy": 0.39373779296875,
"epoch": 5.404761904761905,
"grad_norm": 0.6460727447308977,
"learning_rate": 2.739127945118092e-07,
"loss": 0.3332,
"mean_token_accuracy": 0.8817042661830783,
"num_tokens": 585681013.0,
"step": 681
},
{
"entropy": 0.3941650390625,
"epoch": 5.412698412698413,
"grad_norm": 0.6371402076274487,
"learning_rate": 2.668163726108841e-07,
"loss": 0.3294,
"mean_token_accuracy": 0.8820854951627553,
"num_tokens": 586567674.0,
"step": 682
},
{
"entropy": 0.3912506103515625,
"epoch": 5.420634920634921,
"grad_norm": 0.6599817270342999,
"learning_rate": 2.5981056642412796e-07,
"loss": 0.3274,
"mean_token_accuracy": 0.884115984197706,
"num_tokens": 587445470.0,
"step": 683
},
{
"entropy": 0.3984527587890625,
"epoch": 5.428571428571429,
"grad_norm": 0.6190960844327497,
"learning_rate": 2.528955100760938e-07,
"loss": 0.3225,
"mean_token_accuracy": 0.8846618658863008,
"num_tokens": 588268783.0,
"step": 684
},
{
"entropy": 0.396881103515625,
"epoch": 5.436507936507937,
"grad_norm": 0.6277108055718937,
"learning_rate": 2.460713359539474e-07,
"loss": 0.3247,
"mean_token_accuracy": 0.8857448240742087,
"num_tokens": 589106246.0,
"step": 685
},
{
"entropy": 0.393524169921875,
"epoch": 5.444444444444445,
"grad_norm": 0.6346008314831126,
"learning_rate": 2.3933817470493445e-07,
"loss": 0.319,
"mean_token_accuracy": 0.8859792477451265,
"num_tokens": 589927765.0,
"step": 686
},
{
"entropy": 0.396209716796875,
"epoch": 5.4523809523809526,
"grad_norm": 0.6418782134867133,
"learning_rate": 2.3269615523388355e-07,
"loss": 0.3276,
"mean_token_accuracy": 0.8844058201648295,
"num_tokens": 590768483.0,
"step": 687
},
{
"entropy": 0.393310546875,
"epoch": 5.4603174603174605,
"grad_norm": 0.6145408122390106,
"learning_rate": 2.2614540470073276e-07,
"loss": 0.3276,
"mean_token_accuracy": 0.8833085368387401,
"num_tokens": 591653011.0,
"step": 688
},
{
"entropy": 0.391082763671875,
"epoch": 5.468253968253968,
"grad_norm": 0.6290181808829338,
"learning_rate": 2.1968604851809738e-07,
"loss": 0.3344,
"mean_token_accuracy": 0.8824751214124262,
"num_tokens": 592555939.0,
"step": 689
},
{
"entropy": 0.393218994140625,
"epoch": 5.476190476190476,
"grad_norm": 0.6052791805512318,
"learning_rate": 2.1331821034886846e-07,
"loss": 0.3268,
"mean_token_accuracy": 0.8844651393592358,
"num_tokens": 593422295.0,
"step": 690
},
{
"entropy": 0.3939666748046875,
"epoch": 5.484126984126984,
"grad_norm": 0.6765361632797722,
"learning_rate": 2.0704201210384634e-07,
"loss": 0.3294,
"mean_token_accuracy": 0.8817935772240162,
"num_tokens": 594275578.0,
"step": 691
},
{
"entropy": 0.3902130126953125,
"epoch": 5.492063492063492,
"grad_norm": 0.6558765212307175,
"learning_rate": 2.0085757393940586e-07,
"loss": 0.3276,
"mean_token_accuracy": 0.8839560803025961,
"num_tokens": 595158105.0,
"step": 692
},
{
"entropy": 0.3959808349609375,
"epoch": 5.5,
"grad_norm": 0.6728354108367586,
"learning_rate": 1.9476501425519656e-07,
"loss": 0.3314,
"mean_token_accuracy": 0.8821393130347133,
"num_tokens": 596012207.0,
"step": 693
},
{
"entropy": 0.3945465087890625,
"epoch": 5.507936507936508,
"grad_norm": 0.6161700876601213,
"learning_rate": 1.8876444969187557e-07,
"loss": 0.3252,
"mean_token_accuracy": 0.884291214402765,
"num_tokens": 596867579.0,
"step": 694
},
{
"entropy": 0.3921051025390625,
"epoch": 5.515873015873016,
"grad_norm": 0.6659432185652765,
"learning_rate": 1.828559951288733e-07,
"loss": 0.3294,
"mean_token_accuracy": 0.8830416211858392,
"num_tokens": 597729003.0,
"step": 695
},
{
"entropy": 0.3937225341796875,
"epoch": 5.523809523809524,
"grad_norm": 0.6337625434766239,
"learning_rate": 1.7703976368219633e-07,
"loss": 0.3387,
"mean_token_accuracy": 0.8793549695983529,
"num_tokens": 598610243.0,
"step": 696
},
{
"entropy": 0.398223876953125,
"epoch": 5.531746031746032,
"grad_norm": 0.6425672701806616,
"learning_rate": 1.713158667022613e-07,
"loss": 0.3282,
"mean_token_accuracy": 0.8831517458893359,
"num_tokens": 599468184.0,
"step": 697
},
{
"entropy": 0.390777587890625,
"epoch": 5.5396825396825395,
"grad_norm": 0.6588868479383553,
"learning_rate": 1.656844137717617e-07,
"loss": 0.3241,
"mean_token_accuracy": 0.8842832935042679,
"num_tokens": 600335530.0,
"step": 698
},
{
"entropy": 0.39007568359375,
"epoch": 5.5476190476190474,
"grad_norm": 0.6391375374634255,
"learning_rate": 1.601455127035717e-07,
"loss": 0.3303,
"mean_token_accuracy": 0.8812260185368359,
"num_tokens": 601219591.0,
"step": 699
},
{
"entropy": 0.3961639404296875,
"epoch": 5.555555555555555,
"grad_norm": 0.6312177790645791,
"learning_rate": 1.5469926953868063e-07,
"loss": 0.3277,
"mean_token_accuracy": 0.8838478000834584,
"num_tokens": 602080711.0,
"step": 700
},
{
"entropy": 0.3947296142578125,
"epoch": 5.563492063492063,
"grad_norm": 0.6168029707269869,
"learning_rate": 1.4934578854416403e-07,
"loss": 0.3273,
"mean_token_accuracy": 0.8831527666188776,
"num_tokens": 602939251.0,
"step": 701
},
{
"entropy": 0.3935546875,
"epoch": 5.571428571428571,
"grad_norm": 0.6196463005466791,
"learning_rate": 1.440851722111858e-07,
"loss": 0.3214,
"mean_token_accuracy": 0.8847082569263875,
"num_tokens": 603814211.0,
"step": 702
},
{
"entropy": 0.3931884765625,
"epoch": 5.579365079365079,
"grad_norm": 0.6582430849772537,
"learning_rate": 1.389175212530397e-07,
"loss": 0.3279,
"mean_token_accuracy": 0.8828860782086849,
"num_tokens": 604668809.0,
"step": 703
},
{
"entropy": 0.395904541015625,
"epoch": 5.587301587301587,
"grad_norm": 0.6557024272945261,
"learning_rate": 1.3384293460321662e-07,
"loss": 0.3316,
"mean_token_accuracy": 0.8824727293103933,
"num_tokens": 605530653.0,
"step": 704
},
{
"entropy": 0.39093017578125,
"epoch": 5.595238095238095,
"grad_norm": 0.5952866739722456,
"learning_rate": 1.2886150941351317e-07,
"loss": 0.3282,
"mean_token_accuracy": 0.8835100992582738,
"num_tokens": 606405426.0,
"step": 705
},
{
"entropy": 0.394195556640625,
"epoch": 5.603174603174603,
"grad_norm": 0.6329738035693675,
"learning_rate": 1.2397334105217097e-07,
"loss": 0.3251,
"mean_token_accuracy": 0.8846405958756804,
"num_tokens": 607272519.0,
"step": 706
},
{
"entropy": 0.3961181640625,
"epoch": 5.611111111111111,
"grad_norm": 0.6167631677238792,
"learning_rate": 1.1917852310205147e-07,
"loss": 0.3279,
"mean_token_accuracy": 0.883335932623595,
"num_tokens": 608126636.0,
"step": 707
},
{
"entropy": 0.3947296142578125,
"epoch": 5.619047619047619,
"grad_norm": 0.6123371507088005,
"learning_rate": 1.1447714735884463e-07,
"loss": 0.3197,
"mean_token_accuracy": 0.8852512533776462,
"num_tokens": 608972728.0,
"step": 708
},
{
"entropy": 0.3878631591796875,
"epoch": 5.6269841269841265,
"grad_norm": 0.6206905437393018,
"learning_rate": 1.0986930382930916e-07,
"loss": 0.3251,
"mean_token_accuracy": 0.8831897312775254,
"num_tokens": 609880767.0,
"step": 709
},
{
"entropy": 0.39312744140625,
"epoch": 5.634920634920634,
"grad_norm": 0.6362558759153903,
"learning_rate": 1.0535508072955225e-07,
"loss": 0.3284,
"mean_token_accuracy": 0.8815909679979086,
"num_tokens": 610732354.0,
"step": 710
},
{
"entropy": 0.3891448974609375,
"epoch": 5.642857142857143,
"grad_norm": 0.6132840766865005,
"learning_rate": 1.0093456448333872e-07,
"loss": 0.3259,
"mean_token_accuracy": 0.8822133978828788,
"num_tokens": 611602054.0,
"step": 711
},
{
"entropy": 0.393310546875,
"epoch": 5.650793650793651,
"grad_norm": 0.6300293214423813,
"learning_rate": 9.660783972043786e-08,
"loss": 0.3285,
"mean_token_accuracy": 0.8818607972934842,
"num_tokens": 612448801.0,
"step": 712
},
{
"entropy": 0.3934173583984375,
"epoch": 5.658730158730159,
"grad_norm": 0.6904998334631229,
"learning_rate": 9.237498927500088e-08,
"loss": 0.3302,
"mean_token_accuracy": 0.881088858935982,
"num_tokens": 613302281.0,
"step": 713
},
{
"entropy": 0.3985748291015625,
"epoch": 5.666666666666667,
"grad_norm": 0.6336591385840509,
"learning_rate": 8.823609418397939e-08,
"loss": 0.324,
"mean_token_accuracy": 0.8830747129395604,
"num_tokens": 614133343.0,
"step": 714
},
{
"entropy": 0.3961181640625,
"epoch": 5.674603174603175,
"grad_norm": 0.6560814482182532,
"learning_rate": 8.419123368556991e-08,
"loss": 0.3281,
"mean_token_accuracy": 0.8820374673232436,
"num_tokens": 614971152.0,
"step": 715
},
{
"entropy": 0.395751953125,
"epoch": 5.682539682539683,
"grad_norm": 0.61946329385844,
"learning_rate": 8.024048521769745e-08,
"loss": 0.3244,
"mean_token_accuracy": 0.8853642977774143,
"num_tokens": 615848347.0,
"step": 716
},
{
"entropy": 0.3966522216796875,
"epoch": 5.690476190476191,
"grad_norm": 0.6677220378111695,
"learning_rate": 7.638392441653542e-08,
"loss": 0.3315,
"mean_token_accuracy": 0.8841713918372989,
"num_tokens": 616696985.0,
"step": 717
},
{
"entropy": 0.386993408203125,
"epoch": 5.698412698412699,
"grad_norm": 0.6448605664251128,
"learning_rate": 7.262162511505466e-08,
"loss": 0.323,
"mean_token_accuracy": 0.8844177662394941,
"num_tokens": 617578479.0,
"step": 718
},
{
"entropy": 0.3964385986328125,
"epoch": 5.7063492063492065,
"grad_norm": 0.6465949886444489,
"learning_rate": 6.895365934161236e-08,
"loss": 0.3265,
"mean_token_accuracy": 0.883903234731406,
"num_tokens": 618415443.0,
"step": 719
},
{
"entropy": 0.3905792236328125,
"epoch": 5.714285714285714,
"grad_norm": 0.7626270402963101,
"learning_rate": 6.538009731857087e-08,
"loss": 0.3266,
"mean_token_accuracy": 0.8833001307211816,
"num_tokens": 619287864.0,
"step": 720
},
{
"entropy": 0.3910675048828125,
"epoch": 5.722222222222222,
"grad_norm": 0.6559784419468065,
"learning_rate": 6.190100746095495e-08,
"loss": 0.3243,
"mean_token_accuracy": 0.8850177507847548,
"num_tokens": 620198481.0,
"step": 721
},
{
"entropy": 0.3970184326171875,
"epoch": 5.73015873015873,
"grad_norm": 0.6389982422035683,
"learning_rate": 5.851645637514114e-08,
"loss": 0.327,
"mean_token_accuracy": 0.8827162678353488,
"num_tokens": 621065449.0,
"step": 722
},
{
"entropy": 0.3971710205078125,
"epoch": 5.738095238095238,
"grad_norm": 0.6140180252911043,
"learning_rate": 5.522650885758374e-08,
"loss": 0.3204,
"mean_token_accuracy": 0.8850936810486019,
"num_tokens": 621906875.0,
"step": 723
},
{
"entropy": 0.3934326171875,
"epoch": 5.746031746031746,
"grad_norm": 0.6533501187294389,
"learning_rate": 5.203122789357307e-08,
"loss": 0.3342,
"mean_token_accuracy": 0.881910024676472,
"num_tokens": 622774268.0,
"step": 724
},
{
"entropy": 0.3961334228515625,
"epoch": 5.753968253968254,
"grad_norm": 0.6833130980259546,
"learning_rate": 4.893067465602863e-08,
"loss": 0.3307,
"mean_token_accuracy": 0.8820976046845317,
"num_tokens": 623625107.0,
"step": 725
},
{
"entropy": 0.3968048095703125,
"epoch": 5.761904761904762,
"grad_norm": 0.763571756315237,
"learning_rate": 4.5924908504331735e-08,
"loss": 0.3303,
"mean_token_accuracy": 0.8829025984741747,
"num_tokens": 624511530.0,
"step": 726
},
{
"entropy": 0.392822265625,
"epoch": 5.76984126984127,
"grad_norm": 0.6168843969129469,
"learning_rate": 4.3013986983184705e-08,
"loss": 0.3234,
"mean_token_accuracy": 0.884893387556076,
"num_tokens": 625349169.0,
"step": 727
},
{
"entropy": 0.393035888671875,
"epoch": 5.777777777777778,
"grad_norm": 0.6294236264104904,
"learning_rate": 4.019796582151181e-08,
"loss": 0.3231,
"mean_token_accuracy": 0.883484820369631,
"num_tokens": 626205023.0,
"step": 728
},
{
"entropy": 0.392425537109375,
"epoch": 5.785714285714286,
"grad_norm": 0.7014456722284323,
"learning_rate": 3.747689893139228e-08,
"loss": 0.3253,
"mean_token_accuracy": 0.884223835542798,
"num_tokens": 627072409.0,
"step": 729
},
{
"entropy": 0.39483642578125,
"epoch": 5.7936507936507935,
"grad_norm": 0.6354033492793867,
"learning_rate": 3.4850838407027297e-08,
"loss": 0.3351,
"mean_token_accuracy": 0.8803266729228199,
"num_tokens": 627933762.0,
"step": 730
},
{
"entropy": 0.392974853515625,
"epoch": 5.801587301587301,
"grad_norm": 0.6324944348230324,
"learning_rate": 3.2319834523742435e-08,
"loss": 0.3248,
"mean_token_accuracy": 0.8832277562469244,
"num_tokens": 628815182.0,
"step": 731
},
{
"entropy": 0.3950042724609375,
"epoch": 5.809523809523809,
"grad_norm": 0.629955389311813,
"learning_rate": 2.988393573702675e-08,
"loss": 0.3201,
"mean_token_accuracy": 0.884325556922704,
"num_tokens": 629666458.0,
"step": 732
},
{
"entropy": 0.3938446044921875,
"epoch": 5.817460317460317,
"grad_norm": 0.6166361263236042,
"learning_rate": 2.754318868160244e-08,
"loss": 0.3221,
"mean_token_accuracy": 0.8852822785265744,
"num_tokens": 630529787.0,
"step": 733
},
{
"entropy": 0.397705078125,
"epoch": 5.825396825396825,
"grad_norm": 0.644139271435762,
"learning_rate": 2.5297638170535542e-08,
"loss": 0.3219,
"mean_token_accuracy": 0.8845392796210945,
"num_tokens": 631374518.0,
"step": 734
},
{
"entropy": 0.3916778564453125,
"epoch": 5.833333333333333,
"grad_norm": 0.6747464037741341,
"learning_rate": 2.31473271943744e-08,
"loss": 0.3364,
"mean_token_accuracy": 0.8802501196041703,
"num_tokens": 632234249.0,
"step": 735
},
{
"entropy": 0.3955078125,
"epoch": 5.841269841269841,
"grad_norm": 0.639174800920214,
"learning_rate": 2.109229692032977e-08,
"loss": 0.3255,
"mean_token_accuracy": 0.8846253966912627,
"num_tokens": 633096164.0,
"step": 736
},
{
"entropy": 0.3962554931640625,
"epoch": 5.849206349206349,
"grad_norm": 0.6365328488931461,
"learning_rate": 1.9132586691484323e-08,
"loss": 0.32,
"mean_token_accuracy": 0.8840158293023705,
"num_tokens": 633966696.0,
"step": 737
},
{
"entropy": 0.396240234375,
"epoch": 5.857142857142857,
"grad_norm": 0.6274839579353274,
"learning_rate": 1.7268234026041053e-08,
"loss": 0.3254,
"mean_token_accuracy": 0.8834780002944171,
"num_tokens": 634817676.0,
"step": 738
},
{
"entropy": 0.394012451171875,
"epoch": 5.865079365079366,
"grad_norm": 0.6482673934422294,
"learning_rate": 1.5499274616602723e-08,
"loss": 0.3246,
"mean_token_accuracy": 0.8846540525555611,
"num_tokens": 635677818.0,
"step": 739
},
{
"entropy": 0.3939056396484375,
"epoch": 5.8730158730158735,
"grad_norm": 0.6092517161901388,
"learning_rate": 1.3825742329492408e-08,
"loss": 0.3286,
"mean_token_accuracy": 0.8829479278065264,
"num_tokens": 636552146.0,
"step": 740
},
{
"entropy": 0.3929901123046875,
"epoch": 5.880952380952381,
"grad_norm": 0.6568730517308575,
"learning_rate": 1.2247669204100699e-08,
"loss": 0.3324,
"mean_token_accuracy": 0.8809677893295884,
"num_tokens": 637434649.0,
"step": 741
},
{
"entropy": 0.3914947509765625,
"epoch": 5.888888888888889,
"grad_norm": 0.6200572094186191,
"learning_rate": 1.0765085452275614e-08,
"loss": 0.3292,
"mean_token_accuracy": 0.8837377014569938,
"num_tokens": 638294457.0,
"step": 742
},
{
"entropy": 0.3955535888671875,
"epoch": 5.896825396825397,
"grad_norm": 0.6375108635856576,
"learning_rate": 9.378019457743082e-09,
"loss": 0.3276,
"mean_token_accuracy": 0.8823563028126955,
"num_tokens": 639157905.0,
"step": 743
},
{
"entropy": 0.3954620361328125,
"epoch": 5.904761904761905,
"grad_norm": 0.6632626274807544,
"learning_rate": 8.086497775562918e-09,
"loss": 0.3306,
"mean_token_accuracy": 0.8825922501273453,
"num_tokens": 640011175.0,
"step": 744
},
{
"entropy": 0.3955535888671875,
"epoch": 5.912698412698413,
"grad_norm": 0.6203028600677455,
"learning_rate": 6.890545131621462e-09,
"loss": 0.3296,
"mean_token_accuracy": 0.8806997863575816,
"num_tokens": 640861100.0,
"step": 745
},
{
"entropy": 0.3971710205078125,
"epoch": 5.920634920634921,
"grad_norm": 0.608418609243511,
"learning_rate": 5.790184422158063e-09,
"loss": 0.3201,
"mean_token_accuracy": 0.8849551575258374,
"num_tokens": 641710588.0,
"step": 746
},
{
"entropy": 0.398040771484375,
"epoch": 5.928571428571429,
"grad_norm": 0.6703364371814936,
"learning_rate": 4.785436713324876e-09,
"loss": 0.3223,
"mean_token_accuracy": 0.8841964863240719,
"num_tokens": 642574184.0,
"step": 747
},
{
"entropy": 0.390533447265625,
"epoch": 5.936507936507937,
"grad_norm": 0.6254194421549316,
"learning_rate": 3.876321240786629e-09,
"loss": 0.3255,
"mean_token_accuracy": 0.8837793176062405,
"num_tokens": 643427789.0,
"step": 748
},
{
"entropy": 0.391357421875,
"epoch": 5.944444444444445,
"grad_norm": 0.6047693920470687,
"learning_rate": 3.062855409350918e-09,
"loss": 0.3226,
"mean_token_accuracy": 0.8838337506167591,
"num_tokens": 644312139.0,
"step": 749
},
{
"entropy": 0.3914794921875,
"epoch": 5.9523809523809526,
"grad_norm": 0.620245834415008,
"learning_rate": 2.345054792634027e-09,
"loss": 0.3192,
"mean_token_accuracy": 0.8871927126310766,
"num_tokens": 645164595.0,
"step": 750
},
{
"entropy": 0.3953399658203125,
"epoch": 5.9603174603174605,
"grad_norm": 0.597155253447394,
"learning_rate": 1.7229331327633935e-09,
"loss": 0.3258,
"mean_token_accuracy": 0.8841970260255039,
"num_tokens": 646011434.0,
"step": 751
},
{
"entropy": 0.3984527587890625,
"epoch": 5.968253968253968,
"grad_norm": 0.6471414852719458,
"learning_rate": 1.1965023401161457e-09,
"loss": 0.3224,
"mean_token_accuracy": 0.8848558394238353,
"num_tokens": 646838463.0,
"step": 752
},
{
"entropy": 0.3917388916015625,
"epoch": 5.976190476190476,
"grad_norm": 0.6216717165417032,
"learning_rate": 7.657724930887344e-10,
"loss": 0.3189,
"mean_token_accuracy": 0.8857344668358564,
"num_tokens": 647701193.0,
"step": 753
},
{
"entropy": 0.398895263671875,
"epoch": 5.984126984126984,
"grad_norm": 0.6120300659931632,
"learning_rate": 4.3075183790541875e-10,
"loss": 0.3228,
"mean_token_accuracy": 0.8856973070651293,
"num_tokens": 648543554.0,
"step": 754
},
{
"entropy": 0.39404296875,
"epoch": 5.992063492063492,
"grad_norm": 0.6483787873623111,
"learning_rate": 1.9144678845950393e-10,
"loss": 0.3344,
"mean_token_accuracy": 0.8800923773087561,
"num_tokens": 649397139.0,
"step": 755
},
{
"entropy": 0.3967437744140625,
"epoch": 6.0,
"grad_norm": 0.6554944488124251,
"learning_rate": 4.786192619121721e-11,
"loss": 0.3239,
"mean_token_accuracy": 0.883914896287024,
"num_tokens": 650235668.0,
"step": 756
},
{
"epoch": 6.0,
"step": 756,
"total_flos": 1202499003482112.0,
"train_loss": 0.440722111040953,
"train_runtime": 115067.3138,
"train_samples_per_second": 1.28,
"train_steps_per_second": 0.007
}
],
"logging_steps": 1,
"max_steps": 756,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 63,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1202499003482112.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}