Files
eus-latn-10mb-after-ppt-Dp-…/checkpoint-500/trainer_state.json
ModelHub XC 35fe8e09a3 初始化项目,由ModelHub XC社区提供模型
Model: fpadovani/eus-latn-10mb-after-ppt-Dp-10mb-ckpt500_seed3407
Source: Original Platform
2026-06-28 06:31:18 +08:00

1046 lines
28 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.42973785990545765,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 7.6312949657440186,
"epoch": 0.004297378599054577,
"grad_norm": 0.94921875,
"learning_rate": 2e-06,
"loss": 7.384,
"mean_token_accuracy": 0.09047168418765068,
"num_tokens": 10107.0,
"step": 5
},
{
"entropy": 7.674387979507446,
"epoch": 0.008594757198109154,
"grad_norm": 1.1484375,
"learning_rate": 4.5e-06,
"loss": 7.3814,
"mean_token_accuracy": 0.09915048182010651,
"num_tokens": 18391.0,
"step": 10
},
{
"entropy": 7.658490705490112,
"epoch": 0.01289213579716373,
"grad_norm": 1.015625,
"learning_rate": 7e-06,
"loss": 7.4194,
"mean_token_accuracy": 0.09372682273387908,
"num_tokens": 27061.0,
"step": 15
},
{
"entropy": 7.6485553741455075,
"epoch": 0.017189514396218308,
"grad_norm": 1.09375,
"learning_rate": 9.5e-06,
"loss": 7.4387,
"mean_token_accuracy": 0.09950413554906845,
"num_tokens": 36339.0,
"step": 20
},
{
"entropy": 7.655299663543701,
"epoch": 0.021486892995272882,
"grad_norm": 0.95703125,
"learning_rate": 1.2e-05,
"loss": 7.4336,
"mean_token_accuracy": 0.09199422970414162,
"num_tokens": 45770.0,
"step": 25
},
{
"entropy": 7.707321071624756,
"epoch": 0.02578427159432746,
"grad_norm": 0.96875,
"learning_rate": 1.4500000000000002e-05,
"loss": 7.4406,
"mean_token_accuracy": 0.09267855286598206,
"num_tokens": 54575.0,
"step": 30
},
{
"entropy": 7.718957376480103,
"epoch": 0.030081650193382038,
"grad_norm": 0.97265625,
"learning_rate": 1.7000000000000003e-05,
"loss": 7.5222,
"mean_token_accuracy": 0.08976790606975556,
"num_tokens": 66403.0,
"step": 35
},
{
"entropy": 7.742082262039185,
"epoch": 0.034379028792436615,
"grad_norm": 0.87890625,
"learning_rate": 1.95e-05,
"loss": 7.4377,
"mean_token_accuracy": 0.09164252653717994,
"num_tokens": 76510.0,
"step": 40
},
{
"entropy": 7.745701646804809,
"epoch": 0.03867640739149119,
"grad_norm": 0.99609375,
"learning_rate": 2.2e-05,
"loss": 7.358,
"mean_token_accuracy": 0.0955798089504242,
"num_tokens": 84836.0,
"step": 45
},
{
"entropy": 7.780595874786377,
"epoch": 0.042973785990545764,
"grad_norm": 0.984375,
"learning_rate": 2.4500000000000003e-05,
"loss": 7.3289,
"mean_token_accuracy": 0.10552914068102837,
"num_tokens": 93197.0,
"step": 50
},
{
"entropy": 7.764179325103759,
"epoch": 0.047271164589600345,
"grad_norm": 0.98828125,
"learning_rate": 2.7e-05,
"loss": 7.3234,
"mean_token_accuracy": 0.09917277097702026,
"num_tokens": 101546.0,
"step": 55
},
{
"entropy": 7.719727945327759,
"epoch": 0.05156854318865492,
"grad_norm": 0.8515625,
"learning_rate": 2.95e-05,
"loss": 7.4172,
"mean_token_accuracy": 0.0928034670650959,
"num_tokens": 111703.0,
"step": 60
},
{
"entropy": 7.748228645324707,
"epoch": 0.055865921787709494,
"grad_norm": 0.95703125,
"learning_rate": 3.2e-05,
"loss": 7.3403,
"mean_token_accuracy": 0.10037123262882233,
"num_tokens": 119894.0,
"step": 65
},
{
"entropy": 7.714352416992187,
"epoch": 0.060163300386764075,
"grad_norm": 0.89453125,
"learning_rate": 3.4500000000000005e-05,
"loss": 7.2915,
"mean_token_accuracy": 0.1022428810596466,
"num_tokens": 128885.0,
"step": 70
},
{
"entropy": 7.679376173019409,
"epoch": 0.06446067898581866,
"grad_norm": 0.8984375,
"learning_rate": 3.7e-05,
"loss": 7.4226,
"mean_token_accuracy": 0.0972097434103489,
"num_tokens": 138106.0,
"step": 75
},
{
"entropy": 7.72790002822876,
"epoch": 0.06875805758487323,
"grad_norm": 1.140625,
"learning_rate": 3.95e-05,
"loss": 7.3294,
"mean_token_accuracy": 0.1022751808166504,
"num_tokens": 146691.0,
"step": 80
},
{
"entropy": 7.730126142501831,
"epoch": 0.0730554361839278,
"grad_norm": 0.99609375,
"learning_rate": 4.2000000000000004e-05,
"loss": 7.382,
"mean_token_accuracy": 0.09973402544856072,
"num_tokens": 155792.0,
"step": 85
},
{
"entropy": 7.727601718902588,
"epoch": 0.07735281478298238,
"grad_norm": 0.89453125,
"learning_rate": 4.45e-05,
"loss": 7.4474,
"mean_token_accuracy": 0.08758748695254326,
"num_tokens": 166944.0,
"step": 90
},
{
"entropy": 7.782265329360962,
"epoch": 0.08165019338203695,
"grad_norm": 0.98828125,
"learning_rate": 4.7000000000000004e-05,
"loss": 7.2886,
"mean_token_accuracy": 0.1041356198489666,
"num_tokens": 175303.0,
"step": 95
},
{
"entropy": 7.751953029632569,
"epoch": 0.08594757198109153,
"grad_norm": 1.0078125,
"learning_rate": 4.9500000000000004e-05,
"loss": 7.3403,
"mean_token_accuracy": 0.09793160557746887,
"num_tokens": 184708.0,
"step": 100
},
{
"entropy": 7.702822208404541,
"epoch": 0.09024495058014612,
"grad_norm": 0.921875,
"learning_rate": 5.2e-05,
"loss": 7.3117,
"mean_token_accuracy": 0.09851032048463822,
"num_tokens": 193835.0,
"step": 105
},
{
"entropy": 7.686660861968994,
"epoch": 0.09454232917920069,
"grad_norm": 1.1328125,
"learning_rate": 5.45e-05,
"loss": 7.3479,
"mean_token_accuracy": 0.0979080393910408,
"num_tokens": 203344.0,
"step": 110
},
{
"entropy": 7.698584461212159,
"epoch": 0.09883970777825526,
"grad_norm": 0.9296875,
"learning_rate": 5.7e-05,
"loss": 7.4586,
"mean_token_accuracy": 0.09130895733833314,
"num_tokens": 213048.0,
"step": 115
},
{
"entropy": 7.781258678436279,
"epoch": 0.10313708637730984,
"grad_norm": 1.109375,
"learning_rate": 5.9499999999999996e-05,
"loss": 7.3094,
"mean_token_accuracy": 0.10353164449334144,
"num_tokens": 221784.0,
"step": 120
},
{
"entropy": 7.650211572647095,
"epoch": 0.10743446497636441,
"grad_norm": 1.0078125,
"learning_rate": 6.2e-05,
"loss": 7.3189,
"mean_token_accuracy": 0.09726176261901856,
"num_tokens": 230971.0,
"step": 125
},
{
"entropy": 7.655170726776123,
"epoch": 0.11173184357541899,
"grad_norm": 0.96484375,
"learning_rate": 6.450000000000001e-05,
"loss": 7.2818,
"mean_token_accuracy": 0.1042576052248478,
"num_tokens": 240524.0,
"step": 130
},
{
"entropy": 7.7341550350189205,
"epoch": 0.11602922217447358,
"grad_norm": 0.88671875,
"learning_rate": 6.7e-05,
"loss": 7.2512,
"mean_token_accuracy": 0.1007460281252861,
"num_tokens": 249220.0,
"step": 135
},
{
"entropy": 7.745693302154541,
"epoch": 0.12032660077352815,
"grad_norm": 1.0234375,
"learning_rate": 6.950000000000001e-05,
"loss": 7.3688,
"mean_token_accuracy": 0.10030856803059578,
"num_tokens": 258934.0,
"step": 140
},
{
"entropy": 7.694993305206299,
"epoch": 0.12462397937258272,
"grad_norm": 1.0234375,
"learning_rate": 7.2e-05,
"loss": 7.2936,
"mean_token_accuracy": 0.10321335718035698,
"num_tokens": 267680.0,
"step": 145
},
{
"entropy": 7.719129991531372,
"epoch": 0.1289213579716373,
"grad_norm": 1.0078125,
"learning_rate": 7.45e-05,
"loss": 7.3236,
"mean_token_accuracy": 0.10207543894648552,
"num_tokens": 276227.0,
"step": 150
},
{
"entropy": 7.648375129699707,
"epoch": 0.1332187365706919,
"grad_norm": 0.94921875,
"learning_rate": 7.7e-05,
"loss": 7.2203,
"mean_token_accuracy": 0.1059327855706215,
"num_tokens": 286342.0,
"step": 155
},
{
"entropy": 7.674158382415771,
"epoch": 0.13751611516974646,
"grad_norm": 1.0625,
"learning_rate": 7.950000000000001e-05,
"loss": 7.2988,
"mean_token_accuracy": 0.09665355160832405,
"num_tokens": 294994.0,
"step": 160
},
{
"entropy": 7.717900514602661,
"epoch": 0.14181349376880104,
"grad_norm": 1.046875,
"learning_rate": 8.2e-05,
"loss": 7.2704,
"mean_token_accuracy": 0.10349940955638885,
"num_tokens": 303882.0,
"step": 165
},
{
"entropy": 7.6729988098144535,
"epoch": 0.1461108723678556,
"grad_norm": 0.9609375,
"learning_rate": 8.450000000000001e-05,
"loss": 7.3104,
"mean_token_accuracy": 0.10128599181771278,
"num_tokens": 312515.0,
"step": 170
},
{
"entropy": 7.739007139205933,
"epoch": 0.15040825096691018,
"grad_norm": 1.2109375,
"learning_rate": 8.7e-05,
"loss": 7.27,
"mean_token_accuracy": 0.10081852003931999,
"num_tokens": 320801.0,
"step": 175
},
{
"entropy": 7.720875406265259,
"epoch": 0.15470562956596476,
"grad_norm": 1.015625,
"learning_rate": 8.95e-05,
"loss": 7.2872,
"mean_token_accuracy": 0.10100285485386848,
"num_tokens": 329382.0,
"step": 180
},
{
"entropy": 7.66646089553833,
"epoch": 0.15900300816501933,
"grad_norm": 1.0390625,
"learning_rate": 9.2e-05,
"loss": 7.2814,
"mean_token_accuracy": 0.1028428927063942,
"num_tokens": 337894.0,
"step": 185
},
{
"entropy": 7.772510719299317,
"epoch": 0.1633003867640739,
"grad_norm": 1.125,
"learning_rate": 9.45e-05,
"loss": 7.2803,
"mean_token_accuracy": 0.10378619506955147,
"num_tokens": 346380.0,
"step": 190
},
{
"entropy": 7.690706968307495,
"epoch": 0.16759776536312848,
"grad_norm": 0.890625,
"learning_rate": 9.7e-05,
"loss": 7.3588,
"mean_token_accuracy": 0.09733301475644111,
"num_tokens": 356305.0,
"step": 195
},
{
"entropy": 7.79454927444458,
"epoch": 0.17189514396218306,
"grad_norm": 1.0078125,
"learning_rate": 9.95e-05,
"loss": 7.306,
"mean_token_accuracy": 0.09683404862880707,
"num_tokens": 364899.0,
"step": 200
},
{
"entropy": 7.694888687133789,
"epoch": 0.17619252256123766,
"grad_norm": 1.015625,
"learning_rate": 0.000102,
"loss": 7.2938,
"mean_token_accuracy": 0.09810400977730752,
"num_tokens": 373663.0,
"step": 205
},
{
"entropy": 7.748025798797608,
"epoch": 0.18048990116029223,
"grad_norm": 1.1640625,
"learning_rate": 0.00010449999999999999,
"loss": 7.2566,
"mean_token_accuracy": 0.10043591782450675,
"num_tokens": 382730.0,
"step": 210
},
{
"entropy": 7.706165361404419,
"epoch": 0.1847872797593468,
"grad_norm": 1.1328125,
"learning_rate": 0.000107,
"loss": 7.3157,
"mean_token_accuracy": 0.09612104147672654,
"num_tokens": 392676.0,
"step": 215
},
{
"entropy": 7.760982656478882,
"epoch": 0.18908465835840138,
"grad_norm": 1.2265625,
"learning_rate": 0.0001095,
"loss": 7.2955,
"mean_token_accuracy": 0.10281639397144318,
"num_tokens": 401050.0,
"step": 220
},
{
"entropy": 7.626513719558716,
"epoch": 0.19338203695745596,
"grad_norm": 1.078125,
"learning_rate": 0.000112,
"loss": 7.2692,
"mean_token_accuracy": 0.10119878426194191,
"num_tokens": 410009.0,
"step": 225
},
{
"entropy": 7.726489019393921,
"epoch": 0.19767941555651053,
"grad_norm": 0.98828125,
"learning_rate": 0.0001145,
"loss": 7.2683,
"mean_token_accuracy": 0.10186234638094901,
"num_tokens": 419302.0,
"step": 230
},
{
"entropy": 7.643717670440674,
"epoch": 0.2019767941555651,
"grad_norm": 1.109375,
"learning_rate": 0.00011700000000000001,
"loss": 7.1665,
"mean_token_accuracy": 0.10647615045309067,
"num_tokens": 427296.0,
"step": 235
},
{
"entropy": 7.666737127304077,
"epoch": 0.20627417275461968,
"grad_norm": 1.125,
"learning_rate": 0.00011949999999999999,
"loss": 7.3139,
"mean_token_accuracy": 0.10131902173161507,
"num_tokens": 436368.0,
"step": 240
},
{
"entropy": 7.772911167144775,
"epoch": 0.21057155135367425,
"grad_norm": 1.046875,
"learning_rate": 0.000122,
"loss": 7.2112,
"mean_token_accuracy": 0.1055280588567257,
"num_tokens": 445535.0,
"step": 245
},
{
"entropy": 7.602903366088867,
"epoch": 0.21486892995272883,
"grad_norm": 1.046875,
"learning_rate": 0.0001245,
"loss": 7.2153,
"mean_token_accuracy": 0.10406075567007064,
"num_tokens": 454769.0,
"step": 250
},
{
"entropy": 7.693030595779419,
"epoch": 0.2191663085517834,
"grad_norm": 1.125,
"learning_rate": 0.000127,
"loss": 7.2315,
"mean_token_accuracy": 0.10270996242761612,
"num_tokens": 463975.0,
"step": 255
},
{
"entropy": 7.637308835983276,
"epoch": 0.22346368715083798,
"grad_norm": 1.109375,
"learning_rate": 0.0001295,
"loss": 7.2542,
"mean_token_accuracy": 0.10225536078214645,
"num_tokens": 472899.0,
"step": 260
},
{
"entropy": 7.740519666671753,
"epoch": 0.22776106574989258,
"grad_norm": 1.09375,
"learning_rate": 0.000132,
"loss": 7.229,
"mean_token_accuracy": 0.1005932256579399,
"num_tokens": 481556.0,
"step": 265
},
{
"entropy": 7.654651689529419,
"epoch": 0.23205844434894715,
"grad_norm": 1.0625,
"learning_rate": 0.00013450000000000002,
"loss": 7.2258,
"mean_token_accuracy": 0.10702893435955048,
"num_tokens": 490253.0,
"step": 270
},
{
"entropy": 7.660864973068238,
"epoch": 0.23635582294800173,
"grad_norm": 1.2265625,
"learning_rate": 0.00013700000000000002,
"loss": 7.2451,
"mean_token_accuracy": 0.10333684608340263,
"num_tokens": 498444.0,
"step": 275
},
{
"entropy": 7.637535953521729,
"epoch": 0.2406532015470563,
"grad_norm": 0.98046875,
"learning_rate": 0.0001395,
"loss": 7.191,
"mean_token_accuracy": 0.10794568434357643,
"num_tokens": 508330.0,
"step": 280
},
{
"entropy": 7.6566917419433596,
"epoch": 0.24495058014611087,
"grad_norm": 1.234375,
"learning_rate": 0.00014199999999999998,
"loss": 7.3004,
"mean_token_accuracy": 0.10417937636375427,
"num_tokens": 517900.0,
"step": 285
},
{
"entropy": 7.670303010940552,
"epoch": 0.24924795874516545,
"grad_norm": 1.1484375,
"learning_rate": 0.0001445,
"loss": 7.2276,
"mean_token_accuracy": 0.10308908969163895,
"num_tokens": 527808.0,
"step": 290
},
{
"entropy": 7.719700765609741,
"epoch": 0.25354533734422,
"grad_norm": 1.1484375,
"learning_rate": 0.000147,
"loss": 7.2415,
"mean_token_accuracy": 0.10010977610945701,
"num_tokens": 536931.0,
"step": 295
},
{
"entropy": 7.668509387969971,
"epoch": 0.2578427159432746,
"grad_norm": 1.1796875,
"learning_rate": 0.0001495,
"loss": 7.279,
"mean_token_accuracy": 0.10248880609869956,
"num_tokens": 545758.0,
"step": 300
},
{
"entropy": 7.700217819213867,
"epoch": 0.26214009454232917,
"grad_norm": 1.0390625,
"learning_rate": 0.000152,
"loss": 7.2819,
"mean_token_accuracy": 0.10198702886700631,
"num_tokens": 555165.0,
"step": 305
},
{
"entropy": 7.6267822265625,
"epoch": 0.2664374731413838,
"grad_norm": 1.1171875,
"learning_rate": 0.00015450000000000001,
"loss": 7.2035,
"mean_token_accuracy": 0.10117841735482216,
"num_tokens": 564719.0,
"step": 310
},
{
"entropy": 7.646708202362061,
"epoch": 0.2707348517404383,
"grad_norm": 1.0859375,
"learning_rate": 0.000157,
"loss": 7.1638,
"mean_token_accuracy": 0.10670615658164025,
"num_tokens": 573572.0,
"step": 315
},
{
"entropy": 7.759027910232544,
"epoch": 0.2750322303394929,
"grad_norm": 1.3984375,
"learning_rate": 0.0001595,
"loss": 7.3476,
"mean_token_accuracy": 0.10210367739200592,
"num_tokens": 581497.0,
"step": 320
},
{
"entropy": 7.590592908859253,
"epoch": 0.27932960893854747,
"grad_norm": 1.125,
"learning_rate": 0.000162,
"loss": 7.2138,
"mean_token_accuracy": 0.10664469674229622,
"num_tokens": 591107.0,
"step": 325
},
{
"entropy": 7.70356388092041,
"epoch": 0.28362698753760207,
"grad_norm": 1.0546875,
"learning_rate": 0.00016450000000000001,
"loss": 7.2482,
"mean_token_accuracy": 0.1050640620291233,
"num_tokens": 600241.0,
"step": 330
},
{
"entropy": 7.639587259292602,
"epoch": 0.2879243661366566,
"grad_norm": 1.0703125,
"learning_rate": 0.00016700000000000002,
"loss": 7.161,
"mean_token_accuracy": 0.1065776713192463,
"num_tokens": 608697.0,
"step": 335
},
{
"entropy": 7.602131795883179,
"epoch": 0.2922217447357112,
"grad_norm": 1.1484375,
"learning_rate": 0.00016950000000000003,
"loss": 7.1698,
"mean_token_accuracy": 0.1098954938352108,
"num_tokens": 617275.0,
"step": 340
},
{
"entropy": 7.669042348861694,
"epoch": 0.29651912333476577,
"grad_norm": 1.0859375,
"learning_rate": 0.00017199999999999998,
"loss": 7.2602,
"mean_token_accuracy": 0.1007254920899868,
"num_tokens": 626644.0,
"step": 345
},
{
"entropy": 7.623440217971802,
"epoch": 0.30081650193382037,
"grad_norm": 1.1171875,
"learning_rate": 0.00017449999999999999,
"loss": 7.1639,
"mean_token_accuracy": 0.1080157920718193,
"num_tokens": 635110.0,
"step": 350
},
{
"entropy": 7.711002826690674,
"epoch": 0.30511388053287497,
"grad_norm": 0.97265625,
"learning_rate": 0.000177,
"loss": 7.3139,
"mean_token_accuracy": 0.10216462090611458,
"num_tokens": 644746.0,
"step": 355
},
{
"entropy": 7.708708238601685,
"epoch": 0.3094112591319295,
"grad_norm": 1.234375,
"learning_rate": 0.0001795,
"loss": 7.2216,
"mean_token_accuracy": 0.1021303728222847,
"num_tokens": 654281.0,
"step": 360
},
{
"entropy": 7.534019136428833,
"epoch": 0.3137086377309841,
"grad_norm": 1.234375,
"learning_rate": 0.000182,
"loss": 7.2333,
"mean_token_accuracy": 0.10576817691326142,
"num_tokens": 663174.0,
"step": 365
},
{
"entropy": 7.660452365875244,
"epoch": 0.31800601633003867,
"grad_norm": 1.0625,
"learning_rate": 0.0001845,
"loss": 7.1525,
"mean_token_accuracy": 0.10541519671678543,
"num_tokens": 672178.0,
"step": 370
},
{
"entropy": 7.651990938186645,
"epoch": 0.32230339492909327,
"grad_norm": 1.1484375,
"learning_rate": 0.000187,
"loss": 7.1748,
"mean_token_accuracy": 0.10421534106135369,
"num_tokens": 681323.0,
"step": 375
},
{
"entropy": 7.537337684631348,
"epoch": 0.3266007735281478,
"grad_norm": 0.98046875,
"learning_rate": 0.0001895,
"loss": 7.1001,
"mean_token_accuracy": 0.11140918657183647,
"num_tokens": 690461.0,
"step": 380
},
{
"entropy": 7.596573305130005,
"epoch": 0.3308981521272024,
"grad_norm": 1.2734375,
"learning_rate": 0.000192,
"loss": 7.1461,
"mean_token_accuracy": 0.10594902262091636,
"num_tokens": 699199.0,
"step": 385
},
{
"entropy": 7.566946506500244,
"epoch": 0.33519553072625696,
"grad_norm": 1.2265625,
"learning_rate": 0.0001945,
"loss": 7.109,
"mean_token_accuracy": 0.11522968709468842,
"num_tokens": 707949.0,
"step": 390
},
{
"entropy": 7.66830849647522,
"epoch": 0.33949290932531156,
"grad_norm": 1.15625,
"learning_rate": 0.00019700000000000002,
"loss": 7.1843,
"mean_token_accuracy": 0.10416831225156784,
"num_tokens": 715752.0,
"step": 395
},
{
"entropy": 7.619978666305542,
"epoch": 0.3437902879243661,
"grad_norm": 1.2734375,
"learning_rate": 0.00019950000000000002,
"loss": 7.1119,
"mean_token_accuracy": 0.11198346018791198,
"num_tokens": 724416.0,
"step": 400
},
{
"entropy": 7.594716548919678,
"epoch": 0.3480876665234207,
"grad_norm": 1.3203125,
"learning_rate": 0.000202,
"loss": 7.1774,
"mean_token_accuracy": 0.10296614542603492,
"num_tokens": 733116.0,
"step": 405
},
{
"entropy": 7.614369249343872,
"epoch": 0.3523850451224753,
"grad_norm": 1.265625,
"learning_rate": 0.00020449999999999998,
"loss": 7.1639,
"mean_token_accuracy": 0.10737873241305351,
"num_tokens": 742093.0,
"step": 410
},
{
"entropy": 7.532227945327759,
"epoch": 0.35668242372152986,
"grad_norm": 1.1640625,
"learning_rate": 0.000207,
"loss": 7.1385,
"mean_token_accuracy": 0.11264142915606498,
"num_tokens": 750402.0,
"step": 415
},
{
"entropy": 7.510246276855469,
"epoch": 0.36097980232058446,
"grad_norm": 1.0625,
"learning_rate": 0.0002095,
"loss": 7.1129,
"mean_token_accuracy": 0.11108387559652329,
"num_tokens": 760961.0,
"step": 420
},
{
"entropy": 7.720337963104248,
"epoch": 0.365277180919639,
"grad_norm": 1.171875,
"learning_rate": 0.000212,
"loss": 7.2042,
"mean_token_accuracy": 0.10612902790307999,
"num_tokens": 770554.0,
"step": 425
},
{
"entropy": 7.437310361862183,
"epoch": 0.3695745595186936,
"grad_norm": 1.328125,
"learning_rate": 0.0002145,
"loss": 7.1596,
"mean_token_accuracy": 0.11299800872802734,
"num_tokens": 779172.0,
"step": 430
},
{
"entropy": 7.663910818099976,
"epoch": 0.37387193811774816,
"grad_norm": 1.1953125,
"learning_rate": 0.00021700000000000002,
"loss": 7.2239,
"mean_token_accuracy": 0.10290571823716163,
"num_tokens": 788040.0,
"step": 435
},
{
"entropy": 7.589281415939331,
"epoch": 0.37816931671680276,
"grad_norm": 1.125,
"learning_rate": 0.0002195,
"loss": 7.1461,
"mean_token_accuracy": 0.10722599253058433,
"num_tokens": 796786.0,
"step": 440
},
{
"entropy": 7.543337059020996,
"epoch": 0.3824666953158573,
"grad_norm": 1.4296875,
"learning_rate": 0.000222,
"loss": 7.1192,
"mean_token_accuracy": 0.10885161831974983,
"num_tokens": 805520.0,
"step": 445
},
{
"entropy": 7.486078453063965,
"epoch": 0.3867640739149119,
"grad_norm": 1.3125,
"learning_rate": 0.0002245,
"loss": 7.074,
"mean_token_accuracy": 0.10658745989203453,
"num_tokens": 814939.0,
"step": 450
},
{
"entropy": 7.534557342529297,
"epoch": 0.39106145251396646,
"grad_norm": 1.2421875,
"learning_rate": 0.00022700000000000002,
"loss": 7.0766,
"mean_token_accuracy": 0.11227057129144669,
"num_tokens": 823862.0,
"step": 455
},
{
"entropy": 7.5476549625396725,
"epoch": 0.39535883111302106,
"grad_norm": 1.15625,
"learning_rate": 0.00022950000000000002,
"loss": 7.1124,
"mean_token_accuracy": 0.10576009079813957,
"num_tokens": 832820.0,
"step": 460
},
{
"entropy": 7.601094675064087,
"epoch": 0.39965620971207566,
"grad_norm": 1.234375,
"learning_rate": 0.00023200000000000003,
"loss": 7.0697,
"mean_token_accuracy": 0.11121490225195885,
"num_tokens": 841538.0,
"step": 465
},
{
"entropy": 7.544060945510864,
"epoch": 0.4039535883111302,
"grad_norm": 1.1953125,
"learning_rate": 0.00023449999999999998,
"loss": 7.2069,
"mean_token_accuracy": 0.10181558132171631,
"num_tokens": 851123.0,
"step": 470
},
{
"entropy": 7.549469089508056,
"epoch": 0.4082509669101848,
"grad_norm": 1.1875,
"learning_rate": 0.000237,
"loss": 7.1633,
"mean_token_accuracy": 0.11091246008872986,
"num_tokens": 860357.0,
"step": 475
},
{
"entropy": 7.547894096374511,
"epoch": 0.41254834550923936,
"grad_norm": 1.234375,
"learning_rate": 0.0002395,
"loss": 7.0874,
"mean_token_accuracy": 0.10722309574484826,
"num_tokens": 869980.0,
"step": 480
},
{
"entropy": 7.507503604888916,
"epoch": 0.41684572410829396,
"grad_norm": 1.2421875,
"learning_rate": 0.000242,
"loss": 7.0572,
"mean_token_accuracy": 0.11242355704307556,
"num_tokens": 878250.0,
"step": 485
},
{
"entropy": 7.5191121101379395,
"epoch": 0.4211431027073485,
"grad_norm": 1.125,
"learning_rate": 0.0002445,
"loss": 7.1411,
"mean_token_accuracy": 0.11158529818058013,
"num_tokens": 887624.0,
"step": 490
},
{
"entropy": 7.454204320907593,
"epoch": 0.4254404813064031,
"grad_norm": 1.1640625,
"learning_rate": 0.000247,
"loss": 7.1159,
"mean_token_accuracy": 0.11260272860527039,
"num_tokens": 897120.0,
"step": 495
},
{
"entropy": 7.495032835006714,
"epoch": 0.42973785990545765,
"grad_norm": 1.140625,
"learning_rate": 0.0002495,
"loss": 7.0795,
"mean_token_accuracy": 0.11134620234370232,
"num_tokens": 906215.0,
"step": 500
},
{
"epoch": 0.42973785990545765,
"eval_entropy": 7.203803374960616,
"eval_loss": 7.096514701843262,
"eval_mean_token_accuracy": 0.11462040213649874,
"eval_num_tokens": 906215.0,
"eval_runtime": 2.0645,
"eval_samples_per_second": 1719.022,
"eval_steps_per_second": 215.059,
"step": 500
}
],
"logging_steps": 5,
"max_steps": 11630,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 204362498211840.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}