Model: fpadovani/eus-latn-10mb-after-ppt-Dp-10mb-ckpt500_seed3407 Source: Original Platform
4079 lines
110 KiB
JSON
4079 lines
110 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.7185217017619252,
|
|
"eval_steps": 500,
|
|
"global_step": 2000,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 7.6312949657440186,
|
|
"epoch": 0.004297378599054577,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 2e-06,
|
|
"loss": 7.384,
|
|
"mean_token_accuracy": 0.09047168418765068,
|
|
"num_tokens": 10107.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 7.674387979507446,
|
|
"epoch": 0.008594757198109154,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 4.5e-06,
|
|
"loss": 7.3814,
|
|
"mean_token_accuracy": 0.09915048182010651,
|
|
"num_tokens": 18391.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 7.658490705490112,
|
|
"epoch": 0.01289213579716373,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 7e-06,
|
|
"loss": 7.4194,
|
|
"mean_token_accuracy": 0.09372682273387908,
|
|
"num_tokens": 27061.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 7.6485553741455075,
|
|
"epoch": 0.017189514396218308,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 9.5e-06,
|
|
"loss": 7.4387,
|
|
"mean_token_accuracy": 0.09950413554906845,
|
|
"num_tokens": 36339.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 7.655299663543701,
|
|
"epoch": 0.021486892995272882,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 1.2e-05,
|
|
"loss": 7.4336,
|
|
"mean_token_accuracy": 0.09199422970414162,
|
|
"num_tokens": 45770.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 7.707321071624756,
|
|
"epoch": 0.02578427159432746,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 1.4500000000000002e-05,
|
|
"loss": 7.4406,
|
|
"mean_token_accuracy": 0.09267855286598206,
|
|
"num_tokens": 54575.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 7.718957376480103,
|
|
"epoch": 0.030081650193382038,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 1.7000000000000003e-05,
|
|
"loss": 7.5222,
|
|
"mean_token_accuracy": 0.08976790606975556,
|
|
"num_tokens": 66403.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 7.742082262039185,
|
|
"epoch": 0.034379028792436615,
|
|
"grad_norm": 0.87890625,
|
|
"learning_rate": 1.95e-05,
|
|
"loss": 7.4377,
|
|
"mean_token_accuracy": 0.09164252653717994,
|
|
"num_tokens": 76510.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 7.745701646804809,
|
|
"epoch": 0.03867640739149119,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 2.2e-05,
|
|
"loss": 7.358,
|
|
"mean_token_accuracy": 0.0955798089504242,
|
|
"num_tokens": 84836.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 7.780595874786377,
|
|
"epoch": 0.042973785990545764,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 2.4500000000000003e-05,
|
|
"loss": 7.3289,
|
|
"mean_token_accuracy": 0.10552914068102837,
|
|
"num_tokens": 93197.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 7.764179325103759,
|
|
"epoch": 0.047271164589600345,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 2.7e-05,
|
|
"loss": 7.3234,
|
|
"mean_token_accuracy": 0.09917277097702026,
|
|
"num_tokens": 101546.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 7.719727945327759,
|
|
"epoch": 0.05156854318865492,
|
|
"grad_norm": 0.8515625,
|
|
"learning_rate": 2.95e-05,
|
|
"loss": 7.4172,
|
|
"mean_token_accuracy": 0.0928034670650959,
|
|
"num_tokens": 111703.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 7.748228645324707,
|
|
"epoch": 0.055865921787709494,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 3.2e-05,
|
|
"loss": 7.3403,
|
|
"mean_token_accuracy": 0.10037123262882233,
|
|
"num_tokens": 119894.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 7.714352416992187,
|
|
"epoch": 0.060163300386764075,
|
|
"grad_norm": 0.89453125,
|
|
"learning_rate": 3.4500000000000005e-05,
|
|
"loss": 7.2915,
|
|
"mean_token_accuracy": 0.1022428810596466,
|
|
"num_tokens": 128885.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 7.679376173019409,
|
|
"epoch": 0.06446067898581866,
|
|
"grad_norm": 0.8984375,
|
|
"learning_rate": 3.7e-05,
|
|
"loss": 7.4226,
|
|
"mean_token_accuracy": 0.0972097434103489,
|
|
"num_tokens": 138106.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 7.72790002822876,
|
|
"epoch": 0.06875805758487323,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 3.95e-05,
|
|
"loss": 7.3294,
|
|
"mean_token_accuracy": 0.1022751808166504,
|
|
"num_tokens": 146691.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 7.730126142501831,
|
|
"epoch": 0.0730554361839278,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 4.2000000000000004e-05,
|
|
"loss": 7.382,
|
|
"mean_token_accuracy": 0.09973402544856072,
|
|
"num_tokens": 155792.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 7.727601718902588,
|
|
"epoch": 0.07735281478298238,
|
|
"grad_norm": 0.89453125,
|
|
"learning_rate": 4.45e-05,
|
|
"loss": 7.4474,
|
|
"mean_token_accuracy": 0.08758748695254326,
|
|
"num_tokens": 166944.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 7.782265329360962,
|
|
"epoch": 0.08165019338203695,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 4.7000000000000004e-05,
|
|
"loss": 7.2886,
|
|
"mean_token_accuracy": 0.1041356198489666,
|
|
"num_tokens": 175303.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 7.751953029632569,
|
|
"epoch": 0.08594757198109153,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 4.9500000000000004e-05,
|
|
"loss": 7.3403,
|
|
"mean_token_accuracy": 0.09793160557746887,
|
|
"num_tokens": 184708.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 7.702822208404541,
|
|
"epoch": 0.09024495058014612,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 5.2e-05,
|
|
"loss": 7.3117,
|
|
"mean_token_accuracy": 0.09851032048463822,
|
|
"num_tokens": 193835.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 7.686660861968994,
|
|
"epoch": 0.09454232917920069,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 5.45e-05,
|
|
"loss": 7.3479,
|
|
"mean_token_accuracy": 0.0979080393910408,
|
|
"num_tokens": 203344.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 7.698584461212159,
|
|
"epoch": 0.09883970777825526,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 5.7e-05,
|
|
"loss": 7.4586,
|
|
"mean_token_accuracy": 0.09130895733833314,
|
|
"num_tokens": 213048.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 7.781258678436279,
|
|
"epoch": 0.10313708637730984,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 5.9499999999999996e-05,
|
|
"loss": 7.3094,
|
|
"mean_token_accuracy": 0.10353164449334144,
|
|
"num_tokens": 221784.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 7.650211572647095,
|
|
"epoch": 0.10743446497636441,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 6.2e-05,
|
|
"loss": 7.3189,
|
|
"mean_token_accuracy": 0.09726176261901856,
|
|
"num_tokens": 230971.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 7.655170726776123,
|
|
"epoch": 0.11173184357541899,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 6.450000000000001e-05,
|
|
"loss": 7.2818,
|
|
"mean_token_accuracy": 0.1042576052248478,
|
|
"num_tokens": 240524.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 7.7341550350189205,
|
|
"epoch": 0.11602922217447358,
|
|
"grad_norm": 0.88671875,
|
|
"learning_rate": 6.7e-05,
|
|
"loss": 7.2512,
|
|
"mean_token_accuracy": 0.1007460281252861,
|
|
"num_tokens": 249220.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 7.745693302154541,
|
|
"epoch": 0.12032660077352815,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 6.950000000000001e-05,
|
|
"loss": 7.3688,
|
|
"mean_token_accuracy": 0.10030856803059578,
|
|
"num_tokens": 258934.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 7.694993305206299,
|
|
"epoch": 0.12462397937258272,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 7.2e-05,
|
|
"loss": 7.2936,
|
|
"mean_token_accuracy": 0.10321335718035698,
|
|
"num_tokens": 267680.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 7.719129991531372,
|
|
"epoch": 0.1289213579716373,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 7.45e-05,
|
|
"loss": 7.3236,
|
|
"mean_token_accuracy": 0.10207543894648552,
|
|
"num_tokens": 276227.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 7.648375129699707,
|
|
"epoch": 0.1332187365706919,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 7.7e-05,
|
|
"loss": 7.2203,
|
|
"mean_token_accuracy": 0.1059327855706215,
|
|
"num_tokens": 286342.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 7.674158382415771,
|
|
"epoch": 0.13751611516974646,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 7.950000000000001e-05,
|
|
"loss": 7.2988,
|
|
"mean_token_accuracy": 0.09665355160832405,
|
|
"num_tokens": 294994.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 7.717900514602661,
|
|
"epoch": 0.14181349376880104,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 8.2e-05,
|
|
"loss": 7.2704,
|
|
"mean_token_accuracy": 0.10349940955638885,
|
|
"num_tokens": 303882.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 7.6729988098144535,
|
|
"epoch": 0.1461108723678556,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 8.450000000000001e-05,
|
|
"loss": 7.3104,
|
|
"mean_token_accuracy": 0.10128599181771278,
|
|
"num_tokens": 312515.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 7.739007139205933,
|
|
"epoch": 0.15040825096691018,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 8.7e-05,
|
|
"loss": 7.27,
|
|
"mean_token_accuracy": 0.10081852003931999,
|
|
"num_tokens": 320801.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 7.720875406265259,
|
|
"epoch": 0.15470562956596476,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 8.95e-05,
|
|
"loss": 7.2872,
|
|
"mean_token_accuracy": 0.10100285485386848,
|
|
"num_tokens": 329382.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 7.66646089553833,
|
|
"epoch": 0.15900300816501933,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 9.2e-05,
|
|
"loss": 7.2814,
|
|
"mean_token_accuracy": 0.1028428927063942,
|
|
"num_tokens": 337894.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 7.772510719299317,
|
|
"epoch": 0.1633003867640739,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 9.45e-05,
|
|
"loss": 7.2803,
|
|
"mean_token_accuracy": 0.10378619506955147,
|
|
"num_tokens": 346380.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 7.690706968307495,
|
|
"epoch": 0.16759776536312848,
|
|
"grad_norm": 0.890625,
|
|
"learning_rate": 9.7e-05,
|
|
"loss": 7.3588,
|
|
"mean_token_accuracy": 0.09733301475644111,
|
|
"num_tokens": 356305.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 7.79454927444458,
|
|
"epoch": 0.17189514396218306,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 9.95e-05,
|
|
"loss": 7.306,
|
|
"mean_token_accuracy": 0.09683404862880707,
|
|
"num_tokens": 364899.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 7.694888687133789,
|
|
"epoch": 0.17619252256123766,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000102,
|
|
"loss": 7.2938,
|
|
"mean_token_accuracy": 0.09810400977730752,
|
|
"num_tokens": 373663.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 7.748025798797608,
|
|
"epoch": 0.18048990116029223,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00010449999999999999,
|
|
"loss": 7.2566,
|
|
"mean_token_accuracy": 0.10043591782450675,
|
|
"num_tokens": 382730.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 7.706165361404419,
|
|
"epoch": 0.1847872797593468,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000107,
|
|
"loss": 7.3157,
|
|
"mean_token_accuracy": 0.09612104147672654,
|
|
"num_tokens": 392676.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 7.760982656478882,
|
|
"epoch": 0.18908465835840138,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0001095,
|
|
"loss": 7.2955,
|
|
"mean_token_accuracy": 0.10281639397144318,
|
|
"num_tokens": 401050.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 7.626513719558716,
|
|
"epoch": 0.19338203695745596,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000112,
|
|
"loss": 7.2692,
|
|
"mean_token_accuracy": 0.10119878426194191,
|
|
"num_tokens": 410009.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 7.726489019393921,
|
|
"epoch": 0.19767941555651053,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0001145,
|
|
"loss": 7.2683,
|
|
"mean_token_accuracy": 0.10186234638094901,
|
|
"num_tokens": 419302.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 7.643717670440674,
|
|
"epoch": 0.2019767941555651,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00011700000000000001,
|
|
"loss": 7.1665,
|
|
"mean_token_accuracy": 0.10647615045309067,
|
|
"num_tokens": 427296.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 7.666737127304077,
|
|
"epoch": 0.20627417275461968,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00011949999999999999,
|
|
"loss": 7.3139,
|
|
"mean_token_accuracy": 0.10131902173161507,
|
|
"num_tokens": 436368.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 7.772911167144775,
|
|
"epoch": 0.21057155135367425,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000122,
|
|
"loss": 7.2112,
|
|
"mean_token_accuracy": 0.1055280588567257,
|
|
"num_tokens": 445535.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 7.602903366088867,
|
|
"epoch": 0.21486892995272883,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0001245,
|
|
"loss": 7.2153,
|
|
"mean_token_accuracy": 0.10406075567007064,
|
|
"num_tokens": 454769.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 7.693030595779419,
|
|
"epoch": 0.2191663085517834,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000127,
|
|
"loss": 7.2315,
|
|
"mean_token_accuracy": 0.10270996242761612,
|
|
"num_tokens": 463975.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 7.637308835983276,
|
|
"epoch": 0.22346368715083798,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0001295,
|
|
"loss": 7.2542,
|
|
"mean_token_accuracy": 0.10225536078214645,
|
|
"num_tokens": 472899.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 7.740519666671753,
|
|
"epoch": 0.22776106574989258,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000132,
|
|
"loss": 7.229,
|
|
"mean_token_accuracy": 0.1005932256579399,
|
|
"num_tokens": 481556.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 7.654651689529419,
|
|
"epoch": 0.23205844434894715,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00013450000000000002,
|
|
"loss": 7.2258,
|
|
"mean_token_accuracy": 0.10702893435955048,
|
|
"num_tokens": 490253.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 7.660864973068238,
|
|
"epoch": 0.23635582294800173,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00013700000000000002,
|
|
"loss": 7.2451,
|
|
"mean_token_accuracy": 0.10333684608340263,
|
|
"num_tokens": 498444.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 7.637535953521729,
|
|
"epoch": 0.2406532015470563,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0001395,
|
|
"loss": 7.191,
|
|
"mean_token_accuracy": 0.10794568434357643,
|
|
"num_tokens": 508330.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 7.6566917419433596,
|
|
"epoch": 0.24495058014611087,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00014199999999999998,
|
|
"loss": 7.3004,
|
|
"mean_token_accuracy": 0.10417937636375427,
|
|
"num_tokens": 517900.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 7.670303010940552,
|
|
"epoch": 0.24924795874516545,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0001445,
|
|
"loss": 7.2276,
|
|
"mean_token_accuracy": 0.10308908969163895,
|
|
"num_tokens": 527808.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 7.719700765609741,
|
|
"epoch": 0.25354533734422,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000147,
|
|
"loss": 7.2415,
|
|
"mean_token_accuracy": 0.10010977610945701,
|
|
"num_tokens": 536931.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 7.668509387969971,
|
|
"epoch": 0.2578427159432746,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0001495,
|
|
"loss": 7.279,
|
|
"mean_token_accuracy": 0.10248880609869956,
|
|
"num_tokens": 545758.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 7.700217819213867,
|
|
"epoch": 0.26214009454232917,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000152,
|
|
"loss": 7.2819,
|
|
"mean_token_accuracy": 0.10198702886700631,
|
|
"num_tokens": 555165.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 7.6267822265625,
|
|
"epoch": 0.2664374731413838,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00015450000000000001,
|
|
"loss": 7.2035,
|
|
"mean_token_accuracy": 0.10117841735482216,
|
|
"num_tokens": 564719.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 7.646708202362061,
|
|
"epoch": 0.2707348517404383,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000157,
|
|
"loss": 7.1638,
|
|
"mean_token_accuracy": 0.10670615658164025,
|
|
"num_tokens": 573572.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 7.759027910232544,
|
|
"epoch": 0.2750322303394929,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0001595,
|
|
"loss": 7.3476,
|
|
"mean_token_accuracy": 0.10210367739200592,
|
|
"num_tokens": 581497.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 7.590592908859253,
|
|
"epoch": 0.27932960893854747,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000162,
|
|
"loss": 7.2138,
|
|
"mean_token_accuracy": 0.10664469674229622,
|
|
"num_tokens": 591107.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 7.70356388092041,
|
|
"epoch": 0.28362698753760207,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00016450000000000001,
|
|
"loss": 7.2482,
|
|
"mean_token_accuracy": 0.1050640620291233,
|
|
"num_tokens": 600241.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 7.639587259292602,
|
|
"epoch": 0.2879243661366566,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00016700000000000002,
|
|
"loss": 7.161,
|
|
"mean_token_accuracy": 0.1065776713192463,
|
|
"num_tokens": 608697.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 7.602131795883179,
|
|
"epoch": 0.2922217447357112,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00016950000000000003,
|
|
"loss": 7.1698,
|
|
"mean_token_accuracy": 0.1098954938352108,
|
|
"num_tokens": 617275.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 7.669042348861694,
|
|
"epoch": 0.29651912333476577,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00017199999999999998,
|
|
"loss": 7.2602,
|
|
"mean_token_accuracy": 0.1007254920899868,
|
|
"num_tokens": 626644.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 7.623440217971802,
|
|
"epoch": 0.30081650193382037,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00017449999999999999,
|
|
"loss": 7.1639,
|
|
"mean_token_accuracy": 0.1080157920718193,
|
|
"num_tokens": 635110.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 7.711002826690674,
|
|
"epoch": 0.30511388053287497,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.000177,
|
|
"loss": 7.3139,
|
|
"mean_token_accuracy": 0.10216462090611458,
|
|
"num_tokens": 644746.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 7.708708238601685,
|
|
"epoch": 0.3094112591319295,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0001795,
|
|
"loss": 7.2216,
|
|
"mean_token_accuracy": 0.1021303728222847,
|
|
"num_tokens": 654281.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 7.534019136428833,
|
|
"epoch": 0.3137086377309841,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000182,
|
|
"loss": 7.2333,
|
|
"mean_token_accuracy": 0.10576817691326142,
|
|
"num_tokens": 663174.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 7.660452365875244,
|
|
"epoch": 0.31800601633003867,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0001845,
|
|
"loss": 7.1525,
|
|
"mean_token_accuracy": 0.10541519671678543,
|
|
"num_tokens": 672178.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 7.651990938186645,
|
|
"epoch": 0.32230339492909327,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000187,
|
|
"loss": 7.1748,
|
|
"mean_token_accuracy": 0.10421534106135369,
|
|
"num_tokens": 681323.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 7.537337684631348,
|
|
"epoch": 0.3266007735281478,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0001895,
|
|
"loss": 7.1001,
|
|
"mean_token_accuracy": 0.11140918657183647,
|
|
"num_tokens": 690461.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 7.596573305130005,
|
|
"epoch": 0.3308981521272024,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000192,
|
|
"loss": 7.1461,
|
|
"mean_token_accuracy": 0.10594902262091636,
|
|
"num_tokens": 699199.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 7.566946506500244,
|
|
"epoch": 0.33519553072625696,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0001945,
|
|
"loss": 7.109,
|
|
"mean_token_accuracy": 0.11522968709468842,
|
|
"num_tokens": 707949.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 7.66830849647522,
|
|
"epoch": 0.33949290932531156,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00019700000000000002,
|
|
"loss": 7.1843,
|
|
"mean_token_accuracy": 0.10416831225156784,
|
|
"num_tokens": 715752.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 7.619978666305542,
|
|
"epoch": 0.3437902879243661,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00019950000000000002,
|
|
"loss": 7.1119,
|
|
"mean_token_accuracy": 0.11198346018791198,
|
|
"num_tokens": 724416.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 7.594716548919678,
|
|
"epoch": 0.3480876665234207,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000202,
|
|
"loss": 7.1774,
|
|
"mean_token_accuracy": 0.10296614542603492,
|
|
"num_tokens": 733116.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 7.614369249343872,
|
|
"epoch": 0.3523850451224753,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00020449999999999998,
|
|
"loss": 7.1639,
|
|
"mean_token_accuracy": 0.10737873241305351,
|
|
"num_tokens": 742093.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 7.532227945327759,
|
|
"epoch": 0.35668242372152986,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000207,
|
|
"loss": 7.1385,
|
|
"mean_token_accuracy": 0.11264142915606498,
|
|
"num_tokens": 750402.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 7.510246276855469,
|
|
"epoch": 0.36097980232058446,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0002095,
|
|
"loss": 7.1129,
|
|
"mean_token_accuracy": 0.11108387559652329,
|
|
"num_tokens": 760961.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 7.720337963104248,
|
|
"epoch": 0.365277180919639,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000212,
|
|
"loss": 7.2042,
|
|
"mean_token_accuracy": 0.10612902790307999,
|
|
"num_tokens": 770554.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 7.437310361862183,
|
|
"epoch": 0.3695745595186936,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0002145,
|
|
"loss": 7.1596,
|
|
"mean_token_accuracy": 0.11299800872802734,
|
|
"num_tokens": 779172.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 7.663910818099976,
|
|
"epoch": 0.37387193811774816,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00021700000000000002,
|
|
"loss": 7.2239,
|
|
"mean_token_accuracy": 0.10290571823716163,
|
|
"num_tokens": 788040.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 7.589281415939331,
|
|
"epoch": 0.37816931671680276,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0002195,
|
|
"loss": 7.1461,
|
|
"mean_token_accuracy": 0.10722599253058433,
|
|
"num_tokens": 796786.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 7.543337059020996,
|
|
"epoch": 0.3824666953158573,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000222,
|
|
"loss": 7.1192,
|
|
"mean_token_accuracy": 0.10885161831974983,
|
|
"num_tokens": 805520.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 7.486078453063965,
|
|
"epoch": 0.3867640739149119,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0002245,
|
|
"loss": 7.074,
|
|
"mean_token_accuracy": 0.10658745989203453,
|
|
"num_tokens": 814939.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 7.534557342529297,
|
|
"epoch": 0.39106145251396646,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00022700000000000002,
|
|
"loss": 7.0766,
|
|
"mean_token_accuracy": 0.11227057129144669,
|
|
"num_tokens": 823862.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 7.5476549625396725,
|
|
"epoch": 0.39535883111302106,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00022950000000000002,
|
|
"loss": 7.1124,
|
|
"mean_token_accuracy": 0.10576009079813957,
|
|
"num_tokens": 832820.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 7.601094675064087,
|
|
"epoch": 0.39965620971207566,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00023200000000000003,
|
|
"loss": 7.0697,
|
|
"mean_token_accuracy": 0.11121490225195885,
|
|
"num_tokens": 841538.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 7.544060945510864,
|
|
"epoch": 0.4039535883111302,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.00023449999999999998,
|
|
"loss": 7.2069,
|
|
"mean_token_accuracy": 0.10181558132171631,
|
|
"num_tokens": 851123.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 7.549469089508056,
|
|
"epoch": 0.4082509669101848,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000237,
|
|
"loss": 7.1633,
|
|
"mean_token_accuracy": 0.11091246008872986,
|
|
"num_tokens": 860357.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 7.547894096374511,
|
|
"epoch": 0.41254834550923936,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0002395,
|
|
"loss": 7.0874,
|
|
"mean_token_accuracy": 0.10722309574484826,
|
|
"num_tokens": 869980.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 7.507503604888916,
|
|
"epoch": 0.41684572410829396,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000242,
|
|
"loss": 7.0572,
|
|
"mean_token_accuracy": 0.11242355704307556,
|
|
"num_tokens": 878250.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 7.5191121101379395,
|
|
"epoch": 0.4211431027073485,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0002445,
|
|
"loss": 7.1411,
|
|
"mean_token_accuracy": 0.11158529818058013,
|
|
"num_tokens": 887624.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 7.454204320907593,
|
|
"epoch": 0.4254404813064031,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000247,
|
|
"loss": 7.1159,
|
|
"mean_token_accuracy": 0.11260272860527039,
|
|
"num_tokens": 897120.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 7.495032835006714,
|
|
"epoch": 0.42973785990545765,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0002495,
|
|
"loss": 7.0795,
|
|
"mean_token_accuracy": 0.11134620234370232,
|
|
"num_tokens": 906215.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.42973785990545765,
|
|
"eval_entropy": 7.203803374960616,
|
|
"eval_loss": 7.096514701843262,
|
|
"eval_mean_token_accuracy": 0.11462040213649874,
|
|
"eval_num_tokens": 906215.0,
|
|
"eval_runtime": 2.0645,
|
|
"eval_samples_per_second": 1719.022,
|
|
"eval_steps_per_second": 215.059,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 7.447824621200562,
|
|
"epoch": 0.43403523850451226,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000252,
|
|
"loss": 7.0811,
|
|
"mean_token_accuracy": 0.1122453585267067,
|
|
"num_tokens": 915181.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 7.498021125793457,
|
|
"epoch": 0.4383326171035668,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0002545,
|
|
"loss": 7.1044,
|
|
"mean_token_accuracy": 0.10958386138081551,
|
|
"num_tokens": 924377.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 7.607626008987427,
|
|
"epoch": 0.4426299957026214,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000257,
|
|
"loss": 7.1944,
|
|
"mean_token_accuracy": 0.10655399709939957,
|
|
"num_tokens": 933114.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"entropy": 7.6139122486114506,
|
|
"epoch": 0.44692737430167595,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0002595,
|
|
"loss": 7.1453,
|
|
"mean_token_accuracy": 0.11119715198874473,
|
|
"num_tokens": 943306.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 7.436026573181152,
|
|
"epoch": 0.45122475290073055,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000262,
|
|
"loss": 7.0354,
|
|
"mean_token_accuracy": 0.11904665902256965,
|
|
"num_tokens": 951515.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 7.494698238372803,
|
|
"epoch": 0.45552213149978515,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00026450000000000003,
|
|
"loss": 7.1519,
|
|
"mean_token_accuracy": 0.10504961535334587,
|
|
"num_tokens": 962686.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 7.572213172912598,
|
|
"epoch": 0.4598195100988397,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00026700000000000004,
|
|
"loss": 7.1449,
|
|
"mean_token_accuracy": 0.11348244249820709,
|
|
"num_tokens": 972136.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"entropy": 7.405817127227783,
|
|
"epoch": 0.4641168886978943,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00026950000000000005,
|
|
"loss": 7.0518,
|
|
"mean_token_accuracy": 0.1100372053682804,
|
|
"num_tokens": 981301.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 7.484500360488892,
|
|
"epoch": 0.46841426729694885,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00027200000000000005,
|
|
"loss": 7.0823,
|
|
"mean_token_accuracy": 0.1120329774916172,
|
|
"num_tokens": 990360.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"entropy": 7.573296546936035,
|
|
"epoch": 0.47271164589600345,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0002745,
|
|
"loss": 7.1293,
|
|
"mean_token_accuracy": 0.10760239511728287,
|
|
"num_tokens": 999415.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 7.419287919998169,
|
|
"epoch": 0.477009024495058,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000277,
|
|
"loss": 7.057,
|
|
"mean_token_accuracy": 0.10999582111835479,
|
|
"num_tokens": 1008762.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"entropy": 7.44342451095581,
|
|
"epoch": 0.4813064030941126,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0002795,
|
|
"loss": 7.0505,
|
|
"mean_token_accuracy": 0.11702658385038375,
|
|
"num_tokens": 1017704.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 7.457871007919311,
|
|
"epoch": 0.48560378169316715,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00028199999999999997,
|
|
"loss": 7.018,
|
|
"mean_token_accuracy": 0.11318592131137847,
|
|
"num_tokens": 1026251.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"entropy": 7.356105470657349,
|
|
"epoch": 0.48990116029222175,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0002845,
|
|
"loss": 7.0083,
|
|
"mean_token_accuracy": 0.11355392187833786,
|
|
"num_tokens": 1036191.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 7.5119133472442625,
|
|
"epoch": 0.4941985388912763,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000287,
|
|
"loss": 7.0501,
|
|
"mean_token_accuracy": 0.11168754398822785,
|
|
"num_tokens": 1044936.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"entropy": 7.406773805618286,
|
|
"epoch": 0.4984959174903309,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0002895,
|
|
"loss": 7.0476,
|
|
"mean_token_accuracy": 0.1135815680027008,
|
|
"num_tokens": 1053683.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 7.3828895568847654,
|
|
"epoch": 0.5027932960893855,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000292,
|
|
"loss": 7.0283,
|
|
"mean_token_accuracy": 0.11782724559307098,
|
|
"num_tokens": 1062932.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"entropy": 7.4789910316467285,
|
|
"epoch": 0.50709067468844,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0002945,
|
|
"loss": 7.0524,
|
|
"mean_token_accuracy": 0.11150057762861251,
|
|
"num_tokens": 1072313.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 7.458136653900146,
|
|
"epoch": 0.5113880532874946,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000297,
|
|
"loss": 7.033,
|
|
"mean_token_accuracy": 0.10738502442836761,
|
|
"num_tokens": 1081675.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"entropy": 7.437460470199585,
|
|
"epoch": 0.5156854318865493,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0002995,
|
|
"loss": 7.0392,
|
|
"mean_token_accuracy": 0.11078862249851226,
|
|
"num_tokens": 1091541.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 7.43347053527832,
|
|
"epoch": 0.5199828104856038,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000302,
|
|
"loss": 7.0467,
|
|
"mean_token_accuracy": 0.11545747444033623,
|
|
"num_tokens": 1100724.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"entropy": 7.34070782661438,
|
|
"epoch": 0.5242801890846583,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0003045,
|
|
"loss": 7.0062,
|
|
"mean_token_accuracy": 0.11681902781128883,
|
|
"num_tokens": 1108869.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 7.513333511352539,
|
|
"epoch": 0.5285775676837129,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000307,
|
|
"loss": 7.0303,
|
|
"mean_token_accuracy": 0.11391275599598885,
|
|
"num_tokens": 1117314.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"entropy": 7.237616014480591,
|
|
"epoch": 0.5328749462827675,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0003095,
|
|
"loss": 6.969,
|
|
"mean_token_accuracy": 0.11866867989301681,
|
|
"num_tokens": 1126786.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 7.403380393981934,
|
|
"epoch": 0.5371723248818221,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000312,
|
|
"loss": 6.983,
|
|
"mean_token_accuracy": 0.11322688534855843,
|
|
"num_tokens": 1136013.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"entropy": 7.355997228622437,
|
|
"epoch": 0.5414697034808766,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0003145,
|
|
"loss": 7.0163,
|
|
"mean_token_accuracy": 0.1159099243581295,
|
|
"num_tokens": 1144970.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 7.416441440582275,
|
|
"epoch": 0.5457670820799312,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000317,
|
|
"loss": 6.9784,
|
|
"mean_token_accuracy": 0.12343248203396798,
|
|
"num_tokens": 1153810.0,
|
|
"step": 635
|
|
},
|
|
{
|
|
"entropy": 7.320913982391358,
|
|
"epoch": 0.5500644606789858,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0003195,
|
|
"loss": 6.96,
|
|
"mean_token_accuracy": 0.11895549520850182,
|
|
"num_tokens": 1162498.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 7.383200359344483,
|
|
"epoch": 0.5543618392780404,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000322,
|
|
"loss": 7.0441,
|
|
"mean_token_accuracy": 0.11171148270368576,
|
|
"num_tokens": 1172091.0,
|
|
"step": 645
|
|
},
|
|
{
|
|
"entropy": 7.465569925308228,
|
|
"epoch": 0.5586592178770949,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00032450000000000003,
|
|
"loss": 7.0379,
|
|
"mean_token_accuracy": 0.1126454509794712,
|
|
"num_tokens": 1181400.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 7.29718279838562,
|
|
"epoch": 0.5629565964761496,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00032700000000000003,
|
|
"loss": 7.0066,
|
|
"mean_token_accuracy": 0.11692977026104927,
|
|
"num_tokens": 1189780.0,
|
|
"step": 655
|
|
},
|
|
{
|
|
"entropy": 7.376112461090088,
|
|
"epoch": 0.5672539750752041,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00032950000000000004,
|
|
"loss": 6.9708,
|
|
"mean_token_accuracy": 0.11179102137684822,
|
|
"num_tokens": 1198671.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 7.406812715530395,
|
|
"epoch": 0.5715513536742587,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00033200000000000005,
|
|
"loss": 6.9887,
|
|
"mean_token_accuracy": 0.11439693570137024,
|
|
"num_tokens": 1207173.0,
|
|
"step": 665
|
|
},
|
|
{
|
|
"entropy": 7.267558336257935,
|
|
"epoch": 0.5758487322733132,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00033450000000000005,
|
|
"loss": 6.9252,
|
|
"mean_token_accuracy": 0.11824023947119713,
|
|
"num_tokens": 1216387.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 7.466721105575561,
|
|
"epoch": 0.5801461108723679,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000337,
|
|
"loss": 6.9093,
|
|
"mean_token_accuracy": 0.11586858034133911,
|
|
"num_tokens": 1224461.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"entropy": 7.260802936553955,
|
|
"epoch": 0.5844434894714224,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0003395,
|
|
"loss": 6.9855,
|
|
"mean_token_accuracy": 0.1176436722278595,
|
|
"num_tokens": 1233774.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 7.267514610290528,
|
|
"epoch": 0.588740868070477,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000342,
|
|
"loss": 6.9319,
|
|
"mean_token_accuracy": 0.12313097864389419,
|
|
"num_tokens": 1242812.0,
|
|
"step": 685
|
|
},
|
|
{
|
|
"entropy": 7.451924133300781,
|
|
"epoch": 0.5930382466695315,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.00034449999999999997,
|
|
"loss": 7.0445,
|
|
"mean_token_accuracy": 0.1125735655426979,
|
|
"num_tokens": 1252872.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 7.1216278076171875,
|
|
"epoch": 0.5973356252685862,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000347,
|
|
"loss": 6.8314,
|
|
"mean_token_accuracy": 0.1210754469037056,
|
|
"num_tokens": 1260852.0,
|
|
"step": 695
|
|
},
|
|
{
|
|
"entropy": 7.292500305175781,
|
|
"epoch": 0.6016330038676407,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0003495,
|
|
"loss": 6.9419,
|
|
"mean_token_accuracy": 0.1167706459760666,
|
|
"num_tokens": 1268925.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 7.384844732284546,
|
|
"epoch": 0.6059303824666953,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000352,
|
|
"loss": 6.9849,
|
|
"mean_token_accuracy": 0.11300796419382095,
|
|
"num_tokens": 1278994.0,
|
|
"step": 705
|
|
},
|
|
{
|
|
"entropy": 7.286926889419556,
|
|
"epoch": 0.6102277610657499,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0003545,
|
|
"loss": 6.9847,
|
|
"mean_token_accuracy": 0.11259545534849166,
|
|
"num_tokens": 1287698.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 7.337662601470948,
|
|
"epoch": 0.6145251396648045,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000357,
|
|
"loss": 6.9117,
|
|
"mean_token_accuracy": 0.12028303518891334,
|
|
"num_tokens": 1297475.0,
|
|
"step": 715
|
|
},
|
|
{
|
|
"entropy": 7.265739297866821,
|
|
"epoch": 0.618822518263859,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0003595,
|
|
"loss": 6.9558,
|
|
"mean_token_accuracy": 0.11790136769413948,
|
|
"num_tokens": 1306836.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 7.3774675846099855,
|
|
"epoch": 0.6231198968629136,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000362,
|
|
"loss": 6.9932,
|
|
"mean_token_accuracy": 0.11299360319972038,
|
|
"num_tokens": 1315872.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"entropy": 7.3129335880279545,
|
|
"epoch": 0.6274172754619682,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0003645,
|
|
"loss": 6.9353,
|
|
"mean_token_accuracy": 0.12453719973564148,
|
|
"num_tokens": 1324624.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 7.300215101242065,
|
|
"epoch": 0.6317146540610228,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000367,
|
|
"loss": 6.9246,
|
|
"mean_token_accuracy": 0.12120431885123253,
|
|
"num_tokens": 1333058.0,
|
|
"step": 735
|
|
},
|
|
{
|
|
"entropy": 7.065497016906738,
|
|
"epoch": 0.6360120326600773,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003695,
|
|
"loss": 6.8904,
|
|
"mean_token_accuracy": 0.11625659838318825,
|
|
"num_tokens": 1342376.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 7.412401533126831,
|
|
"epoch": 0.6403094112591319,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000372,
|
|
"loss": 6.9293,
|
|
"mean_token_accuracy": 0.11268759667873382,
|
|
"num_tokens": 1351386.0,
|
|
"step": 745
|
|
},
|
|
{
|
|
"entropy": 7.194233036041259,
|
|
"epoch": 0.6446067898581865,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0003745,
|
|
"loss": 6.8338,
|
|
"mean_token_accuracy": 0.12849506586790085,
|
|
"num_tokens": 1358958.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 7.3347986221313475,
|
|
"epoch": 0.6489041684572411,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000377,
|
|
"loss": 6.988,
|
|
"mean_token_accuracy": 0.11507417485117913,
|
|
"num_tokens": 1368599.0,
|
|
"step": 755
|
|
},
|
|
{
|
|
"entropy": 7.380126667022705,
|
|
"epoch": 0.6532015470562956,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.0003795,
|
|
"loss": 7.0127,
|
|
"mean_token_accuracy": 0.111283528059721,
|
|
"num_tokens": 1378529.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"entropy": 7.157611989974976,
|
|
"epoch": 0.6574989256553503,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000382,
|
|
"loss": 6.8052,
|
|
"mean_token_accuracy": 0.1265752285718918,
|
|
"num_tokens": 1386993.0,
|
|
"step": 765
|
|
},
|
|
{
|
|
"entropy": 7.21686282157898,
|
|
"epoch": 0.6617963042544048,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0003845,
|
|
"loss": 6.8936,
|
|
"mean_token_accuracy": 0.12180712148547172,
|
|
"num_tokens": 1395790.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"entropy": 7.166302919387817,
|
|
"epoch": 0.6660936828534594,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00038700000000000003,
|
|
"loss": 6.9063,
|
|
"mean_token_accuracy": 0.11845313757658005,
|
|
"num_tokens": 1405587.0,
|
|
"step": 775
|
|
},
|
|
{
|
|
"entropy": 7.20961365699768,
|
|
"epoch": 0.6703910614525139,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.00038950000000000003,
|
|
"loss": 6.8702,
|
|
"mean_token_accuracy": 0.12274195328354835,
|
|
"num_tokens": 1414478.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"entropy": 7.319825458526611,
|
|
"epoch": 0.6746884400515686,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00039200000000000004,
|
|
"loss": 6.9317,
|
|
"mean_token_accuracy": 0.12083822339773179,
|
|
"num_tokens": 1423791.0,
|
|
"step": 785
|
|
},
|
|
{
|
|
"entropy": 7.313541460037231,
|
|
"epoch": 0.6789858186506231,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00039450000000000005,
|
|
"loss": 6.975,
|
|
"mean_token_accuracy": 0.11185284182429314,
|
|
"num_tokens": 1432955.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"entropy": 7.242367315292358,
|
|
"epoch": 0.6832831972496777,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.00039700000000000005,
|
|
"loss": 6.9394,
|
|
"mean_token_accuracy": 0.11529579535126686,
|
|
"num_tokens": 1441907.0,
|
|
"step": 795
|
|
},
|
|
{
|
|
"entropy": 7.173644304275513,
|
|
"epoch": 0.6875805758487322,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0003995,
|
|
"loss": 6.8059,
|
|
"mean_token_accuracy": 0.12198502644896507,
|
|
"num_tokens": 1451062.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 7.2840491771698,
|
|
"epoch": 0.6918779544477869,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000402,
|
|
"loss": 6.8894,
|
|
"mean_token_accuracy": 0.11644295528531075,
|
|
"num_tokens": 1460132.0,
|
|
"step": 805
|
|
},
|
|
{
|
|
"entropy": 7.085446500778199,
|
|
"epoch": 0.6961753330468414,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004045,
|
|
"loss": 6.7896,
|
|
"mean_token_accuracy": 0.12437586709856988,
|
|
"num_tokens": 1469582.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"entropy": 7.180881690979004,
|
|
"epoch": 0.700472711645896,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00040699999999999997,
|
|
"loss": 6.8844,
|
|
"mean_token_accuracy": 0.11694586053490638,
|
|
"num_tokens": 1479053.0,
|
|
"step": 815
|
|
},
|
|
{
|
|
"entropy": 7.176044559478759,
|
|
"epoch": 0.7047700902449506,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004095,
|
|
"loss": 6.8874,
|
|
"mean_token_accuracy": 0.11812442615628242,
|
|
"num_tokens": 1488189.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"entropy": 7.071721315383911,
|
|
"epoch": 0.7090674688440052,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000412,
|
|
"loss": 6.7495,
|
|
"mean_token_accuracy": 0.12273769155144691,
|
|
"num_tokens": 1497324.0,
|
|
"step": 825
|
|
},
|
|
{
|
|
"entropy": 7.243275499343872,
|
|
"epoch": 0.7133648474430597,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004145,
|
|
"loss": 6.8631,
|
|
"mean_token_accuracy": 0.12297548577189446,
|
|
"num_tokens": 1506543.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"entropy": 7.1102629661560055,
|
|
"epoch": 0.7176622260421143,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000417,
|
|
"loss": 6.8571,
|
|
"mean_token_accuracy": 0.1257997862994671,
|
|
"num_tokens": 1516737.0,
|
|
"step": 835
|
|
},
|
|
{
|
|
"entropy": 7.015081739425659,
|
|
"epoch": 0.7219596046411689,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004195,
|
|
"loss": 6.7311,
|
|
"mean_token_accuracy": 0.12102818563580513,
|
|
"num_tokens": 1525561.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"entropy": 7.17170901298523,
|
|
"epoch": 0.7262569832402235,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000422,
|
|
"loss": 6.757,
|
|
"mean_token_accuracy": 0.12571127861738204,
|
|
"num_tokens": 1533323.0,
|
|
"step": 845
|
|
},
|
|
{
|
|
"entropy": 7.173940944671631,
|
|
"epoch": 0.730554361839278,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004245,
|
|
"loss": 6.821,
|
|
"mean_token_accuracy": 0.12750849053263663,
|
|
"num_tokens": 1542632.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 7.148316097259522,
|
|
"epoch": 0.7348517404383326,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000427,
|
|
"loss": 6.7649,
|
|
"mean_token_accuracy": 0.12507490813732147,
|
|
"num_tokens": 1551236.0,
|
|
"step": 855
|
|
},
|
|
{
|
|
"entropy": 6.981910467147827,
|
|
"epoch": 0.7391491190373872,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004295,
|
|
"loss": 6.7641,
|
|
"mean_token_accuracy": 0.12514904662966728,
|
|
"num_tokens": 1559674.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"entropy": 7.186282157897949,
|
|
"epoch": 0.7434464976364418,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000432,
|
|
"loss": 6.8498,
|
|
"mean_token_accuracy": 0.1250532478094101,
|
|
"num_tokens": 1569481.0,
|
|
"step": 865
|
|
},
|
|
{
|
|
"entropy": 7.118600702285766,
|
|
"epoch": 0.7477438762354963,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004345,
|
|
"loss": 6.8888,
|
|
"mean_token_accuracy": 0.1209896370768547,
|
|
"num_tokens": 1578488.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"entropy": 7.105226039886475,
|
|
"epoch": 0.752041254834551,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000437,
|
|
"loss": 6.7736,
|
|
"mean_token_accuracy": 0.12527675032615662,
|
|
"num_tokens": 1586675.0,
|
|
"step": 875
|
|
},
|
|
{
|
|
"entropy": 7.185068035125733,
|
|
"epoch": 0.7563386334336055,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004395,
|
|
"loss": 6.8782,
|
|
"mean_token_accuracy": 0.1180253192782402,
|
|
"num_tokens": 1595411.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"entropy": 7.179415893554688,
|
|
"epoch": 0.7606360120326601,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000442,
|
|
"loss": 6.8619,
|
|
"mean_token_accuracy": 0.12292847484350204,
|
|
"num_tokens": 1604046.0,
|
|
"step": 885
|
|
},
|
|
{
|
|
"entropy": 7.130577564239502,
|
|
"epoch": 0.7649333906317146,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004445,
|
|
"loss": 6.8566,
|
|
"mean_token_accuracy": 0.11715829819440841,
|
|
"num_tokens": 1613759.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"entropy": 7.111226511001587,
|
|
"epoch": 0.7692307692307693,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000447,
|
|
"loss": 6.8191,
|
|
"mean_token_accuracy": 0.1252148814499378,
|
|
"num_tokens": 1623323.0,
|
|
"step": 895
|
|
},
|
|
{
|
|
"entropy": 7.097943353652954,
|
|
"epoch": 0.7735281478298238,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00044950000000000003,
|
|
"loss": 6.7922,
|
|
"mean_token_accuracy": 0.11943844705820084,
|
|
"num_tokens": 1631727.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"entropy": 7.073408317565918,
|
|
"epoch": 0.7778255264288784,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00045200000000000004,
|
|
"loss": 6.7454,
|
|
"mean_token_accuracy": 0.12582483813166617,
|
|
"num_tokens": 1639544.0,
|
|
"step": 905
|
|
},
|
|
{
|
|
"entropy": 7.1905022144317625,
|
|
"epoch": 0.7821229050279329,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00045450000000000004,
|
|
"loss": 6.8716,
|
|
"mean_token_accuracy": 0.11673429310321808,
|
|
"num_tokens": 1648931.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"entropy": 7.032827425003052,
|
|
"epoch": 0.7864202836269876,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00045700000000000005,
|
|
"loss": 6.7325,
|
|
"mean_token_accuracy": 0.12737771049141883,
|
|
"num_tokens": 1657688.0,
|
|
"step": 915
|
|
},
|
|
{
|
|
"entropy": 7.160619735717773,
|
|
"epoch": 0.7907176622260421,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00045950000000000006,
|
|
"loss": 6.8191,
|
|
"mean_token_accuracy": 0.11969996094703675,
|
|
"num_tokens": 1666879.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"entropy": 7.016655492782593,
|
|
"epoch": 0.7950150408250967,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000462,
|
|
"loss": 6.7912,
|
|
"mean_token_accuracy": 0.12404834032058716,
|
|
"num_tokens": 1676773.0,
|
|
"step": 925
|
|
},
|
|
{
|
|
"entropy": 7.205742454528808,
|
|
"epoch": 0.7993124194241513,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004645,
|
|
"loss": 6.8942,
|
|
"mean_token_accuracy": 0.11682869419455529,
|
|
"num_tokens": 1686144.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"entropy": 7.093483018875122,
|
|
"epoch": 0.8036097980232059,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000467,
|
|
"loss": 6.8555,
|
|
"mean_token_accuracy": 0.11735839322209358,
|
|
"num_tokens": 1695476.0,
|
|
"step": 935
|
|
},
|
|
{
|
|
"entropy": 7.090408611297607,
|
|
"epoch": 0.8079071766222604,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004695,
|
|
"loss": 6.7525,
|
|
"mean_token_accuracy": 0.12118161767721176,
|
|
"num_tokens": 1704907.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"entropy": 7.016019344329834,
|
|
"epoch": 0.812204555221315,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000472,
|
|
"loss": 6.7924,
|
|
"mean_token_accuracy": 0.12617168575525284,
|
|
"num_tokens": 1714564.0,
|
|
"step": 945
|
|
},
|
|
{
|
|
"entropy": 7.132166576385498,
|
|
"epoch": 0.8165019338203696,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004745,
|
|
"loss": 6.8135,
|
|
"mean_token_accuracy": 0.12022659555077553,
|
|
"num_tokens": 1725285.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"entropy": 7.00044469833374,
|
|
"epoch": 0.8207993124194242,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000477,
|
|
"loss": 6.8177,
|
|
"mean_token_accuracy": 0.12241263464093208,
|
|
"num_tokens": 1734331.0,
|
|
"step": 955
|
|
},
|
|
{
|
|
"entropy": 7.126689529418945,
|
|
"epoch": 0.8250966910184787,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004795,
|
|
"loss": 6.749,
|
|
"mean_token_accuracy": 0.11530287116765976,
|
|
"num_tokens": 1742340.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"entropy": 7.05500750541687,
|
|
"epoch": 0.8293940696175333,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000482,
|
|
"loss": 6.7383,
|
|
"mean_token_accuracy": 0.12545244619250298,
|
|
"num_tokens": 1751725.0,
|
|
"step": 965
|
|
},
|
|
{
|
|
"entropy": 6.894489717483521,
|
|
"epoch": 0.8336914482165879,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004845,
|
|
"loss": 6.6736,
|
|
"mean_token_accuracy": 0.12856126353144645,
|
|
"num_tokens": 1760294.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"entropy": 7.036704349517822,
|
|
"epoch": 0.8379888268156425,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000487,
|
|
"loss": 6.7265,
|
|
"mean_token_accuracy": 0.1231304183602333,
|
|
"num_tokens": 1768912.0,
|
|
"step": 975
|
|
},
|
|
{
|
|
"entropy": 7.092654848098755,
|
|
"epoch": 0.842286205414697,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004895,
|
|
"loss": 6.9187,
|
|
"mean_token_accuracy": 0.12804483920335769,
|
|
"num_tokens": 1778633.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"entropy": 7.090839195251465,
|
|
"epoch": 0.8465835840137517,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000492,
|
|
"loss": 6.7883,
|
|
"mean_token_accuracy": 0.12408955544233322,
|
|
"num_tokens": 1787275.0,
|
|
"step": 985
|
|
},
|
|
{
|
|
"entropy": 7.0695414543151855,
|
|
"epoch": 0.8508809626128062,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004945,
|
|
"loss": 6.7844,
|
|
"mean_token_accuracy": 0.12348324134945869,
|
|
"num_tokens": 1795994.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"entropy": 6.964667177200317,
|
|
"epoch": 0.8551783412118608,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.000497,
|
|
"loss": 6.7175,
|
|
"mean_token_accuracy": 0.12602235972881318,
|
|
"num_tokens": 1806379.0,
|
|
"step": 995
|
|
},
|
|
{
|
|
"entropy": 7.061655473709107,
|
|
"epoch": 0.8594757198109153,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004995,
|
|
"loss": 6.7479,
|
|
"mean_token_accuracy": 0.13024335727095604,
|
|
"num_tokens": 1816135.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.8594757198109153,
|
|
"eval_entropy": 6.75515693050247,
|
|
"eval_loss": 6.752710819244385,
|
|
"eval_mean_token_accuracy": 0.12811107195175445,
|
|
"eval_num_tokens": 1816135.0,
|
|
"eval_runtime": 2.0604,
|
|
"eval_samples_per_second": 1722.442,
|
|
"eval_steps_per_second": 215.487,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"entropy": 6.9897054672241214,
|
|
"epoch": 0.86377309840997,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004999998427807679,
|
|
"loss": 6.7314,
|
|
"mean_token_accuracy": 0.12282020673155784,
|
|
"num_tokens": 1824777.0,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"entropy": 6.925821113586426,
|
|
"epoch": 0.8680704770090245,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004999992040780138,
|
|
"loss": 6.8085,
|
|
"mean_token_accuracy": 0.1247783549129963,
|
|
"num_tokens": 1833807.0,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"entropy": 7.123036670684814,
|
|
"epoch": 0.8723678556080791,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999980740669294,
|
|
"loss": 6.754,
|
|
"mean_token_accuracy": 0.12499897480010987,
|
|
"num_tokens": 1843375.0,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"entropy": 7.027141857147217,
|
|
"epoch": 0.8766652342071336,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004999964527499823,
|
|
"loss": 6.8155,
|
|
"mean_token_accuracy": 0.12067028507590294,
|
|
"num_tokens": 1853036.0,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"entropy": 7.018357038497925,
|
|
"epoch": 0.8809626128061883,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999943401307127,
|
|
"loss": 6.7605,
|
|
"mean_token_accuracy": 0.12497071847319603,
|
|
"num_tokens": 1862041.0,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"entropy": 6.984006929397583,
|
|
"epoch": 0.8852599914052428,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004999917362137337,
|
|
"loss": 6.6885,
|
|
"mean_token_accuracy": 0.12735832259058952,
|
|
"num_tokens": 1870707.0,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"entropy": 6.964999151229859,
|
|
"epoch": 0.8895573700042974,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999886410047312,
|
|
"loss": 6.6849,
|
|
"mean_token_accuracy": 0.12543184384703637,
|
|
"num_tokens": 1879787.0,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"entropy": 7.046022748947143,
|
|
"epoch": 0.8938547486033519,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999850545104638,
|
|
"loss": 6.7336,
|
|
"mean_token_accuracy": 0.12585699930787086,
|
|
"num_tokens": 1889413.0,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"entropy": 6.9450146675109865,
|
|
"epoch": 0.8981521272024066,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004999809767387633,
|
|
"loss": 6.7291,
|
|
"mean_token_accuracy": 0.12462790235877037,
|
|
"num_tokens": 1898283.0,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"entropy": 6.982704973220825,
|
|
"epoch": 0.9024495058014611,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999764076985337,
|
|
"loss": 6.7474,
|
|
"mean_token_accuracy": 0.12953734770417213,
|
|
"num_tokens": 1907175.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"entropy": 6.947793340682983,
|
|
"epoch": 0.9067468844005157,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999713473997519,
|
|
"loss": 6.7933,
|
|
"mean_token_accuracy": 0.12337937280535698,
|
|
"num_tokens": 1918223.0,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"entropy": 7.053569555282593,
|
|
"epoch": 0.9110442629995703,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999657958534677,
|
|
"loss": 6.7435,
|
|
"mean_token_accuracy": 0.11936211958527565,
|
|
"num_tokens": 1928801.0,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"entropy": 6.874362564086914,
|
|
"epoch": 0.9153416415986249,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999597530718034,
|
|
"loss": 6.7076,
|
|
"mean_token_accuracy": 0.12535862401127815,
|
|
"num_tokens": 1937406.0,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"entropy": 6.924251508712769,
|
|
"epoch": 0.9196390201976794,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000499953219067954,
|
|
"loss": 6.7025,
|
|
"mean_token_accuracy": 0.12463184967637062,
|
|
"num_tokens": 1947184.0,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"entropy": 7.056308698654175,
|
|
"epoch": 0.923936398796734,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004999461938561873,
|
|
"loss": 6.7241,
|
|
"mean_token_accuracy": 0.12476856112480164,
|
|
"num_tokens": 1956293.0,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"entropy": 6.90220274925232,
|
|
"epoch": 0.9282337773957886,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999386774518432,
|
|
"loss": 6.6968,
|
|
"mean_token_accuracy": 0.12625648751854895,
|
|
"num_tokens": 1964791.0,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"entropy": 6.965981435775757,
|
|
"epoch": 0.9325311559948432,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999306698713349,
|
|
"loss": 6.616,
|
|
"mean_token_accuracy": 0.12837354317307473,
|
|
"num_tokens": 1973754.0,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"entropy": 6.929974555969238,
|
|
"epoch": 0.9368285345938977,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999221711321477,
|
|
"loss": 6.6857,
|
|
"mean_token_accuracy": 0.12695353776216506,
|
|
"num_tokens": 1983035.0,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"entropy": 6.804391956329345,
|
|
"epoch": 0.9411259131929522,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999131812528393,
|
|
"loss": 6.7126,
|
|
"mean_token_accuracy": 0.12742481231689454,
|
|
"num_tokens": 1992584.0,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"entropy": 7.0129533290863035,
|
|
"epoch": 0.9454232917920069,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.00049990370025304,
|
|
"loss": 6.745,
|
|
"mean_token_accuracy": 0.1250165306031704,
|
|
"num_tokens": 2001876.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"entropy": 6.9361108303070065,
|
|
"epoch": 0.9497206703910615,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004998937281534526,
|
|
"loss": 6.6354,
|
|
"mean_token_accuracy": 0.1352070689201355,
|
|
"num_tokens": 2011067.0,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"entropy": 7.00281867980957,
|
|
"epoch": 0.954018048990116,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004998832649758521,
|
|
"loss": 6.7191,
|
|
"mean_token_accuracy": 0.12910578772425652,
|
|
"num_tokens": 2020763.0,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"entropy": 6.846075534820557,
|
|
"epoch": 0.9583154275891707,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004998723107430862,
|
|
"loss": 6.702,
|
|
"mean_token_accuracy": 0.12597106099128724,
|
|
"num_tokens": 2029534.0,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"entropy": 6.979312801361084,
|
|
"epoch": 0.9626128061882252,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004998608654790741,
|
|
"loss": 6.6576,
|
|
"mean_token_accuracy": 0.12685178518295287,
|
|
"num_tokens": 2039143.0,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"entropy": 6.840395832061768,
|
|
"epoch": 0.9669101847872797,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000499848929208808,
|
|
"loss": 6.619,
|
|
"mean_token_accuracy": 0.13090287074446677,
|
|
"num_tokens": 2048253.0,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"entropy": 6.833210182189942,
|
|
"epoch": 0.9712075633863343,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004998365019583519,
|
|
"loss": 6.6747,
|
|
"mean_token_accuracy": 0.13630941957235337,
|
|
"num_tokens": 2057234.0,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"entropy": 7.008919525146484,
|
|
"epoch": 0.975504941985389,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004998235837548417,
|
|
"loss": 6.7058,
|
|
"mean_token_accuracy": 0.12927891165018082,
|
|
"num_tokens": 2065431.0,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"entropy": 6.887974071502685,
|
|
"epoch": 0.9798023205844435,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000499810174626486,
|
|
"loss": 6.7146,
|
|
"mean_token_accuracy": 0.1267981804907322,
|
|
"num_tokens": 2074723.0,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"entropy": 6.909135150909424,
|
|
"epoch": 0.984099699183498,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004997962746025646,
|
|
"loss": 6.5835,
|
|
"mean_token_accuracy": 0.13582983165979384,
|
|
"num_tokens": 2084509.0,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"entropy": 6.8790112972259525,
|
|
"epoch": 0.9883970777825526,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004997818837134298,
|
|
"loss": 6.7192,
|
|
"mean_token_accuracy": 0.13046733066439628,
|
|
"num_tokens": 2093110.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"entropy": 6.820547676086425,
|
|
"epoch": 0.9926944563816072,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004997670019905057,
|
|
"loss": 6.5939,
|
|
"mean_token_accuracy": 0.12773325443267822,
|
|
"num_tokens": 2102355.0,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"entropy": 6.849571800231933,
|
|
"epoch": 0.9969918349806618,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004997516294662876,
|
|
"loss": 6.6207,
|
|
"mean_token_accuracy": 0.1278907351195812,
|
|
"num_tokens": 2110418.0,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"entropy": 6.932281441158718,
|
|
"epoch": 1.0008594757198108,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004997357661743433,
|
|
"loss": 6.6076,
|
|
"mean_token_accuracy": 0.13429299659199184,
|
|
"num_tokens": 2117866.0,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"entropy": 6.776707983016967,
|
|
"epoch": 1.0051568543188656,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004997194121493118,
|
|
"loss": 6.4353,
|
|
"mean_token_accuracy": 0.14019777849316598,
|
|
"num_tokens": 2126082.0,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"entropy": 6.887734413146973,
|
|
"epoch": 1.0094542329179201,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004997025674269037,
|
|
"loss": 6.4211,
|
|
"mean_token_accuracy": 0.13955733701586723,
|
|
"num_tokens": 2134042.0,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"entropy": 6.774314117431641,
|
|
"epoch": 1.0137516115169747,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004996852320439013,
|
|
"loss": 6.4895,
|
|
"mean_token_accuracy": 0.13937605321407318,
|
|
"num_tokens": 2142570.0,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"entropy": 6.8031017780303955,
|
|
"epoch": 1.0180489901160292,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004996674060381578,
|
|
"loss": 6.4187,
|
|
"mean_token_accuracy": 0.13786159604787826,
|
|
"num_tokens": 2151310.0,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"entropy": 6.884524583816528,
|
|
"epoch": 1.0223463687150838,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004996490894485985,
|
|
"loss": 6.4993,
|
|
"mean_token_accuracy": 0.1331610009074211,
|
|
"num_tokens": 2160662.0,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"entropy": 6.801689147949219,
|
|
"epoch": 1.0266437473141383,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004996302823152193,
|
|
"loss": 6.445,
|
|
"mean_token_accuracy": 0.13591438457369803,
|
|
"num_tokens": 2170067.0,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"entropy": 6.76284008026123,
|
|
"epoch": 1.0309411259131929,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004996109846790873,
|
|
"loss": 6.4084,
|
|
"mean_token_accuracy": 0.14033972024917601,
|
|
"num_tokens": 2178850.0,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"entropy": 6.71863865852356,
|
|
"epoch": 1.0352385045122476,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004995911965823412,
|
|
"loss": 6.4263,
|
|
"mean_token_accuracy": 0.1453915849328041,
|
|
"num_tokens": 2188307.0,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"entropy": 6.847736549377442,
|
|
"epoch": 1.0395358831113022,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004995709180681899,
|
|
"loss": 6.4144,
|
|
"mean_token_accuracy": 0.1416982263326645,
|
|
"num_tokens": 2197026.0,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"entropy": 6.729686546325683,
|
|
"epoch": 1.0438332617103567,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000499550149180914,
|
|
"loss": 6.4003,
|
|
"mean_token_accuracy": 0.13990466818213462,
|
|
"num_tokens": 2205537.0,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"entropy": 6.780020618438721,
|
|
"epoch": 1.0481306403094113,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004995288899658641,
|
|
"loss": 6.4298,
|
|
"mean_token_accuracy": 0.1448238343000412,
|
|
"num_tokens": 2214508.0,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"entropy": 6.842759847640991,
|
|
"epoch": 1.0524280189084658,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004995071404694619,
|
|
"loss": 6.5391,
|
|
"mean_token_accuracy": 0.1354886084794998,
|
|
"num_tokens": 2223084.0,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"entropy": 6.7924669742584225,
|
|
"epoch": 1.0567253975075204,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004994849007391996,
|
|
"loss": 6.4679,
|
|
"mean_token_accuracy": 0.13138427063822747,
|
|
"num_tokens": 2231406.0,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"entropy": 6.731750345230102,
|
|
"epoch": 1.061022776106575,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004994621708236401,
|
|
"loss": 6.3805,
|
|
"mean_token_accuracy": 0.14119497835636138,
|
|
"num_tokens": 2239867.0,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"entropy": 6.745153379440308,
|
|
"epoch": 1.0653201547056295,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000499438950772416,
|
|
"loss": 6.4467,
|
|
"mean_token_accuracy": 0.1372622825205326,
|
|
"num_tokens": 2248844.0,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"entropy": 6.710582876205445,
|
|
"epoch": 1.0696175333046842,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004994152406362311,
|
|
"loss": 6.3633,
|
|
"mean_token_accuracy": 0.14102791994810104,
|
|
"num_tokens": 2257599.0,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"entropy": 6.773756074905395,
|
|
"epoch": 1.0739149119037388,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004993910404668586,
|
|
"loss": 6.418,
|
|
"mean_token_accuracy": 0.13638516888022423,
|
|
"num_tokens": 2266510.0,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"entropy": 6.720381832122802,
|
|
"epoch": 1.0782122905027933,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000499366350317142,
|
|
"loss": 6.4145,
|
|
"mean_token_accuracy": 0.1418795846402645,
|
|
"num_tokens": 2275462.0,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"entropy": 6.712311601638794,
|
|
"epoch": 1.0825096691018479,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004993411702409948,
|
|
"loss": 6.3874,
|
|
"mean_token_accuracy": 0.1354715533554554,
|
|
"num_tokens": 2283826.0,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"entropy": 6.76007399559021,
|
|
"epoch": 1.0868070477009024,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004993155002934002,
|
|
"loss": 6.3997,
|
|
"mean_token_accuracy": 0.13856483697891236,
|
|
"num_tokens": 2292967.0,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"entropy": 6.8389280319213865,
|
|
"epoch": 1.091104426299957,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004992893405304111,
|
|
"loss": 6.5262,
|
|
"mean_token_accuracy": 0.13781826868653296,
|
|
"num_tokens": 2302336.0,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"entropy": 6.64991979598999,
|
|
"epoch": 1.0954018048990115,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00049926269100915,
|
|
"loss": 6.4293,
|
|
"mean_token_accuracy": 0.1432204395532608,
|
|
"num_tokens": 2311465.0,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"entropy": 6.792691707611084,
|
|
"epoch": 1.0996991834980663,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004992355517878087,
|
|
"loss": 6.542,
|
|
"mean_token_accuracy": 0.13071493357419967,
|
|
"num_tokens": 2320281.0,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"entropy": 6.689556837081909,
|
|
"epoch": 1.1039965620971208,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004992079229256484,
|
|
"loss": 6.4431,
|
|
"mean_token_accuracy": 0.1360026031732559,
|
|
"num_tokens": 2329755.0,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"entropy": 6.6757041931152346,
|
|
"epoch": 1.1082939406961754,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004991798044829996,
|
|
"loss": 6.3861,
|
|
"mean_token_accuracy": 0.1369478650391102,
|
|
"num_tokens": 2338807.0,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"entropy": 6.7733612060546875,
|
|
"epoch": 1.11259131929523,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004991511965212618,
|
|
"loss": 6.4719,
|
|
"mean_token_accuracy": 0.13780709579586983,
|
|
"num_tokens": 2348056.0,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"entropy": 6.688971424102784,
|
|
"epoch": 1.1168886978942845,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004991220991029032,
|
|
"loss": 6.4868,
|
|
"mean_token_accuracy": 0.13366840407252312,
|
|
"num_tokens": 2357780.0,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"entropy": 6.773650407791138,
|
|
"epoch": 1.121186076493339,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000499092512291461,
|
|
"loss": 6.4446,
|
|
"mean_token_accuracy": 0.13651487827301026,
|
|
"num_tokens": 2367060.0,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"entropy": 6.7718230247497555,
|
|
"epoch": 1.1254834550923936,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000499062436151541,
|
|
"loss": 6.441,
|
|
"mean_token_accuracy": 0.1382215812802315,
|
|
"num_tokens": 2375751.0,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"entropy": 6.800968360900879,
|
|
"epoch": 1.129780833691448,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004990318707488173,
|
|
"loss": 6.5069,
|
|
"mean_token_accuracy": 0.13017478883266448,
|
|
"num_tokens": 2385013.0,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"entropy": 6.692961692810059,
|
|
"epoch": 1.1340782122905029,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004990008161500327,
|
|
"loss": 6.3937,
|
|
"mean_token_accuracy": 0.14006393477320672,
|
|
"num_tokens": 2392935.0,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"entropy": 6.706206512451172,
|
|
"epoch": 1.1383755908895574,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000498969272422998,
|
|
"loss": 6.4188,
|
|
"mean_token_accuracy": 0.1468452200293541,
|
|
"num_tokens": 2401560.0,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"entropy": 6.711210012435913,
|
|
"epoch": 1.142672969488612,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004989372396365921,
|
|
"loss": 6.3447,
|
|
"mean_token_accuracy": 0.1455326870083809,
|
|
"num_tokens": 2410050.0,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"entropy": 6.756243276596069,
|
|
"epoch": 1.1469703480876665,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004989047178607618,
|
|
"loss": 6.4505,
|
|
"mean_token_accuracy": 0.13842038065195084,
|
|
"num_tokens": 2418980.0,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"entropy": 6.671654081344604,
|
|
"epoch": 1.151267726686721,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004988717071665215,
|
|
"loss": 6.4407,
|
|
"mean_token_accuracy": 0.13684784546494483,
|
|
"num_tokens": 2427992.0,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"entropy": 6.762688112258911,
|
|
"epoch": 1.1555651052857756,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004988382076259537,
|
|
"loss": 6.3572,
|
|
"mean_token_accuracy": 0.14135119169950486,
|
|
"num_tokens": 2436368.0,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"entropy": 6.5892657279968265,
|
|
"epoch": 1.1598624838848304,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004988042193122077,
|
|
"loss": 6.3456,
|
|
"mean_token_accuracy": 0.14492984861135483,
|
|
"num_tokens": 2445499.0,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"entropy": 6.752876138687133,
|
|
"epoch": 1.164159862483885,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004987697422995005,
|
|
"loss": 6.3818,
|
|
"mean_token_accuracy": 0.13490121066570282,
|
|
"num_tokens": 2454312.0,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"entropy": 6.647862577438355,
|
|
"epoch": 1.1684572410829395,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004987347766631161,
|
|
"loss": 6.4437,
|
|
"mean_token_accuracy": 0.1407245770096779,
|
|
"num_tokens": 2462922.0,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"entropy": 6.755164289474488,
|
|
"epoch": 1.172754619681994,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004986993224794055,
|
|
"loss": 6.4781,
|
|
"mean_token_accuracy": 0.13789629712700843,
|
|
"num_tokens": 2472195.0,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"entropy": 6.6456316947937015,
|
|
"epoch": 1.1770519982810486,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004986633798257865,
|
|
"loss": 6.3829,
|
|
"mean_token_accuracy": 0.14376115351915358,
|
|
"num_tokens": 2481021.0,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"entropy": 6.657115125656128,
|
|
"epoch": 1.181349376880103,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004986269487807434,
|
|
"loss": 6.405,
|
|
"mean_token_accuracy": 0.13883866667747496,
|
|
"num_tokens": 2490250.0,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"entropy": 6.763047981262207,
|
|
"epoch": 1.1856467554791577,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000498590029423827,
|
|
"loss": 6.4581,
|
|
"mean_token_accuracy": 0.14272229447960855,
|
|
"num_tokens": 2499122.0,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"entropy": 6.686977815628052,
|
|
"epoch": 1.1899441340782122,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004985526218356546,
|
|
"loss": 6.4227,
|
|
"mean_token_accuracy": 0.13726608753204345,
|
|
"num_tokens": 2508454.0,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"entropy": 6.699887418746949,
|
|
"epoch": 1.1942415126772667,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004985147260979093,
|
|
"loss": 6.3632,
|
|
"mean_token_accuracy": 0.1465839110314846,
|
|
"num_tokens": 2517353.0,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"entropy": 6.691904354095459,
|
|
"epoch": 1.1985388912763215,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004984763422933402,
|
|
"loss": 6.3821,
|
|
"mean_token_accuracy": 0.14337702393531798,
|
|
"num_tokens": 2526321.0,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"entropy": 6.6859358787536625,
|
|
"epoch": 1.202836269875376,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004984374705057623,
|
|
"loss": 6.4144,
|
|
"mean_token_accuracy": 0.14242582842707635,
|
|
"num_tokens": 2535924.0,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"entropy": 6.640392780303955,
|
|
"epoch": 1.2071336484744306,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004983981108200561,
|
|
"loss": 6.3922,
|
|
"mean_token_accuracy": 0.1401688925921917,
|
|
"num_tokens": 2545606.0,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"entropy": 6.649671459197998,
|
|
"epoch": 1.2114310270734852,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004983582633221672,
|
|
"loss": 6.3859,
|
|
"mean_token_accuracy": 0.1407300591468811,
|
|
"num_tokens": 2554947.0,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"entropy": 6.765527582168579,
|
|
"epoch": 1.2157284056725397,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004983179280991068,
|
|
"loss": 6.5354,
|
|
"mean_token_accuracy": 0.13627680763602257,
|
|
"num_tokens": 2564462.0,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"entropy": 6.688222122192383,
|
|
"epoch": 1.2200257842715942,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004982771052389508,
|
|
"loss": 6.3743,
|
|
"mean_token_accuracy": 0.1444454774260521,
|
|
"num_tokens": 2573124.0,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"entropy": 6.700618696212769,
|
|
"epoch": 1.224323162870649,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004982357948308401,
|
|
"loss": 6.4798,
|
|
"mean_token_accuracy": 0.13040754944086075,
|
|
"num_tokens": 2581829.0,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"entropy": 6.7136975765228275,
|
|
"epoch": 1.2286205414697036,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004981939969649799,
|
|
"loss": 6.3405,
|
|
"mean_token_accuracy": 0.1422662131488323,
|
|
"num_tokens": 2590631.0,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"entropy": 6.661464500427246,
|
|
"epoch": 1.232917920068758,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004981517117326404,
|
|
"loss": 6.4484,
|
|
"mean_token_accuracy": 0.13987314701080322,
|
|
"num_tokens": 2600684.0,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"entropy": 6.6479767799377445,
|
|
"epoch": 1.2372152986678127,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004981089392261553,
|
|
"loss": 6.3605,
|
|
"mean_token_accuracy": 0.14449947997927665,
|
|
"num_tokens": 2609667.0,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"entropy": 6.643135976791382,
|
|
"epoch": 1.2415126772668672,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000498065679538923,
|
|
"loss": 6.4317,
|
|
"mean_token_accuracy": 0.14703501164913177,
|
|
"num_tokens": 2620025.0,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"entropy": 6.672731685638428,
|
|
"epoch": 1.2458100558659218,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004980219327654049,
|
|
"loss": 6.351,
|
|
"mean_token_accuracy": 0.14008775800466539,
|
|
"num_tokens": 2629032.0,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"entropy": 6.605780506134034,
|
|
"epoch": 1.2501074344649763,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000497977699001127,
|
|
"loss": 6.3357,
|
|
"mean_token_accuracy": 0.1428795799612999,
|
|
"num_tokens": 2638303.0,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"entropy": 6.698618459701538,
|
|
"epoch": 1.2544048130640308,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004979329783426778,
|
|
"loss": 6.3527,
|
|
"mean_token_accuracy": 0.14518981352448462,
|
|
"num_tokens": 2647902.0,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"entropy": 6.619544601440429,
|
|
"epoch": 1.2587021916630854,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004978877708877094,
|
|
"loss": 6.4046,
|
|
"mean_token_accuracy": 0.1414396196603775,
|
|
"num_tokens": 2657902.0,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"entropy": 6.67303991317749,
|
|
"epoch": 1.2629995702621402,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004978420767349368,
|
|
"loss": 6.3504,
|
|
"mean_token_accuracy": 0.14340997561812402,
|
|
"num_tokens": 2667082.0,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"entropy": 6.647952270507813,
|
|
"epoch": 1.2672969488611947,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004977958959841379,
|
|
"loss": 6.4223,
|
|
"mean_token_accuracy": 0.1364084042608738,
|
|
"num_tokens": 2676855.0,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"entropy": 6.6442427158355715,
|
|
"epoch": 1.2715943274602493,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000497749228736153,
|
|
"loss": 6.3546,
|
|
"mean_token_accuracy": 0.145116026699543,
|
|
"num_tokens": 2685750.0,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"entropy": 6.597840929031372,
|
|
"epoch": 1.2758917060593038,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004977020750928845,
|
|
"loss": 6.4075,
|
|
"mean_token_accuracy": 0.14761355221271516,
|
|
"num_tokens": 2695272.0,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"entropy": 6.709882497787476,
|
|
"epoch": 1.2801890846583583,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004976544351572973,
|
|
"loss": 6.3504,
|
|
"mean_token_accuracy": 0.1418570265173912,
|
|
"num_tokens": 2704806.0,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"entropy": 6.533363771438599,
|
|
"epoch": 1.2844864632574131,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004976063090334179,
|
|
"loss": 6.4036,
|
|
"mean_token_accuracy": 0.1452034071087837,
|
|
"num_tokens": 2713521.0,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"entropy": 6.7042053699493405,
|
|
"epoch": 1.2887838418564677,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004975576968263346,
|
|
"loss": 6.3966,
|
|
"mean_token_accuracy": 0.1381194919347763,
|
|
"num_tokens": 2721848.0,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 1.2887838418564677,
|
|
"eval_entropy": 6.494678375957249,
|
|
"eval_loss": 6.482933044433594,
|
|
"eval_mean_token_accuracy": 0.14236528785513328,
|
|
"eval_num_tokens": 2721848.0,
|
|
"eval_runtime": 2.0538,
|
|
"eval_samples_per_second": 1728.039,
|
|
"eval_steps_per_second": 216.187,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"entropy": 6.592136430740356,
|
|
"epoch": 1.2930812204555222,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.000497508598642197,
|
|
"loss": 6.3613,
|
|
"mean_token_accuracy": 0.14413030222058296,
|
|
"num_tokens": 2731473.0,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"entropy": 6.610020494461059,
|
|
"epoch": 1.2973785990545768,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000497459014588216,
|
|
"loss": 6.4326,
|
|
"mean_token_accuracy": 0.141157578676939,
|
|
"num_tokens": 2739867.0,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"entropy": 6.684322929382324,
|
|
"epoch": 1.3016759776536313,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000497408944772663,
|
|
"loss": 6.3442,
|
|
"mean_token_accuracy": 0.14187844544649125,
|
|
"num_tokens": 2748903.0,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"entropy": 6.512551116943359,
|
|
"epoch": 1.3059733562526858,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004973583893048707,
|
|
"loss": 6.3389,
|
|
"mean_token_accuracy": 0.14152248129248618,
|
|
"num_tokens": 2757711.0,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"entropy": 6.74653639793396,
|
|
"epoch": 1.3102707348517404,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004973073482952321,
|
|
"loss": 6.358,
|
|
"mean_token_accuracy": 0.140853676199913,
|
|
"num_tokens": 2765633.0,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"entropy": 6.572407674789429,
|
|
"epoch": 1.314568113450795,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004972558218552004,
|
|
"loss": 6.3982,
|
|
"mean_token_accuracy": 0.14053191468119622,
|
|
"num_tokens": 2774495.0,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"entropy": 6.645643854141236,
|
|
"epoch": 1.3188654920498495,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004972038100972885,
|
|
"loss": 6.4066,
|
|
"mean_token_accuracy": 0.1426756389439106,
|
|
"num_tokens": 2782665.0,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"entropy": 6.549836540222168,
|
|
"epoch": 1.323162870648904,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004971513131350697,
|
|
"loss": 6.356,
|
|
"mean_token_accuracy": 0.13861292153596877,
|
|
"num_tokens": 2791394.0,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"entropy": 6.566079998016358,
|
|
"epoch": 1.3274602492479588,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004970983310831759,
|
|
"loss": 6.3437,
|
|
"mean_token_accuracy": 0.1422226123511791,
|
|
"num_tokens": 2800488.0,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"entropy": 6.6656012535095215,
|
|
"epoch": 1.3317576278470133,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004970448640572989,
|
|
"loss": 6.4644,
|
|
"mean_token_accuracy": 0.14133307337760925,
|
|
"num_tokens": 2810116.0,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"entropy": 6.59561824798584,
|
|
"epoch": 1.336055006446068,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004969909121741895,
|
|
"loss": 6.2592,
|
|
"mean_token_accuracy": 0.14750397205352783,
|
|
"num_tokens": 2819205.0,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"entropy": 6.559555625915527,
|
|
"epoch": 1.3403523850451224,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004969364755516569,
|
|
"loss": 6.3311,
|
|
"mean_token_accuracy": 0.14398850798606871,
|
|
"num_tokens": 2828017.0,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"entropy": 6.688138008117676,
|
|
"epoch": 1.344649763644177,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004968815543085689,
|
|
"loss": 6.3815,
|
|
"mean_token_accuracy": 0.145321074873209,
|
|
"num_tokens": 2837125.0,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"entropy": 6.569426822662353,
|
|
"epoch": 1.3489471422432318,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004968261485648516,
|
|
"loss": 6.3921,
|
|
"mean_token_accuracy": 0.14212561994791031,
|
|
"num_tokens": 2845438.0,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"entropy": 6.608628225326538,
|
|
"epoch": 1.3532445208422863,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000496770258441489,
|
|
"loss": 6.3689,
|
|
"mean_token_accuracy": 0.1471138596534729,
|
|
"num_tokens": 2854389.0,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"entropy": 6.556783771514892,
|
|
"epoch": 1.3575418994413408,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004967138840605228,
|
|
"loss": 6.3281,
|
|
"mean_token_accuracy": 0.14712274819612503,
|
|
"num_tokens": 2863654.0,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"entropy": 6.517911720275879,
|
|
"epoch": 1.3618392780403954,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000496657025545052,
|
|
"loss": 6.2482,
|
|
"mean_token_accuracy": 0.15075734853744507,
|
|
"num_tokens": 2872871.0,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"entropy": 6.5070977210998535,
|
|
"epoch": 1.36613665663945,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000496599683019233,
|
|
"loss": 6.3373,
|
|
"mean_token_accuracy": 0.1449791297316551,
|
|
"num_tokens": 2881140.0,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"entropy": 6.6506085872650145,
|
|
"epoch": 1.3704340352385045,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000496541856608279,
|
|
"loss": 6.3251,
|
|
"mean_token_accuracy": 0.14629032611846923,
|
|
"num_tokens": 2889945.0,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"entropy": 6.464802026748657,
|
|
"epoch": 1.374731413837559,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004964835464384595,
|
|
"loss": 6.254,
|
|
"mean_token_accuracy": 0.14956037551164628,
|
|
"num_tokens": 2898897.0,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"entropy": 6.606829452514648,
|
|
"epoch": 1.3790287924366136,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000496424752637101,
|
|
"loss": 6.2819,
|
|
"mean_token_accuracy": 0.15412394404411317,
|
|
"num_tokens": 2907717.0,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"entropy": 6.513754224777221,
|
|
"epoch": 1.3833261710356681,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004963654753325853,
|
|
"loss": 6.2693,
|
|
"mean_token_accuracy": 0.1435668349266052,
|
|
"num_tokens": 2916213.0,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"entropy": 6.6343999862670895,
|
|
"epoch": 1.387623549634723,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004963057146543505,
|
|
"loss": 6.4423,
|
|
"mean_token_accuracy": 0.13986597284674646,
|
|
"num_tokens": 2925706.0,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"entropy": 6.570179843902588,
|
|
"epoch": 1.3919209282337774,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00049624547073289,
|
|
"loss": 6.3511,
|
|
"mean_token_accuracy": 0.13794696033000947,
|
|
"num_tokens": 2934464.0,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"entropy": 6.570999479293823,
|
|
"epoch": 1.396218306832832,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004961847436997526,
|
|
"loss": 6.2482,
|
|
"mean_token_accuracy": 0.1511821575462818,
|
|
"num_tokens": 2944095.0,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"entropy": 6.450803470611572,
|
|
"epoch": 1.4005156854318865,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004961235336875416,
|
|
"loss": 6.249,
|
|
"mean_token_accuracy": 0.1513315513730049,
|
|
"num_tokens": 2953357.0,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"entropy": 6.5238546371459964,
|
|
"epoch": 1.404813064030941,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004960618408299154,
|
|
"loss": 6.4089,
|
|
"mean_token_accuracy": 0.1346985176205635,
|
|
"num_tokens": 2963020.0,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"entropy": 6.61925859451294,
|
|
"epoch": 1.4091104426299956,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004959996652615865,
|
|
"loss": 6.2427,
|
|
"mean_token_accuracy": 0.1468616619706154,
|
|
"num_tokens": 2971955.0,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"entropy": 6.584984397888183,
|
|
"epoch": 1.4134078212290504,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004959370071183216,
|
|
"loss": 6.3097,
|
|
"mean_token_accuracy": 0.14391712918877603,
|
|
"num_tokens": 2980662.0,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"entropy": 6.6156212329864506,
|
|
"epoch": 1.417705199828105,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004958738665369407,
|
|
"loss": 6.439,
|
|
"mean_token_accuracy": 0.12904247269034386,
|
|
"num_tokens": 2990038.0,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"entropy": 6.566392660140991,
|
|
"epoch": 1.4220025784271595,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004958102436553179,
|
|
"loss": 6.3627,
|
|
"mean_token_accuracy": 0.1401166081428528,
|
|
"num_tokens": 2999835.0,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"entropy": 6.622867441177368,
|
|
"epoch": 1.426299957026214,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00049574613861238,
|
|
"loss": 6.3528,
|
|
"mean_token_accuracy": 0.1401872843503952,
|
|
"num_tokens": 3009593.0,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"entropy": 6.564433908462524,
|
|
"epoch": 1.4305973356252686,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004956815515481069,
|
|
"loss": 6.3748,
|
|
"mean_token_accuracy": 0.14576212018728257,
|
|
"num_tokens": 3019187.0,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"entropy": 6.528054189682007,
|
|
"epoch": 1.4348947142243231,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004956164826035309,
|
|
"loss": 6.2839,
|
|
"mean_token_accuracy": 0.14402172416448594,
|
|
"num_tokens": 3027875.0,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"entropy": 6.481614637374878,
|
|
"epoch": 1.4391920928233777,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004955509319207363,
|
|
"loss": 6.3184,
|
|
"mean_token_accuracy": 0.14420104324817656,
|
|
"num_tokens": 3036902.0,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"entropy": 6.46042537689209,
|
|
"epoch": 1.4434894714224322,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004954848996428601,
|
|
"loss": 6.2969,
|
|
"mean_token_accuracy": 0.1498032405972481,
|
|
"num_tokens": 3046653.0,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"entropy": 6.64046082496643,
|
|
"epoch": 1.4477868500214868,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.00049541838591409,
|
|
"loss": 6.3977,
|
|
"mean_token_accuracy": 0.14052897915244103,
|
|
"num_tokens": 3056273.0,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"entropy": 6.529829502105713,
|
|
"epoch": 1.4520842286205415,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004953513908796657,
|
|
"loss": 6.2999,
|
|
"mean_token_accuracy": 0.13732842430472375,
|
|
"num_tokens": 3065662.0,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"entropy": 6.594562721252442,
|
|
"epoch": 1.456381607219596,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004952839146858773,
|
|
"loss": 6.3277,
|
|
"mean_token_accuracy": 0.14757051467895507,
|
|
"num_tokens": 3073970.0,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"entropy": 6.531829500198365,
|
|
"epoch": 1.4606789858186506,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004952159574800658,
|
|
"loss": 6.3209,
|
|
"mean_token_accuracy": 0.14381522089242935,
|
|
"num_tokens": 3082500.0,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"entropy": 6.566446447372437,
|
|
"epoch": 1.4649763644177052,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004951475194106229,
|
|
"loss": 6.2777,
|
|
"mean_token_accuracy": 0.14633866250514985,
|
|
"num_tokens": 3091574.0,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"entropy": 6.512380361557007,
|
|
"epoch": 1.4692737430167597,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004950786006269898,
|
|
"loss": 6.3852,
|
|
"mean_token_accuracy": 0.13938545510172845,
|
|
"num_tokens": 3102402.0,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"entropy": 6.59727463722229,
|
|
"epoch": 1.4735711216158143,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004950092012796576,
|
|
"loss": 6.2072,
|
|
"mean_token_accuracy": 0.15373199433088303,
|
|
"num_tokens": 3111347.0,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"entropy": 6.486224889755249,
|
|
"epoch": 1.477868500214869,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004949393215201666,
|
|
"loss": 6.2833,
|
|
"mean_token_accuracy": 0.14614666104316712,
|
|
"num_tokens": 3120018.0,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"entropy": 6.4936051845550535,
|
|
"epoch": 1.4821658788139236,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004948689615011065,
|
|
"loss": 6.3484,
|
|
"mean_token_accuracy": 0.13831731379032136,
|
|
"num_tokens": 3129669.0,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"entropy": 6.6139086246490475,
|
|
"epoch": 1.4864632574129781,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004947981213761154,
|
|
"loss": 6.2794,
|
|
"mean_token_accuracy": 0.15020036697387695,
|
|
"num_tokens": 3139112.0,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"entropy": 6.510036754608154,
|
|
"epoch": 1.4907606360120327,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004947268012998797,
|
|
"loss": 6.2427,
|
|
"mean_token_accuracy": 0.15479698032140732,
|
|
"num_tokens": 3148437.0,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"entropy": 6.490271472930909,
|
|
"epoch": 1.4950580146110872,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.000494655001428134,
|
|
"loss": 6.2146,
|
|
"mean_token_accuracy": 0.15289759933948516,
|
|
"num_tokens": 3158165.0,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"entropy": 6.521289396286011,
|
|
"epoch": 1.4993553932101418,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004945827219176604,
|
|
"loss": 6.3026,
|
|
"mean_token_accuracy": 0.1522263005375862,
|
|
"num_tokens": 3167262.0,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"entropy": 6.448360395431519,
|
|
"epoch": 1.5036527718091963,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004945099629262888,
|
|
"loss": 6.2841,
|
|
"mean_token_accuracy": 0.14779476150870324,
|
|
"num_tokens": 3176696.0,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"entropy": 6.6200721740722654,
|
|
"epoch": 1.5079501504082509,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004944367246128954,
|
|
"loss": 6.3626,
|
|
"mean_token_accuracy": 0.1411810874938965,
|
|
"num_tokens": 3185857.0,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"entropy": 6.497649145126343,
|
|
"epoch": 1.5122475290073054,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004943630071374036,
|
|
"loss": 6.2129,
|
|
"mean_token_accuracy": 0.15686369836330413,
|
|
"num_tokens": 3194687.0,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"entropy": 6.447890901565552,
|
|
"epoch": 1.51654490760636,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004942888106607828,
|
|
"loss": 6.2715,
|
|
"mean_token_accuracy": 0.14421172440052032,
|
|
"num_tokens": 3204913.0,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"entropy": 6.556134462356567,
|
|
"epoch": 1.5208422862054147,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004942141353450486,
|
|
"loss": 6.2587,
|
|
"mean_token_accuracy": 0.14712465703487396,
|
|
"num_tokens": 3213312.0,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"entropy": 6.4831544876098635,
|
|
"epoch": 1.5251396648044693,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004941389813532619,
|
|
"loss": 6.1822,
|
|
"mean_token_accuracy": 0.1586100459098816,
|
|
"num_tokens": 3222992.0,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"entropy": 6.385056638717652,
|
|
"epoch": 1.5294370434035238,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000494063348849529,
|
|
"loss": 6.2213,
|
|
"mean_token_accuracy": 0.15424711555242537,
|
|
"num_tokens": 3232836.0,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"entropy": 6.574507141113282,
|
|
"epoch": 1.5337344220025786,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004939872379990011,
|
|
"loss": 6.3769,
|
|
"mean_token_accuracy": 0.14118290394544603,
|
|
"num_tokens": 3243171.0,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"entropy": 6.56547212600708,
|
|
"epoch": 1.5380318006016331,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004939106489678739,
|
|
"loss": 6.2954,
|
|
"mean_token_accuracy": 0.15190573930740356,
|
|
"num_tokens": 3251995.0,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"entropy": 6.440187692642212,
|
|
"epoch": 1.5423291792006877,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000493833581923387,
|
|
"loss": 6.2474,
|
|
"mean_token_accuracy": 0.14897289276123046,
|
|
"num_tokens": 3260841.0,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"entropy": 6.5475788593292235,
|
|
"epoch": 1.5466265577997422,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004937560370338244,
|
|
"loss": 6.382,
|
|
"mean_token_accuracy": 0.14083073958754538,
|
|
"num_tokens": 3270979.0,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"entropy": 6.536606645584106,
|
|
"epoch": 1.5509239363987968,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000493678014468513,
|
|
"loss": 6.307,
|
|
"mean_token_accuracy": 0.1528750814497471,
|
|
"num_tokens": 3279848.0,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"entropy": 6.46652889251709,
|
|
"epoch": 1.5552213149978513,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004935995143978227,
|
|
"loss": 6.311,
|
|
"mean_token_accuracy": 0.14874453395605086,
|
|
"num_tokens": 3289172.0,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"entropy": 6.480955171585083,
|
|
"epoch": 1.5595186935969059,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004935205369931664,
|
|
"loss": 6.2107,
|
|
"mean_token_accuracy": 0.15236888080835342,
|
|
"num_tokens": 3297432.0,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"entropy": 6.62280158996582,
|
|
"epoch": 1.5638160721959604,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004934410824269992,
|
|
"loss": 6.2391,
|
|
"mean_token_accuracy": 0.1460478588938713,
|
|
"num_tokens": 3307486.0,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"entropy": 6.396580219268799,
|
|
"epoch": 1.568113450795015,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004933611508728182,
|
|
"loss": 6.2234,
|
|
"mean_token_accuracy": 0.15543457493185997,
|
|
"num_tokens": 3316296.0,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"entropy": 6.48117151260376,
|
|
"epoch": 1.5724108293940695,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000493280742505162,
|
|
"loss": 6.2496,
|
|
"mean_token_accuracy": 0.14565204530954362,
|
|
"num_tokens": 3326080.0,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"entropy": 6.399107646942139,
|
|
"epoch": 1.576708207993124,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004931998574996102,
|
|
"loss": 6.1637,
|
|
"mean_token_accuracy": 0.1557439833879471,
|
|
"num_tokens": 3334826.0,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"entropy": 6.395985460281372,
|
|
"epoch": 1.5810055865921788,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004931184960327832,
|
|
"loss": 6.166,
|
|
"mean_token_accuracy": 0.15891503393650055,
|
|
"num_tokens": 3343261.0,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"entropy": 6.439464569091797,
|
|
"epoch": 1.5853029651912334,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004930366582823421,
|
|
"loss": 6.2095,
|
|
"mean_token_accuracy": 0.14784578159451484,
|
|
"num_tokens": 3352513.0,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"entropy": 6.446910238265991,
|
|
"epoch": 1.589600343790288,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004929543444269879,
|
|
"loss": 6.2679,
|
|
"mean_token_accuracy": 0.15295199751853944,
|
|
"num_tokens": 3361577.0,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"entropy": 6.4689103126525875,
|
|
"epoch": 1.5938977223893425,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000492871554646461,
|
|
"loss": 6.327,
|
|
"mean_token_accuracy": 0.14370332658290863,
|
|
"num_tokens": 3370591.0,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"entropy": 6.443254470825195,
|
|
"epoch": 1.5981951009883972,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004927882891215413,
|
|
"loss": 6.2437,
|
|
"mean_token_accuracy": 0.14615294709801674,
|
|
"num_tokens": 3379761.0,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"entropy": 6.549100685119629,
|
|
"epoch": 1.6024924795874518,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004927045480340475,
|
|
"loss": 6.3212,
|
|
"mean_token_accuracy": 0.1414845257997513,
|
|
"num_tokens": 3388974.0,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"entropy": 6.428477334976196,
|
|
"epoch": 1.6067898581865063,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004926203315668363,
|
|
"loss": 6.2385,
|
|
"mean_token_accuracy": 0.15081687197089194,
|
|
"num_tokens": 3398339.0,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"entropy": 6.499061059951782,
|
|
"epoch": 1.6110872367855609,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004925356399038032,
|
|
"loss": 6.2121,
|
|
"mean_token_accuracy": 0.15119217038154603,
|
|
"num_tokens": 3408292.0,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"entropy": 6.460348415374756,
|
|
"epoch": 1.6153846153846154,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004924504732298808,
|
|
"loss": 6.1809,
|
|
"mean_token_accuracy": 0.15673429295420646,
|
|
"num_tokens": 3417057.0,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"entropy": 6.498525190353393,
|
|
"epoch": 1.61968199398367,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004923648317310391,
|
|
"loss": 6.2886,
|
|
"mean_token_accuracy": 0.15057691931724548,
|
|
"num_tokens": 3425830.0,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"entropy": 6.466361808776855,
|
|
"epoch": 1.6239793725827245,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004922787155942849,
|
|
"loss": 6.3261,
|
|
"mean_token_accuracy": 0.14087508171796798,
|
|
"num_tokens": 3435513.0,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"entropy": 6.480417251586914,
|
|
"epoch": 1.628276751181779,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004921921250076611,
|
|
"loss": 6.2319,
|
|
"mean_token_accuracy": 0.1488749422132969,
|
|
"num_tokens": 3444684.0,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"entropy": 6.398703765869141,
|
|
"epoch": 1.6325741297808336,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004921050601602475,
|
|
"loss": 6.309,
|
|
"mean_token_accuracy": 0.15032647401094437,
|
|
"num_tokens": 3453454.0,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"entropy": 6.512422227859497,
|
|
"epoch": 1.6368715083798882,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004920175212421587,
|
|
"loss": 6.2317,
|
|
"mean_token_accuracy": 0.1462756022810936,
|
|
"num_tokens": 3463228.0,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"entropy": 6.298534250259399,
|
|
"epoch": 1.6411688869789427,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004919295084445445,
|
|
"loss": 6.1203,
|
|
"mean_token_accuracy": 0.15622290521860122,
|
|
"num_tokens": 3472131.0,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"entropy": 6.46199779510498,
|
|
"epoch": 1.6454662655779975,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004918410219595899,
|
|
"loss": 6.1947,
|
|
"mean_token_accuracy": 0.15805622637271882,
|
|
"num_tokens": 3480642.0,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"entropy": 6.536061143875122,
|
|
"epoch": 1.649763644177052,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000491752061980514,
|
|
"loss": 6.1748,
|
|
"mean_token_accuracy": 0.15212914645671843,
|
|
"num_tokens": 3489346.0,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"entropy": 6.385542201995849,
|
|
"epoch": 1.6540610227761066,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004916626287015697,
|
|
"loss": 6.2236,
|
|
"mean_token_accuracy": 0.1506744407117367,
|
|
"num_tokens": 3498473.0,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"entropy": 6.4339292526245115,
|
|
"epoch": 1.658358401375161,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004915727223180436,
|
|
"loss": 6.2184,
|
|
"mean_token_accuracy": 0.1503354400396347,
|
|
"num_tokens": 3507415.0,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"entropy": 6.472232723236084,
|
|
"epoch": 1.6626557799742159,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004914823430262554,
|
|
"loss": 6.3466,
|
|
"mean_token_accuracy": 0.13937689363956451,
|
|
"num_tokens": 3516873.0,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"entropy": 6.475211191177368,
|
|
"epoch": 1.6669531585732704,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004913914910235573,
|
|
"loss": 6.2023,
|
|
"mean_token_accuracy": 0.15309734791517257,
|
|
"num_tokens": 3525047.0,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"entropy": 6.334531784057617,
|
|
"epoch": 1.671250537172325,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004913001665083337,
|
|
"loss": 6.2098,
|
|
"mean_token_accuracy": 0.1510941930115223,
|
|
"num_tokens": 3534354.0,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"entropy": 6.499793291091919,
|
|
"epoch": 1.6755479157713795,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004912083696800008,
|
|
"loss": 6.2384,
|
|
"mean_token_accuracy": 0.14515842348337174,
|
|
"num_tokens": 3543830.0,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"entropy": 6.334777593612671,
|
|
"epoch": 1.679845294370434,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004911161007390063,
|
|
"loss": 6.1344,
|
|
"mean_token_accuracy": 0.1552545964717865,
|
|
"num_tokens": 3552314.0,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"entropy": 6.398986530303955,
|
|
"epoch": 1.6841426729694886,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004910233598868287,
|
|
"loss": 6.2232,
|
|
"mean_token_accuracy": 0.14675267040729523,
|
|
"num_tokens": 3561656.0,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"entropy": 6.426092958450317,
|
|
"epoch": 1.6884400515685432,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004909301473259769,
|
|
"loss": 6.2232,
|
|
"mean_token_accuracy": 0.14848204478621482,
|
|
"num_tokens": 3571784.0,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"entropy": 6.454012489318847,
|
|
"epoch": 1.6927374301675977,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004908364632599899,
|
|
"loss": 6.1775,
|
|
"mean_token_accuracy": 0.15773458033800125,
|
|
"num_tokens": 3580626.0,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"entropy": 6.337477779388427,
|
|
"epoch": 1.6970348087666522,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004907423078934362,
|
|
"loss": 6.2001,
|
|
"mean_token_accuracy": 0.14792972654104233,
|
|
"num_tokens": 3589916.0,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"entropy": 6.395978736877441,
|
|
"epoch": 1.7013321873657068,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004906476814319134,
|
|
"loss": 6.2045,
|
|
"mean_token_accuracy": 0.15436216294765473,
|
|
"num_tokens": 3599128.0,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"entropy": 6.384798145294189,
|
|
"epoch": 1.7056295659647613,
|
|
"grad_norm": 0.890625,
|
|
"learning_rate": 0.0004905525840820481,
|
|
"loss": 6.2156,
|
|
"mean_token_accuracy": 0.1487440824508667,
|
|
"num_tokens": 3608764.0,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"entropy": 6.519760847091675,
|
|
"epoch": 1.709926944563816,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004904570160514948,
|
|
"loss": 6.2587,
|
|
"mean_token_accuracy": 0.14064486026763917,
|
|
"num_tokens": 3619082.0,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"entropy": 6.396596527099609,
|
|
"epoch": 1.7142243231628707,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004903609775489358,
|
|
"loss": 6.2232,
|
|
"mean_token_accuracy": 0.14829822033643722,
|
|
"num_tokens": 3628695.0,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"entropy": 6.453386020660401,
|
|
"epoch": 1.7185217017619252,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004902644687840809,
|
|
"loss": 6.2106,
|
|
"mean_token_accuracy": 0.14628567397594452,
|
|
"num_tokens": 3637599.0,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 1.7185217017619252,
|
|
"eval_entropy": 6.120181280213433,
|
|
"eval_loss": 6.287801742553711,
|
|
"eval_mean_token_accuracy": 0.15146609128931085,
|
|
"eval_num_tokens": 3637599.0,
|
|
"eval_runtime": 2.0623,
|
|
"eval_samples_per_second": 1720.853,
|
|
"eval_steps_per_second": 215.289,
|
|
"step": 2000
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 11630,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 10,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 820105322803200.0,
|
|
"train_batch_size": 16,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|