Model: fpadovani/eus-latn-100mb-after-ppt-shuff-dyck-100mb-ckpt500_seed3407 Source: Original Platform
6046 lines
164 KiB
JSON
6046 lines
164 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.26492405510420347,
|
|
"eval_steps": 3000,
|
|
"global_step": 3000,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 4.790674829483033,
|
|
"epoch": 0.0004415400918403391,
|
|
"grad_norm": 13.0625,
|
|
"learning_rate": 2e-06,
|
|
"loss": 14.4349,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 9390.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 4.818728256225586,
|
|
"epoch": 0.0008830801836806782,
|
|
"grad_norm": 14.1875,
|
|
"learning_rate": 4.5e-06,
|
|
"loss": 14.4117,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 18671.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 4.857943296432495,
|
|
"epoch": 0.0013246202755210173,
|
|
"grad_norm": 16.25,
|
|
"learning_rate": 7e-06,
|
|
"loss": 14.1693,
|
|
"mean_token_accuracy": 0.00014005602570250631,
|
|
"num_tokens": 27614.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 5.027469444274902,
|
|
"epoch": 0.0017661603673613563,
|
|
"grad_norm": 25.875,
|
|
"learning_rate": 9.5e-06,
|
|
"loss": 13.7713,
|
|
"mean_token_accuracy": 8.547008619643747e-05,
|
|
"num_tokens": 37850.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 6.5380439281463625,
|
|
"epoch": 0.0022077004592016957,
|
|
"grad_norm": 26.375,
|
|
"learning_rate": 1.2e-05,
|
|
"loss": 12.2509,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 47166.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 9.970003509521485,
|
|
"epoch": 0.0026492405510420347,
|
|
"grad_norm": 3.4375,
|
|
"learning_rate": 1.4500000000000002e-05,
|
|
"loss": 10.9371,
|
|
"mean_token_accuracy": 0.00022374301915988325,
|
|
"num_tokens": 55500.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 10.680026626586914,
|
|
"epoch": 0.0030907806428823736,
|
|
"grad_norm": 3.15625,
|
|
"learning_rate": 1.7000000000000003e-05,
|
|
"loss": 10.6238,
|
|
"mean_token_accuracy": 0.009453117521479726,
|
|
"num_tokens": 63851.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 10.703511428833007,
|
|
"epoch": 0.0035323207347227126,
|
|
"grad_norm": 3.21875,
|
|
"learning_rate": 1.95e-05,
|
|
"loss": 10.3602,
|
|
"mean_token_accuracy": 0.029037438705563544,
|
|
"num_tokens": 73697.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 10.651962184906006,
|
|
"epoch": 0.003973860826563052,
|
|
"grad_norm": 2.625,
|
|
"learning_rate": 2.2e-05,
|
|
"loss": 10.0115,
|
|
"mean_token_accuracy": 0.05894971713423729,
|
|
"num_tokens": 83000.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 10.439279747009277,
|
|
"epoch": 0.004415400918403391,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 2.4500000000000003e-05,
|
|
"loss": 9.8132,
|
|
"mean_token_accuracy": 0.05815875120460987,
|
|
"num_tokens": 92982.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 10.318083763122559,
|
|
"epoch": 0.00485694101024373,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 2.7e-05,
|
|
"loss": 9.6231,
|
|
"mean_token_accuracy": 0.05530005097389221,
|
|
"num_tokens": 101455.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 10.381121063232422,
|
|
"epoch": 0.005298481102084069,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 2.95e-05,
|
|
"loss": 9.5438,
|
|
"mean_token_accuracy": 0.05805549845099449,
|
|
"num_tokens": 110782.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 10.360444736480712,
|
|
"epoch": 0.005740021193924408,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 3.2e-05,
|
|
"loss": 9.4168,
|
|
"mean_token_accuracy": 0.060499183088541034,
|
|
"num_tokens": 119241.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 10.300647163391114,
|
|
"epoch": 0.006181561285764747,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 3.4500000000000005e-05,
|
|
"loss": 9.4178,
|
|
"mean_token_accuracy": 0.055320289358496665,
|
|
"num_tokens": 127903.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 10.332123184204102,
|
|
"epoch": 0.006623101377605086,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 3.7e-05,
|
|
"loss": 9.3721,
|
|
"mean_token_accuracy": 0.05736841931939125,
|
|
"num_tokens": 137370.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 10.290982055664063,
|
|
"epoch": 0.007064641469445425,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 3.95e-05,
|
|
"loss": 9.2214,
|
|
"mean_token_accuracy": 0.06618293710052967,
|
|
"num_tokens": 146582.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 10.196907424926758,
|
|
"epoch": 0.007506181561285765,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 4.2000000000000004e-05,
|
|
"loss": 9.1585,
|
|
"mean_token_accuracy": 0.05961471572518349,
|
|
"num_tokens": 154933.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 10.205323791503906,
|
|
"epoch": 0.007947721653126103,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 4.45e-05,
|
|
"loss": 9.1026,
|
|
"mean_token_accuracy": 0.072137650847435,
|
|
"num_tokens": 165157.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 10.10411615371704,
|
|
"epoch": 0.008389261744966443,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 4.7000000000000004e-05,
|
|
"loss": 8.9848,
|
|
"mean_token_accuracy": 0.0728946004062891,
|
|
"num_tokens": 174958.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 10.01873140335083,
|
|
"epoch": 0.008830801836806783,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 4.9500000000000004e-05,
|
|
"loss": 8.8889,
|
|
"mean_token_accuracy": 0.07516518756747245,
|
|
"num_tokens": 184256.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 9.956882572174072,
|
|
"epoch": 0.009272341928647121,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 5.2e-05,
|
|
"loss": 8.7839,
|
|
"mean_token_accuracy": 0.067950439453125,
|
|
"num_tokens": 192894.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 9.884513092041015,
|
|
"epoch": 0.00971388202048746,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 5.45e-05,
|
|
"loss": 8.6868,
|
|
"mean_token_accuracy": 0.07383731976151467,
|
|
"num_tokens": 202675.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 9.810705184936523,
|
|
"epoch": 0.010155422112327799,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 5.7e-05,
|
|
"loss": 8.624,
|
|
"mean_token_accuracy": 0.07006355635821819,
|
|
"num_tokens": 212261.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 9.743825721740723,
|
|
"epoch": 0.010596962204168139,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 5.9499999999999996e-05,
|
|
"loss": 8.599,
|
|
"mean_token_accuracy": 0.06874018795788288,
|
|
"num_tokens": 222329.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 9.528209781646728,
|
|
"epoch": 0.011038502296008477,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 6.2e-05,
|
|
"loss": 8.4477,
|
|
"mean_token_accuracy": 0.06682575456798076,
|
|
"num_tokens": 231247.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 9.442446994781495,
|
|
"epoch": 0.011480042387848817,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 6.450000000000001e-05,
|
|
"loss": 8.3412,
|
|
"mean_token_accuracy": 0.06921537183225154,
|
|
"num_tokens": 239978.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 9.272939491271973,
|
|
"epoch": 0.011921582479689156,
|
|
"grad_norm": 0.90625,
|
|
"learning_rate": 6.7e-05,
|
|
"loss": 8.3,
|
|
"mean_token_accuracy": 0.06850462295114994,
|
|
"num_tokens": 249735.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 9.222266483306885,
|
|
"epoch": 0.012363122571529495,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 6.950000000000001e-05,
|
|
"loss": 8.2344,
|
|
"mean_token_accuracy": 0.06959122642874718,
|
|
"num_tokens": 259369.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 8.956540203094482,
|
|
"epoch": 0.012804662663369834,
|
|
"grad_norm": 0.81640625,
|
|
"learning_rate": 7.2e-05,
|
|
"loss": 8.2305,
|
|
"mean_token_accuracy": 0.06539506763219834,
|
|
"num_tokens": 268645.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 8.88605546951294,
|
|
"epoch": 0.013246202755210172,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 7.45e-05,
|
|
"loss": 8.0685,
|
|
"mean_token_accuracy": 0.07155903875827789,
|
|
"num_tokens": 276667.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 8.623716259002686,
|
|
"epoch": 0.013687742847050512,
|
|
"grad_norm": 0.703125,
|
|
"learning_rate": 7.7e-05,
|
|
"loss": 8.0682,
|
|
"mean_token_accuracy": 0.07539896108210087,
|
|
"num_tokens": 286017.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 8.590844440460206,
|
|
"epoch": 0.01412928293889085,
|
|
"grad_norm": 0.83984375,
|
|
"learning_rate": 7.950000000000001e-05,
|
|
"loss": 8.0571,
|
|
"mean_token_accuracy": 0.07278457470238209,
|
|
"num_tokens": 295631.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 8.562520027160645,
|
|
"epoch": 0.01457082303073119,
|
|
"grad_norm": 0.86328125,
|
|
"learning_rate": 8.2e-05,
|
|
"loss": 8.0486,
|
|
"mean_token_accuracy": 0.06986252851784229,
|
|
"num_tokens": 304704.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 8.490843200683594,
|
|
"epoch": 0.01501236312257153,
|
|
"grad_norm": 0.7734375,
|
|
"learning_rate": 8.450000000000001e-05,
|
|
"loss": 8.0665,
|
|
"mean_token_accuracy": 0.07160350978374481,
|
|
"num_tokens": 314195.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 8.416227722167969,
|
|
"epoch": 0.015453903214411868,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 8.7e-05,
|
|
"loss": 8.0632,
|
|
"mean_token_accuracy": 0.07028085552155972,
|
|
"num_tokens": 323379.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 8.398184299468994,
|
|
"epoch": 0.015895443306252206,
|
|
"grad_norm": 0.734375,
|
|
"learning_rate": 8.95e-05,
|
|
"loss": 7.9637,
|
|
"mean_token_accuracy": 0.08185541778802871,
|
|
"num_tokens": 332322.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 8.336036014556885,
|
|
"epoch": 0.016336983398092548,
|
|
"grad_norm": 0.79296875,
|
|
"learning_rate": 9.2e-05,
|
|
"loss": 7.9427,
|
|
"mean_token_accuracy": 0.08073886930942535,
|
|
"num_tokens": 341735.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 8.337114715576172,
|
|
"epoch": 0.016778523489932886,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 9.45e-05,
|
|
"loss": 8.0349,
|
|
"mean_token_accuracy": 0.06938613168895244,
|
|
"num_tokens": 351209.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 8.39198350906372,
|
|
"epoch": 0.017220063581773224,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 9.7e-05,
|
|
"loss": 7.9792,
|
|
"mean_token_accuracy": 0.07559169828891754,
|
|
"num_tokens": 360467.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 8.235328102111817,
|
|
"epoch": 0.017661603673613566,
|
|
"grad_norm": 0.875,
|
|
"learning_rate": 9.95e-05,
|
|
"loss": 7.9423,
|
|
"mean_token_accuracy": 0.07669526152312756,
|
|
"num_tokens": 370361.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 8.374059581756592,
|
|
"epoch": 0.018103143765453904,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000102,
|
|
"loss": 8.0075,
|
|
"mean_token_accuracy": 0.07567069008946418,
|
|
"num_tokens": 380366.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 8.206629276275635,
|
|
"epoch": 0.018544683857294242,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00010449999999999999,
|
|
"loss": 7.9185,
|
|
"mean_token_accuracy": 0.08159190192818641,
|
|
"num_tokens": 390690.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 8.24603796005249,
|
|
"epoch": 0.01898622394913458,
|
|
"grad_norm": 0.87109375,
|
|
"learning_rate": 0.000107,
|
|
"loss": 7.9601,
|
|
"mean_token_accuracy": 0.0793293446302414,
|
|
"num_tokens": 400722.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 8.157498931884765,
|
|
"epoch": 0.01942776404097492,
|
|
"grad_norm": 0.84375,
|
|
"learning_rate": 0.0001095,
|
|
"loss": 7.8501,
|
|
"mean_token_accuracy": 0.08334142193198205,
|
|
"num_tokens": 410223.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 8.205572509765625,
|
|
"epoch": 0.01986930413281526,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000112,
|
|
"loss": 7.9021,
|
|
"mean_token_accuracy": 0.07716193869709968,
|
|
"num_tokens": 420214.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 8.172825717926026,
|
|
"epoch": 0.020310844224655598,
|
|
"grad_norm": 0.88671875,
|
|
"learning_rate": 0.0001145,
|
|
"loss": 7.8564,
|
|
"mean_token_accuracy": 0.08035471551120281,
|
|
"num_tokens": 429407.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 8.162760925292968,
|
|
"epoch": 0.02075238431649594,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00011700000000000001,
|
|
"loss": 7.8246,
|
|
"mean_token_accuracy": 0.07542734369635581,
|
|
"num_tokens": 438403.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 8.177341651916503,
|
|
"epoch": 0.021193924408336277,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00011949999999999999,
|
|
"loss": 7.8545,
|
|
"mean_token_accuracy": 0.08535856604576111,
|
|
"num_tokens": 447466.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 8.069422197341918,
|
|
"epoch": 0.021635464500176615,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.000122,
|
|
"loss": 7.9366,
|
|
"mean_token_accuracy": 0.07459555268287658,
|
|
"num_tokens": 457141.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 8.160084056854249,
|
|
"epoch": 0.022077004592016954,
|
|
"grad_norm": 0.86328125,
|
|
"learning_rate": 0.0001245,
|
|
"loss": 7.812,
|
|
"mean_token_accuracy": 0.08223466873168946,
|
|
"num_tokens": 465708.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 8.142998504638673,
|
|
"epoch": 0.022518544683857295,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000127,
|
|
"loss": 7.8339,
|
|
"mean_token_accuracy": 0.07565066292881965,
|
|
"num_tokens": 475369.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 8.075135421752929,
|
|
"epoch": 0.022960084775697633,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0001295,
|
|
"loss": 7.7972,
|
|
"mean_token_accuracy": 0.08645984381437302,
|
|
"num_tokens": 484249.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 8.122587871551513,
|
|
"epoch": 0.02340162486753797,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000132,
|
|
"loss": 7.8872,
|
|
"mean_token_accuracy": 0.07687325775623322,
|
|
"num_tokens": 493303.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 8.101485538482667,
|
|
"epoch": 0.023843164959378313,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00013450000000000002,
|
|
"loss": 7.8664,
|
|
"mean_token_accuracy": 0.0807331919670105,
|
|
"num_tokens": 501503.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 8.036290693283082,
|
|
"epoch": 0.02428470505121865,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00013700000000000002,
|
|
"loss": 7.8074,
|
|
"mean_token_accuracy": 0.08591768592596054,
|
|
"num_tokens": 509661.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 8.045488977432251,
|
|
"epoch": 0.02472624514305899,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0001395,
|
|
"loss": 7.7904,
|
|
"mean_token_accuracy": 0.08441019728779793,
|
|
"num_tokens": 519464.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 8.107398653030396,
|
|
"epoch": 0.025167785234899327,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00014199999999999998,
|
|
"loss": 7.7489,
|
|
"mean_token_accuracy": 0.08773190379142762,
|
|
"num_tokens": 527968.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 8.081705808639526,
|
|
"epoch": 0.02560932532673967,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0001445,
|
|
"loss": 7.7768,
|
|
"mean_token_accuracy": 0.0868467777967453,
|
|
"num_tokens": 537234.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 7.99565052986145,
|
|
"epoch": 0.026050865418580007,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000147,
|
|
"loss": 7.7747,
|
|
"mean_token_accuracy": 0.08527034223079681,
|
|
"num_tokens": 546398.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 8.011523675918578,
|
|
"epoch": 0.026492405510420345,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0001495,
|
|
"loss": 7.7616,
|
|
"mean_token_accuracy": 0.08982880860567093,
|
|
"num_tokens": 555362.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 8.107937812805176,
|
|
"epoch": 0.026933945602260687,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000152,
|
|
"loss": 7.8221,
|
|
"mean_token_accuracy": 0.07775180079042912,
|
|
"num_tokens": 564575.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 8.133016395568848,
|
|
"epoch": 0.027375485694101025,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00015450000000000001,
|
|
"loss": 7.8384,
|
|
"mean_token_accuracy": 0.08304800540208816,
|
|
"num_tokens": 573915.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 8.016209363937378,
|
|
"epoch": 0.027817025785941363,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000157,
|
|
"loss": 7.7322,
|
|
"mean_token_accuracy": 0.08581754639744758,
|
|
"num_tokens": 583216.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 7.982406425476074,
|
|
"epoch": 0.0282585658777817,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0001595,
|
|
"loss": 7.7553,
|
|
"mean_token_accuracy": 0.08679840788245201,
|
|
"num_tokens": 591955.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 7.9430736064910885,
|
|
"epoch": 0.028700105969622042,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000162,
|
|
"loss": 7.7588,
|
|
"mean_token_accuracy": 0.08934888392686843,
|
|
"num_tokens": 600999.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 8.070584297180176,
|
|
"epoch": 0.02914164606146238,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00016450000000000001,
|
|
"loss": 7.6563,
|
|
"mean_token_accuracy": 0.09217674285173416,
|
|
"num_tokens": 609478.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 7.987708568572998,
|
|
"epoch": 0.02958318615330272,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00016700000000000002,
|
|
"loss": 7.7225,
|
|
"mean_token_accuracy": 0.08663035854697228,
|
|
"num_tokens": 618348.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 7.911137056350708,
|
|
"epoch": 0.03002472624514306,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00016950000000000003,
|
|
"loss": 7.7691,
|
|
"mean_token_accuracy": 0.083287762850523,
|
|
"num_tokens": 628548.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 8.057271575927734,
|
|
"epoch": 0.0304662663369834,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00017199999999999998,
|
|
"loss": 7.7102,
|
|
"mean_token_accuracy": 0.08196588605642319,
|
|
"num_tokens": 637489.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 7.939978122711182,
|
|
"epoch": 0.030907806428823736,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00017449999999999999,
|
|
"loss": 7.689,
|
|
"mean_token_accuracy": 0.0814521424472332,
|
|
"num_tokens": 646715.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 7.897878551483155,
|
|
"epoch": 0.031349346520664075,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.000177,
|
|
"loss": 7.6894,
|
|
"mean_token_accuracy": 0.08998864293098449,
|
|
"num_tokens": 656858.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 8.019395637512208,
|
|
"epoch": 0.03179088661250441,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0001795,
|
|
"loss": 7.6755,
|
|
"mean_token_accuracy": 0.08710955381393433,
|
|
"num_tokens": 665968.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 8.001319217681885,
|
|
"epoch": 0.03223242670434476,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.000182,
|
|
"loss": 7.6655,
|
|
"mean_token_accuracy": 0.08621685430407525,
|
|
"num_tokens": 674295.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 7.810992002487183,
|
|
"epoch": 0.032673966796185096,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0001845,
|
|
"loss": 7.591,
|
|
"mean_token_accuracy": 0.08370614722371102,
|
|
"num_tokens": 683559.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 7.816927337646485,
|
|
"epoch": 0.033115506888025434,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000187,
|
|
"loss": 7.6253,
|
|
"mean_token_accuracy": 0.08996079638600349,
|
|
"num_tokens": 692402.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 7.967683601379394,
|
|
"epoch": 0.03355704697986577,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0001895,
|
|
"loss": 7.7304,
|
|
"mean_token_accuracy": 0.08065761215984821,
|
|
"num_tokens": 702052.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 8.058749055862426,
|
|
"epoch": 0.03399858707170611,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.000192,
|
|
"loss": 7.6528,
|
|
"mean_token_accuracy": 0.08705045655369759,
|
|
"num_tokens": 711926.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 7.8771873950958256,
|
|
"epoch": 0.03444012716354645,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0001945,
|
|
"loss": 7.612,
|
|
"mean_token_accuracy": 0.08773906156420708,
|
|
"num_tokens": 720948.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 7.893786334991455,
|
|
"epoch": 0.034881667255386786,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00019700000000000002,
|
|
"loss": 7.6301,
|
|
"mean_token_accuracy": 0.09444142654538154,
|
|
"num_tokens": 729611.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 7.892533588409424,
|
|
"epoch": 0.03532320734722713,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00019950000000000002,
|
|
"loss": 7.6187,
|
|
"mean_token_accuracy": 0.08193654045462609,
|
|
"num_tokens": 738433.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 7.945340347290039,
|
|
"epoch": 0.03576474743906747,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000202,
|
|
"loss": 7.6397,
|
|
"mean_token_accuracy": 0.08668759167194366,
|
|
"num_tokens": 747310.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 7.854477500915527,
|
|
"epoch": 0.03620628753090781,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00020449999999999998,
|
|
"loss": 7.5994,
|
|
"mean_token_accuracy": 0.09020926207304,
|
|
"num_tokens": 756362.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 7.90778489112854,
|
|
"epoch": 0.036647827622748146,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.000207,
|
|
"loss": 7.6034,
|
|
"mean_token_accuracy": 0.08586042672395706,
|
|
"num_tokens": 764978.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 7.87300386428833,
|
|
"epoch": 0.037089367714588484,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0002095,
|
|
"loss": 7.5707,
|
|
"mean_token_accuracy": 0.09018185958266259,
|
|
"num_tokens": 774058.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 7.795767593383789,
|
|
"epoch": 0.03753090780642882,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000212,
|
|
"loss": 7.5492,
|
|
"mean_token_accuracy": 0.0897379383444786,
|
|
"num_tokens": 783332.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 7.853004789352417,
|
|
"epoch": 0.03797244789826916,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0002145,
|
|
"loss": 7.6298,
|
|
"mean_token_accuracy": 0.08684360906481743,
|
|
"num_tokens": 792481.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 7.766995525360107,
|
|
"epoch": 0.038413987990109505,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00021700000000000002,
|
|
"loss": 7.5212,
|
|
"mean_token_accuracy": 0.09301207512617111,
|
|
"num_tokens": 801396.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 7.8428326606750485,
|
|
"epoch": 0.03885552808194984,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0002195,
|
|
"loss": 7.5865,
|
|
"mean_token_accuracy": 0.08940735682845116,
|
|
"num_tokens": 809903.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 7.828377294540405,
|
|
"epoch": 0.03929706817379018,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000222,
|
|
"loss": 7.5389,
|
|
"mean_token_accuracy": 0.0962544821202755,
|
|
"num_tokens": 819144.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 7.7183678150177,
|
|
"epoch": 0.03973860826563052,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0002245,
|
|
"loss": 7.5608,
|
|
"mean_token_accuracy": 0.08849129751324654,
|
|
"num_tokens": 828881.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 7.764478397369385,
|
|
"epoch": 0.04018014835747086,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00022700000000000002,
|
|
"loss": 7.4877,
|
|
"mean_token_accuracy": 0.08765893578529357,
|
|
"num_tokens": 837588.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 7.767373847961426,
|
|
"epoch": 0.040621688449311195,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00022950000000000002,
|
|
"loss": 7.4849,
|
|
"mean_token_accuracy": 0.09265839084982871,
|
|
"num_tokens": 847002.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 7.726333475112915,
|
|
"epoch": 0.041063228541151534,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00023200000000000003,
|
|
"loss": 7.4945,
|
|
"mean_token_accuracy": 0.09333177357912063,
|
|
"num_tokens": 855791.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 7.7462080955505375,
|
|
"epoch": 0.04150476863299188,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00023449999999999998,
|
|
"loss": 7.5242,
|
|
"mean_token_accuracy": 0.0911882683634758,
|
|
"num_tokens": 865392.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 7.736569499969482,
|
|
"epoch": 0.04194630872483222,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000237,
|
|
"loss": 7.5873,
|
|
"mean_token_accuracy": 0.08873779252171517,
|
|
"num_tokens": 874807.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 7.758917284011841,
|
|
"epoch": 0.042387848816672555,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0002395,
|
|
"loss": 7.5409,
|
|
"mean_token_accuracy": 0.09495326653122901,
|
|
"num_tokens": 883928.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 7.777913904190063,
|
|
"epoch": 0.04282938890851289,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000242,
|
|
"loss": 7.4436,
|
|
"mean_token_accuracy": 0.09124857932329178,
|
|
"num_tokens": 893047.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 7.662859010696411,
|
|
"epoch": 0.04327092900035323,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0002445,
|
|
"loss": 7.4593,
|
|
"mean_token_accuracy": 0.09315531030297279,
|
|
"num_tokens": 901645.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 7.743328475952149,
|
|
"epoch": 0.04371246909219357,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000247,
|
|
"loss": 7.4727,
|
|
"mean_token_accuracy": 0.09244368895888329,
|
|
"num_tokens": 911169.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 7.7239625453948975,
|
|
"epoch": 0.04415400918403391,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0002495,
|
|
"loss": 7.4748,
|
|
"mean_token_accuracy": 0.08498905003070831,
|
|
"num_tokens": 921382.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 7.544922304153443,
|
|
"epoch": 0.04459554927587425,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000252,
|
|
"loss": 7.4121,
|
|
"mean_token_accuracy": 0.09429771155118942,
|
|
"num_tokens": 930409.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 7.67856912612915,
|
|
"epoch": 0.04503708936771459,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0002545,
|
|
"loss": 7.3879,
|
|
"mean_token_accuracy": 0.09879431128501892,
|
|
"num_tokens": 939049.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 7.718625736236572,
|
|
"epoch": 0.04547862945955493,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000257,
|
|
"loss": 7.395,
|
|
"mean_token_accuracy": 0.0960740551352501,
|
|
"num_tokens": 947575.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"entropy": 7.709804058074951,
|
|
"epoch": 0.045920169551395267,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0002595,
|
|
"loss": 7.5643,
|
|
"mean_token_accuracy": 0.09047991409897804,
|
|
"num_tokens": 957848.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 7.655015087127685,
|
|
"epoch": 0.046361709643235605,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000262,
|
|
"loss": 7.3857,
|
|
"mean_token_accuracy": 0.09998803585767746,
|
|
"num_tokens": 966521.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 7.688518905639649,
|
|
"epoch": 0.04680324973507594,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00026450000000000003,
|
|
"loss": 7.4461,
|
|
"mean_token_accuracy": 0.09324755370616913,
|
|
"num_tokens": 975827.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 7.606715154647827,
|
|
"epoch": 0.04724478982691628,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00026700000000000004,
|
|
"loss": 7.4075,
|
|
"mean_token_accuracy": 0.09566703587770461,
|
|
"num_tokens": 985292.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"entropy": 7.616068124771118,
|
|
"epoch": 0.047686329918756626,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00026950000000000005,
|
|
"loss": 7.3841,
|
|
"mean_token_accuracy": 0.09411159604787826,
|
|
"num_tokens": 994791.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 7.51567816734314,
|
|
"epoch": 0.048127870010596964,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00027200000000000005,
|
|
"loss": 7.3594,
|
|
"mean_token_accuracy": 0.1026044063270092,
|
|
"num_tokens": 1003700.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"entropy": 7.510391616821289,
|
|
"epoch": 0.0485694101024373,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0002745,
|
|
"loss": 7.381,
|
|
"mean_token_accuracy": 0.09829011410474778,
|
|
"num_tokens": 1012682.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 7.683912038803101,
|
|
"epoch": 0.04901095019427764,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000277,
|
|
"loss": 7.4075,
|
|
"mean_token_accuracy": 0.09179475829005242,
|
|
"num_tokens": 1021018.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"entropy": 7.570155191421509,
|
|
"epoch": 0.04945249028611798,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0002795,
|
|
"loss": 7.2759,
|
|
"mean_token_accuracy": 0.09699172824621201,
|
|
"num_tokens": 1029744.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 7.495694351196289,
|
|
"epoch": 0.049894030377958316,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00028199999999999997,
|
|
"loss": 7.3605,
|
|
"mean_token_accuracy": 0.09879247918725013,
|
|
"num_tokens": 1038805.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"entropy": 7.5144976615905765,
|
|
"epoch": 0.050335570469798654,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0002845,
|
|
"loss": 7.4131,
|
|
"mean_token_accuracy": 0.0988279327750206,
|
|
"num_tokens": 1047656.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 7.647522783279419,
|
|
"epoch": 0.050777110561639,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000287,
|
|
"loss": 7.4048,
|
|
"mean_token_accuracy": 0.09629088416695594,
|
|
"num_tokens": 1056598.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"entropy": 7.6095935821533205,
|
|
"epoch": 0.05121865065347934,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0002895,
|
|
"loss": 7.3994,
|
|
"mean_token_accuracy": 0.09847217947244644,
|
|
"num_tokens": 1065226.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 7.529495334625244,
|
|
"epoch": 0.051660190745319676,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000292,
|
|
"loss": 7.3208,
|
|
"mean_token_accuracy": 0.10098938867449761,
|
|
"num_tokens": 1074661.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"entropy": 7.503559398651123,
|
|
"epoch": 0.052101730837160014,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0002945,
|
|
"loss": 7.4251,
|
|
"mean_token_accuracy": 0.09441772177815437,
|
|
"num_tokens": 1083921.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 7.540312194824219,
|
|
"epoch": 0.05254327092900035,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000297,
|
|
"loss": 7.3162,
|
|
"mean_token_accuracy": 0.104298634827137,
|
|
"num_tokens": 1093399.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"entropy": 7.5133528232574465,
|
|
"epoch": 0.05298481102084069,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0002995,
|
|
"loss": 7.391,
|
|
"mean_token_accuracy": 0.09825902208685874,
|
|
"num_tokens": 1104065.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 7.433008003234863,
|
|
"epoch": 0.05342635111268103,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000302,
|
|
"loss": 7.2778,
|
|
"mean_token_accuracy": 0.1005440428853035,
|
|
"num_tokens": 1112995.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"entropy": 7.47243971824646,
|
|
"epoch": 0.05386789120452137,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003045,
|
|
"loss": 7.3103,
|
|
"mean_token_accuracy": 0.10175202563405036,
|
|
"num_tokens": 1121637.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 7.455365610122681,
|
|
"epoch": 0.05430943129636171,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000307,
|
|
"loss": 7.2549,
|
|
"mean_token_accuracy": 0.09826337993144989,
|
|
"num_tokens": 1131166.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"entropy": 7.4712036609649655,
|
|
"epoch": 0.05475097138820205,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003095,
|
|
"loss": 7.2562,
|
|
"mean_token_accuracy": 0.10475531965494156,
|
|
"num_tokens": 1140888.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 7.551609897613526,
|
|
"epoch": 0.05519251148004239,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000312,
|
|
"loss": 7.4148,
|
|
"mean_token_accuracy": 0.0961816966533661,
|
|
"num_tokens": 1150278.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"entropy": 7.433546924591065,
|
|
"epoch": 0.055634051571882726,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0003145,
|
|
"loss": 7.3742,
|
|
"mean_token_accuracy": 0.0970606379210949,
|
|
"num_tokens": 1159348.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 7.624134588241577,
|
|
"epoch": 0.056075591663723064,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000317,
|
|
"loss": 7.3756,
|
|
"mean_token_accuracy": 0.0949991799890995,
|
|
"num_tokens": 1168883.0,
|
|
"step": 635
|
|
},
|
|
{
|
|
"entropy": 7.48681526184082,
|
|
"epoch": 0.0565171317555634,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003195,
|
|
"loss": 7.269,
|
|
"mean_token_accuracy": 0.1064944364130497,
|
|
"num_tokens": 1178572.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 7.44178466796875,
|
|
"epoch": 0.05695867184740375,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000322,
|
|
"loss": 7.3576,
|
|
"mean_token_accuracy": 0.0987204596400261,
|
|
"num_tokens": 1188909.0,
|
|
"step": 645
|
|
},
|
|
{
|
|
"entropy": 7.466546869277954,
|
|
"epoch": 0.057400211939244085,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00032450000000000003,
|
|
"loss": 7.2649,
|
|
"mean_token_accuracy": 0.09890259429812431,
|
|
"num_tokens": 1197705.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 7.450878572463989,
|
|
"epoch": 0.05784175203108442,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00032700000000000003,
|
|
"loss": 7.1061,
|
|
"mean_token_accuracy": 0.10522415414452553,
|
|
"num_tokens": 1206351.0,
|
|
"step": 655
|
|
},
|
|
{
|
|
"entropy": 7.340301847457885,
|
|
"epoch": 0.05828329212292476,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00032950000000000004,
|
|
"loss": 7.2237,
|
|
"mean_token_accuracy": 0.09693196043372154,
|
|
"num_tokens": 1214984.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 7.4402018070220945,
|
|
"epoch": 0.0587248322147651,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00033200000000000005,
|
|
"loss": 7.2999,
|
|
"mean_token_accuracy": 0.09738482013344765,
|
|
"num_tokens": 1224485.0,
|
|
"step": 665
|
|
},
|
|
{
|
|
"entropy": 7.435847473144531,
|
|
"epoch": 0.05916637230660544,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00033450000000000005,
|
|
"loss": 7.3266,
|
|
"mean_token_accuracy": 0.09173622950911522,
|
|
"num_tokens": 1233560.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 7.428315305709839,
|
|
"epoch": 0.05960791239844578,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000337,
|
|
"loss": 7.2436,
|
|
"mean_token_accuracy": 0.09967414885759354,
|
|
"num_tokens": 1242628.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"entropy": 7.388672494888306,
|
|
"epoch": 0.06004945249028612,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003395,
|
|
"loss": 7.1697,
|
|
"mean_token_accuracy": 0.10538085550069809,
|
|
"num_tokens": 1251004.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 7.459445238113403,
|
|
"epoch": 0.06049099258212646,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000342,
|
|
"loss": 7.3463,
|
|
"mean_token_accuracy": 0.09609238728880883,
|
|
"num_tokens": 1260344.0,
|
|
"step": 685
|
|
},
|
|
{
|
|
"entropy": 7.343485164642334,
|
|
"epoch": 0.0609325326739668,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00034449999999999997,
|
|
"loss": 7.2517,
|
|
"mean_token_accuracy": 0.09760257676243782,
|
|
"num_tokens": 1269988.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 7.340139007568359,
|
|
"epoch": 0.061374072765807135,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000347,
|
|
"loss": 7.2126,
|
|
"mean_token_accuracy": 0.10715288370847702,
|
|
"num_tokens": 1280912.0,
|
|
"step": 695
|
|
},
|
|
{
|
|
"entropy": 7.350299119949341,
|
|
"epoch": 0.06181561285764747,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003495,
|
|
"loss": 7.2246,
|
|
"mean_token_accuracy": 0.10604915320873261,
|
|
"num_tokens": 1289684.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 7.427703905105591,
|
|
"epoch": 0.06225715294948781,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000352,
|
|
"loss": 7.2534,
|
|
"mean_token_accuracy": 0.09802542477846146,
|
|
"num_tokens": 1298853.0,
|
|
"step": 705
|
|
},
|
|
{
|
|
"entropy": 7.3198949813842775,
|
|
"epoch": 0.06269869304132815,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003545,
|
|
"loss": 7.214,
|
|
"mean_token_accuracy": 0.10874532908201218,
|
|
"num_tokens": 1309112.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 7.372763156890869,
|
|
"epoch": 0.0631402331331685,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000357,
|
|
"loss": 7.1981,
|
|
"mean_token_accuracy": 0.10583075731992722,
|
|
"num_tokens": 1319064.0,
|
|
"step": 715
|
|
},
|
|
{
|
|
"entropy": 7.269387340545654,
|
|
"epoch": 0.06358177322500883,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003595,
|
|
"loss": 7.1748,
|
|
"mean_token_accuracy": 0.1100200168788433,
|
|
"num_tokens": 1327889.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 7.324726533889771,
|
|
"epoch": 0.06402331331684917,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000362,
|
|
"loss": 7.1938,
|
|
"mean_token_accuracy": 0.10258080512285232,
|
|
"num_tokens": 1337241.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"entropy": 7.319574499130249,
|
|
"epoch": 0.06446485340868952,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003645,
|
|
"loss": 7.2533,
|
|
"mean_token_accuracy": 0.10085726305842399,
|
|
"num_tokens": 1346527.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 7.314885807037354,
|
|
"epoch": 0.06490639350052985,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000367,
|
|
"loss": 7.2315,
|
|
"mean_token_accuracy": 0.10445504561066628,
|
|
"num_tokens": 1355677.0,
|
|
"step": 735
|
|
},
|
|
{
|
|
"entropy": 7.396700429916382,
|
|
"epoch": 0.06534793359237019,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003695,
|
|
"loss": 7.2163,
|
|
"mean_token_accuracy": 0.10588330775499344,
|
|
"num_tokens": 1364874.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 7.285468482971192,
|
|
"epoch": 0.06578947368421052,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000372,
|
|
"loss": 7.1378,
|
|
"mean_token_accuracy": 0.1090671844780445,
|
|
"num_tokens": 1373717.0,
|
|
"step": 745
|
|
},
|
|
{
|
|
"entropy": 7.375531625747681,
|
|
"epoch": 0.06623101377605087,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003745,
|
|
"loss": 7.0955,
|
|
"mean_token_accuracy": 0.10741576477885247,
|
|
"num_tokens": 1382767.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 7.1357104778289795,
|
|
"epoch": 0.0666725538678912,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000377,
|
|
"loss": 7.1389,
|
|
"mean_token_accuracy": 0.10613262876868249,
|
|
"num_tokens": 1391190.0,
|
|
"step": 755
|
|
},
|
|
{
|
|
"entropy": 7.234480524063111,
|
|
"epoch": 0.06711409395973154,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0003795,
|
|
"loss": 7.1509,
|
|
"mean_token_accuracy": 0.10508784130215645,
|
|
"num_tokens": 1400722.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"entropy": 7.402392435073852,
|
|
"epoch": 0.06755563405157189,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000382,
|
|
"loss": 7.1779,
|
|
"mean_token_accuracy": 0.10437385067343712,
|
|
"num_tokens": 1409328.0,
|
|
"step": 765
|
|
},
|
|
{
|
|
"entropy": 7.06873927116394,
|
|
"epoch": 0.06799717414341222,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0003845,
|
|
"loss": 7.0531,
|
|
"mean_token_accuracy": 0.11192933171987533,
|
|
"num_tokens": 1418504.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"entropy": 7.440014839172363,
|
|
"epoch": 0.06843871423525257,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00038700000000000003,
|
|
"loss": 7.1989,
|
|
"mean_token_accuracy": 0.10317453742027283,
|
|
"num_tokens": 1427690.0,
|
|
"step": 775
|
|
},
|
|
{
|
|
"entropy": 7.181108903884888,
|
|
"epoch": 0.0688802543270929,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00038950000000000003,
|
|
"loss": 7.1806,
|
|
"mean_token_accuracy": 0.10798285007476807,
|
|
"num_tokens": 1436798.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"entropy": 7.2046185493469235,
|
|
"epoch": 0.06932179441893324,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00039200000000000004,
|
|
"loss": 7.1646,
|
|
"mean_token_accuracy": 0.10358999595046044,
|
|
"num_tokens": 1446357.0,
|
|
"step": 785
|
|
},
|
|
{
|
|
"entropy": 7.2555629253387455,
|
|
"epoch": 0.06976333451077357,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00039450000000000005,
|
|
"loss": 7.0882,
|
|
"mean_token_accuracy": 0.11000654250383377,
|
|
"num_tokens": 1455998.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"entropy": 7.207996559143067,
|
|
"epoch": 0.07020487460261392,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00039700000000000005,
|
|
"loss": 7.145,
|
|
"mean_token_accuracy": 0.09857687279582024,
|
|
"num_tokens": 1465237.0,
|
|
"step": 795
|
|
},
|
|
{
|
|
"entropy": 7.24621729850769,
|
|
"epoch": 0.07064641469445426,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003995,
|
|
"loss": 7.0958,
|
|
"mean_token_accuracy": 0.11087250858545303,
|
|
"num_tokens": 1474363.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 7.272359037399292,
|
|
"epoch": 0.0710879547862946,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000402,
|
|
"loss": 7.1713,
|
|
"mean_token_accuracy": 0.10843008160591125,
|
|
"num_tokens": 1483379.0,
|
|
"step": 805
|
|
},
|
|
{
|
|
"entropy": 7.28739447593689,
|
|
"epoch": 0.07152949487813494,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004045,
|
|
"loss": 7.1265,
|
|
"mean_token_accuracy": 0.10922098532319069,
|
|
"num_tokens": 1492507.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"entropy": 7.144436979293824,
|
|
"epoch": 0.07197103496997527,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00040699999999999997,
|
|
"loss": 7.0154,
|
|
"mean_token_accuracy": 0.11775125116109848,
|
|
"num_tokens": 1500888.0,
|
|
"step": 815
|
|
},
|
|
{
|
|
"entropy": 7.11500997543335,
|
|
"epoch": 0.07241257506181561,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004095,
|
|
"loss": 7.0709,
|
|
"mean_token_accuracy": 0.10802061259746551,
|
|
"num_tokens": 1510310.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"entropy": 7.1448290824890135,
|
|
"epoch": 0.07285411515365595,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000412,
|
|
"loss": 7.0494,
|
|
"mean_token_accuracy": 0.11422519460320472,
|
|
"num_tokens": 1519427.0,
|
|
"step": 825
|
|
},
|
|
{
|
|
"entropy": 7.2035074710845945,
|
|
"epoch": 0.07329565524549629,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004145,
|
|
"loss": 7.0679,
|
|
"mean_token_accuracy": 0.1063395880162716,
|
|
"num_tokens": 1529456.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"entropy": 7.131991720199585,
|
|
"epoch": 0.07373719533733664,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000417,
|
|
"loss": 7.0241,
|
|
"mean_token_accuracy": 0.11403456106781959,
|
|
"num_tokens": 1537695.0,
|
|
"step": 835
|
|
},
|
|
{
|
|
"entropy": 7.203299617767334,
|
|
"epoch": 0.07417873542917697,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004195,
|
|
"loss": 7.1456,
|
|
"mean_token_accuracy": 0.10954299196600914,
|
|
"num_tokens": 1547511.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"entropy": 7.255322885513306,
|
|
"epoch": 0.07462027552101731,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000422,
|
|
"loss": 7.1315,
|
|
"mean_token_accuracy": 0.1110302060842514,
|
|
"num_tokens": 1557035.0,
|
|
"step": 845
|
|
},
|
|
{
|
|
"entropy": 7.1888104438781735,
|
|
"epoch": 0.07506181561285764,
|
|
"grad_norm": 0.90234375,
|
|
"learning_rate": 0.0004245,
|
|
"loss": 7.0906,
|
|
"mean_token_accuracy": 0.11411306262016296,
|
|
"num_tokens": 1566773.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 7.094766998291016,
|
|
"epoch": 0.07550335570469799,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000427,
|
|
"loss": 7.0787,
|
|
"mean_token_accuracy": 0.10884842053055763,
|
|
"num_tokens": 1576873.0,
|
|
"step": 855
|
|
},
|
|
{
|
|
"entropy": 7.178222894668579,
|
|
"epoch": 0.07594489579653832,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004295,
|
|
"loss": 7.111,
|
|
"mean_token_accuracy": 0.10762306824326515,
|
|
"num_tokens": 1586170.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"entropy": 7.286298131942749,
|
|
"epoch": 0.07638643588837866,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000432,
|
|
"loss": 7.154,
|
|
"mean_token_accuracy": 0.10613771453499794,
|
|
"num_tokens": 1596054.0,
|
|
"step": 865
|
|
},
|
|
{
|
|
"entropy": 7.1001307487487795,
|
|
"epoch": 0.07682797598021901,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004345,
|
|
"loss": 7.0262,
|
|
"mean_token_accuracy": 0.11607334911823272,
|
|
"num_tokens": 1604544.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"entropy": 7.172781848907471,
|
|
"epoch": 0.07726951607205934,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000437,
|
|
"loss": 7.0446,
|
|
"mean_token_accuracy": 0.11472792029380799,
|
|
"num_tokens": 1614580.0,
|
|
"step": 875
|
|
},
|
|
{
|
|
"entropy": 7.132223224639892,
|
|
"epoch": 0.07771105616389969,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004395,
|
|
"loss": 7.1232,
|
|
"mean_token_accuracy": 0.1109985999763012,
|
|
"num_tokens": 1624701.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"entropy": 7.128903436660766,
|
|
"epoch": 0.07815259625574002,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000442,
|
|
"loss": 7.0573,
|
|
"mean_token_accuracy": 0.10825628340244293,
|
|
"num_tokens": 1634085.0,
|
|
"step": 885
|
|
},
|
|
{
|
|
"entropy": 7.123282432556152,
|
|
"epoch": 0.07859413634758036,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004445,
|
|
"loss": 7.0322,
|
|
"mean_token_accuracy": 0.11617021560668946,
|
|
"num_tokens": 1643190.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"entropy": 7.060208940505982,
|
|
"epoch": 0.0790356764394207,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000447,
|
|
"loss": 7.07,
|
|
"mean_token_accuracy": 0.11254222765564918,
|
|
"num_tokens": 1652705.0,
|
|
"step": 895
|
|
},
|
|
{
|
|
"entropy": 7.132848882675171,
|
|
"epoch": 0.07947721653126104,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00044950000000000003,
|
|
"loss": 7.0536,
|
|
"mean_token_accuracy": 0.10692465007305145,
|
|
"num_tokens": 1662210.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"entropy": 7.192712593078613,
|
|
"epoch": 0.07991875662310138,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00045200000000000004,
|
|
"loss": 7.1014,
|
|
"mean_token_accuracy": 0.10652303621172905,
|
|
"num_tokens": 1671893.0,
|
|
"step": 905
|
|
},
|
|
{
|
|
"entropy": 7.059550428390503,
|
|
"epoch": 0.08036029671494171,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00045450000000000004,
|
|
"loss": 7.0402,
|
|
"mean_token_accuracy": 0.11181816533207893,
|
|
"num_tokens": 1681217.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"entropy": 7.149940156936646,
|
|
"epoch": 0.08080183680678206,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00045700000000000005,
|
|
"loss": 7.053,
|
|
"mean_token_accuracy": 0.11131602600216865,
|
|
"num_tokens": 1690447.0,
|
|
"step": 915
|
|
},
|
|
{
|
|
"entropy": 7.081046295166016,
|
|
"epoch": 0.08124337689862239,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00045950000000000006,
|
|
"loss": 7.1332,
|
|
"mean_token_accuracy": 0.10568991601467133,
|
|
"num_tokens": 1700355.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"entropy": 7.16390905380249,
|
|
"epoch": 0.08168491699046274,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000462,
|
|
"loss": 7.028,
|
|
"mean_token_accuracy": 0.10254786685109138,
|
|
"num_tokens": 1709449.0,
|
|
"step": 925
|
|
},
|
|
{
|
|
"entropy": 7.091014242172241,
|
|
"epoch": 0.08212645708230307,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004645,
|
|
"loss": 7.058,
|
|
"mean_token_accuracy": 0.10658924430608749,
|
|
"num_tokens": 1718838.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"entropy": 7.023260927200317,
|
|
"epoch": 0.08256799717414341,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000467,
|
|
"loss": 7.0217,
|
|
"mean_token_accuracy": 0.11323517858982086,
|
|
"num_tokens": 1728594.0,
|
|
"step": 935
|
|
},
|
|
{
|
|
"entropy": 7.1843287467956545,
|
|
"epoch": 0.08300953726598376,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004695,
|
|
"loss": 7.0731,
|
|
"mean_token_accuracy": 0.11138227805495263,
|
|
"num_tokens": 1738814.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"entropy": 7.071042823791504,
|
|
"epoch": 0.08345107735782409,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000472,
|
|
"loss": 7.0089,
|
|
"mean_token_accuracy": 0.11532488241791725,
|
|
"num_tokens": 1747644.0,
|
|
"step": 945
|
|
},
|
|
{
|
|
"entropy": 7.104792213439941,
|
|
"epoch": 0.08389261744966443,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004745,
|
|
"loss": 7.0338,
|
|
"mean_token_accuracy": 0.11352440416812896,
|
|
"num_tokens": 1757489.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"entropy": 6.995518827438355,
|
|
"epoch": 0.08433415754150476,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000477,
|
|
"loss": 6.9488,
|
|
"mean_token_accuracy": 0.11878458335995674,
|
|
"num_tokens": 1767546.0,
|
|
"step": 955
|
|
},
|
|
{
|
|
"entropy": 7.094525289535523,
|
|
"epoch": 0.08477569763334511,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004795,
|
|
"loss": 7.0088,
|
|
"mean_token_accuracy": 0.10509251430630684,
|
|
"num_tokens": 1776035.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"entropy": 7.100050449371338,
|
|
"epoch": 0.08521723772518544,
|
|
"grad_norm": 0.8984375,
|
|
"learning_rate": 0.000482,
|
|
"loss": 7.0869,
|
|
"mean_token_accuracy": 0.10708501487970352,
|
|
"num_tokens": 1786161.0,
|
|
"step": 965
|
|
},
|
|
{
|
|
"entropy": 7.161181020736694,
|
|
"epoch": 0.08565877781702579,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004845,
|
|
"loss": 7.1355,
|
|
"mean_token_accuracy": 0.10680384710431098,
|
|
"num_tokens": 1796093.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"entropy": 7.064108896255493,
|
|
"epoch": 0.08610031790886613,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000487,
|
|
"loss": 6.9829,
|
|
"mean_token_accuracy": 0.1097193941473961,
|
|
"num_tokens": 1805574.0,
|
|
"step": 975
|
|
},
|
|
{
|
|
"entropy": 7.07778491973877,
|
|
"epoch": 0.08654185800070646,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004895,
|
|
"loss": 7.0171,
|
|
"mean_token_accuracy": 0.11008013710379601,
|
|
"num_tokens": 1815175.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"entropy": 7.017868852615356,
|
|
"epoch": 0.08698339809254681,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000492,
|
|
"loss": 6.9322,
|
|
"mean_token_accuracy": 0.11862852200865745,
|
|
"num_tokens": 1824683.0,
|
|
"step": 985
|
|
},
|
|
{
|
|
"entropy": 7.004701948165893,
|
|
"epoch": 0.08742493818438714,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004945,
|
|
"loss": 6.9091,
|
|
"mean_token_accuracy": 0.1145630083978176,
|
|
"num_tokens": 1833174.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"entropy": 7.008507776260376,
|
|
"epoch": 0.08786647827622748,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000497,
|
|
"loss": 7.0124,
|
|
"mean_token_accuracy": 0.1165225401520729,
|
|
"num_tokens": 1842409.0,
|
|
"step": 995
|
|
},
|
|
{
|
|
"entropy": 6.900066137313843,
|
|
"epoch": 0.08830801836806781,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004995,
|
|
"loss": 6.9172,
|
|
"mean_token_accuracy": 0.1189465768635273,
|
|
"num_tokens": 1851441.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"entropy": 7.234589004516602,
|
|
"epoch": 0.08874955845990816,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000499999998589561,
|
|
"loss": 7.0545,
|
|
"mean_token_accuracy": 0.1098681665956974,
|
|
"num_tokens": 1861188.0,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"entropy": 6.916832828521729,
|
|
"epoch": 0.0891910985517485,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004999999928596523,
|
|
"loss": 6.9934,
|
|
"mean_token_accuracy": 0.1134356640279293,
|
|
"num_tokens": 1870284.0,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"entropy": 6.8979510307312015,
|
|
"epoch": 0.08963263864358884,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999999827221219,
|
|
"loss": 6.9508,
|
|
"mean_token_accuracy": 0.11564922854304313,
|
|
"num_tokens": 1879744.0,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"entropy": 7.141992807388306,
|
|
"epoch": 0.09007417873542918,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999999681769696,
|
|
"loss": 6.9612,
|
|
"mean_token_accuracy": 0.1163177601993084,
|
|
"num_tokens": 1889241.0,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"entropy": 6.971645736694336,
|
|
"epoch": 0.09051571882726951,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000499999949224196,
|
|
"loss": 6.969,
|
|
"mean_token_accuracy": 0.11504201143980027,
|
|
"num_tokens": 1898247.0,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"entropy": 7.006195020675659,
|
|
"epoch": 0.09095725891910986,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999999258638013,
|
|
"loss": 6.9244,
|
|
"mean_token_accuracy": 0.11498644798994065,
|
|
"num_tokens": 1907559.0,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"entropy": 6.903467321395874,
|
|
"epoch": 0.09139879901095019,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999998980957861,
|
|
"loss": 6.924,
|
|
"mean_token_accuracy": 0.11912157312035561,
|
|
"num_tokens": 1917072.0,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"entropy": 6.983310127258301,
|
|
"epoch": 0.09184033910279053,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004999998659201508,
|
|
"loss": 6.8753,
|
|
"mean_token_accuracy": 0.11874028518795968,
|
|
"num_tokens": 1926597.0,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"entropy": 7.006656980514526,
|
|
"epoch": 0.09228187919463088,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999998293368961,
|
|
"loss": 6.918,
|
|
"mean_token_accuracy": 0.11443031057715417,
|
|
"num_tokens": 1935978.0,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"entropy": 6.924140882492066,
|
|
"epoch": 0.09272341928647121,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999997883460227,
|
|
"loss": 6.864,
|
|
"mean_token_accuracy": 0.1171707384288311,
|
|
"num_tokens": 1944424.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"entropy": 6.984135913848877,
|
|
"epoch": 0.09316495937831155,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999997429475314,
|
|
"loss": 6.8384,
|
|
"mean_token_accuracy": 0.12138021439313888,
|
|
"num_tokens": 1953844.0,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"entropy": 6.947112941741944,
|
|
"epoch": 0.09360649947015189,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999996931414232,
|
|
"loss": 6.9207,
|
|
"mean_token_accuracy": 0.12120825350284577,
|
|
"num_tokens": 1963974.0,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"entropy": 6.977735805511474,
|
|
"epoch": 0.09404803956199223,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999996389276988,
|
|
"loss": 6.8969,
|
|
"mean_token_accuracy": 0.12291403263807296,
|
|
"num_tokens": 1973466.0,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"entropy": 6.803595685958863,
|
|
"epoch": 0.09448957965383256,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999995803063596,
|
|
"loss": 6.918,
|
|
"mean_token_accuracy": 0.12123456448316575,
|
|
"num_tokens": 1983478.0,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"entropy": 7.003172111511231,
|
|
"epoch": 0.0949311197456729,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999995172774065,
|
|
"loss": 6.9879,
|
|
"mean_token_accuracy": 0.11715293675661087,
|
|
"num_tokens": 1992775.0,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"entropy": 7.030760860443115,
|
|
"epoch": 0.09537265983751325,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999994498408408,
|
|
"loss": 6.9419,
|
|
"mean_token_accuracy": 0.11398354098200798,
|
|
"num_tokens": 2002526.0,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"entropy": 6.936420059204101,
|
|
"epoch": 0.09581419992935358,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999993779966639,
|
|
"loss": 6.9592,
|
|
"mean_token_accuracy": 0.11412434950470925,
|
|
"num_tokens": 2012476.0,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"entropy": 6.838730955123902,
|
|
"epoch": 0.09625574002119393,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004999993017448771,
|
|
"loss": 6.7924,
|
|
"mean_token_accuracy": 0.13063410446047782,
|
|
"num_tokens": 2021252.0,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"entropy": 7.061969757080078,
|
|
"epoch": 0.09669728011303426,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999992210854821,
|
|
"loss": 6.9411,
|
|
"mean_token_accuracy": 0.11837697625160218,
|
|
"num_tokens": 2031438.0,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"entropy": 6.9324125289917,
|
|
"epoch": 0.0971388202048746,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999991360184801,
|
|
"loss": 6.9789,
|
|
"mean_token_accuracy": 0.11443927884101868,
|
|
"num_tokens": 2041319.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"entropy": 6.95938081741333,
|
|
"epoch": 0.09758036029671494,
|
|
"grad_norm": 0.859375,
|
|
"learning_rate": 0.0004999990465438731,
|
|
"loss": 6.9746,
|
|
"mean_token_accuracy": 0.11487890034914017,
|
|
"num_tokens": 2052060.0,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"entropy": 6.891486358642578,
|
|
"epoch": 0.09802190038855528,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004999989526616628,
|
|
"loss": 6.8643,
|
|
"mean_token_accuracy": 0.12554761841893197,
|
|
"num_tokens": 2061331.0,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"entropy": 6.952145195007324,
|
|
"epoch": 0.09846344048039563,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999988543718509,
|
|
"loss": 6.8733,
|
|
"mean_token_accuracy": 0.11660940647125244,
|
|
"num_tokens": 2070006.0,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"entropy": 7.005694484710693,
|
|
"epoch": 0.09890498057223596,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004999987516744394,
|
|
"loss": 6.8403,
|
|
"mean_token_accuracy": 0.12942354679107665,
|
|
"num_tokens": 2079089.0,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"entropy": 6.814391231536865,
|
|
"epoch": 0.0993465206640763,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999986445694303,
|
|
"loss": 6.8164,
|
|
"mean_token_accuracy": 0.12300374433398246,
|
|
"num_tokens": 2087237.0,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"entropy": 6.877581930160522,
|
|
"epoch": 0.09978806075591663,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999985330568258,
|
|
"loss": 6.7838,
|
|
"mean_token_accuracy": 0.12274843603372573,
|
|
"num_tokens": 2096598.0,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"entropy": 6.867826128005982,
|
|
"epoch": 0.10022960084775698,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999984171366278,
|
|
"loss": 6.8802,
|
|
"mean_token_accuracy": 0.11098882853984833,
|
|
"num_tokens": 2106106.0,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"entropy": 6.928639554977417,
|
|
"epoch": 0.10067114093959731,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.000499998296808839,
|
|
"loss": 6.856,
|
|
"mean_token_accuracy": 0.11591004803776742,
|
|
"num_tokens": 2115478.0,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"entropy": 6.942514848709107,
|
|
"epoch": 0.10111268103143765,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004999981720734615,
|
|
"loss": 6.9007,
|
|
"mean_token_accuracy": 0.11097749546170235,
|
|
"num_tokens": 2124439.0,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"entropy": 6.928429985046387,
|
|
"epoch": 0.101554221123278,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999980429304977,
|
|
"loss": 6.8983,
|
|
"mean_token_accuracy": 0.11434343308210373,
|
|
"num_tokens": 2133718.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"entropy": 6.835229969024658,
|
|
"epoch": 0.10199576121511833,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999979093799502,
|
|
"loss": 6.801,
|
|
"mean_token_accuracy": 0.12133207321166992,
|
|
"num_tokens": 2142978.0,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"entropy": 6.8165655612945555,
|
|
"epoch": 0.10243730130695868,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999977714218217,
|
|
"loss": 6.863,
|
|
"mean_token_accuracy": 0.117961073666811,
|
|
"num_tokens": 2152250.0,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"entropy": 6.972959232330322,
|
|
"epoch": 0.102878841398799,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999976290561147,
|
|
"loss": 6.8608,
|
|
"mean_token_accuracy": 0.11624824330210685,
|
|
"num_tokens": 2161620.0,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"entropy": 6.921932792663574,
|
|
"epoch": 0.10332038149063935,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004999974822828322,
|
|
"loss": 6.8726,
|
|
"mean_token_accuracy": 0.1173239678144455,
|
|
"num_tokens": 2170856.0,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"entropy": 6.885772609710694,
|
|
"epoch": 0.10376192158247968,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.000499997331101977,
|
|
"loss": 6.8626,
|
|
"mean_token_accuracy": 0.11568826138973236,
|
|
"num_tokens": 2180926.0,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"entropy": 6.911135244369507,
|
|
"epoch": 0.10420346167432003,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000499997175513552,
|
|
"loss": 6.8419,
|
|
"mean_token_accuracy": 0.11412648186087608,
|
|
"num_tokens": 2190248.0,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"entropy": 6.86896710395813,
|
|
"epoch": 0.10464500176616037,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999970155175603,
|
|
"loss": 6.855,
|
|
"mean_token_accuracy": 0.12222710996866226,
|
|
"num_tokens": 2199833.0,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"entropy": 6.856761360168457,
|
|
"epoch": 0.1050865418580007,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.000499996851114005,
|
|
"loss": 6.8115,
|
|
"mean_token_accuracy": 0.12742498219013215,
|
|
"num_tokens": 2208240.0,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"entropy": 6.837878751754761,
|
|
"epoch": 0.10552808194984105,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004999966823028894,
|
|
"loss": 6.8776,
|
|
"mean_token_accuracy": 0.11117666661739349,
|
|
"num_tokens": 2218758.0,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"entropy": 6.94004077911377,
|
|
"epoch": 0.10596962204168138,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004999965090842168,
|
|
"loss": 6.8665,
|
|
"mean_token_accuracy": 0.12320348769426345,
|
|
"num_tokens": 2228443.0,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"entropy": 6.74250750541687,
|
|
"epoch": 0.10641116213352173,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999963314579905,
|
|
"loss": 6.7084,
|
|
"mean_token_accuracy": 0.1319122113287449,
|
|
"num_tokens": 2236787.0,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"entropy": 6.956686353683471,
|
|
"epoch": 0.10685270222536206,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999961494242139,
|
|
"loss": 6.8901,
|
|
"mean_token_accuracy": 0.11468368023633957,
|
|
"num_tokens": 2247089.0,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"entropy": 6.789958381652832,
|
|
"epoch": 0.1072942423172024,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999959629828908,
|
|
"loss": 6.816,
|
|
"mean_token_accuracy": 0.11569953635334969,
|
|
"num_tokens": 2256045.0,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"entropy": 6.929527616500854,
|
|
"epoch": 0.10773578240904275,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999957721340248,
|
|
"loss": 6.8769,
|
|
"mean_token_accuracy": 0.12202595993876457,
|
|
"num_tokens": 2265835.0,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"entropy": 6.807823610305786,
|
|
"epoch": 0.10817732250088308,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004999955768776195,
|
|
"loss": 6.8076,
|
|
"mean_token_accuracy": 0.11702087000012398,
|
|
"num_tokens": 2275318.0,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"entropy": 6.869472360610962,
|
|
"epoch": 0.10861886259272342,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999953772136788,
|
|
"loss": 6.7978,
|
|
"mean_token_accuracy": 0.12102394551038742,
|
|
"num_tokens": 2284821.0,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"entropy": 6.885239315032959,
|
|
"epoch": 0.10906040268456375,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999951731422068,
|
|
"loss": 6.7645,
|
|
"mean_token_accuracy": 0.12054353281855583,
|
|
"num_tokens": 2294013.0,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"entropy": 6.847450494766235,
|
|
"epoch": 0.1095019427764041,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999949646632072,
|
|
"loss": 6.7626,
|
|
"mean_token_accuracy": 0.12177001982927323,
|
|
"num_tokens": 2302727.0,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"entropy": 6.8077342987060545,
|
|
"epoch": 0.10994348286824443,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999947517766842,
|
|
"loss": 6.8031,
|
|
"mean_token_accuracy": 0.12163913846015931,
|
|
"num_tokens": 2312032.0,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"entropy": 6.986685180664063,
|
|
"epoch": 0.11038502296008477,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000499994534482642,
|
|
"loss": 6.8748,
|
|
"mean_token_accuracy": 0.1190544456243515,
|
|
"num_tokens": 2321839.0,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"entropy": 6.811014032363891,
|
|
"epoch": 0.11082656305192512,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004999943127810847,
|
|
"loss": 6.8536,
|
|
"mean_token_accuracy": 0.1122577242553234,
|
|
"num_tokens": 2331255.0,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"entropy": 6.794656276702881,
|
|
"epoch": 0.11126810314376545,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999940866720169,
|
|
"loss": 6.6705,
|
|
"mean_token_accuracy": 0.12881582453846932,
|
|
"num_tokens": 2340038.0,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"entropy": 6.71243953704834,
|
|
"epoch": 0.1117096432356058,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999938561554429,
|
|
"loss": 6.7797,
|
|
"mean_token_accuracy": 0.12242485880851746,
|
|
"num_tokens": 2348901.0,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"entropy": 6.858448314666748,
|
|
"epoch": 0.11215118332744613,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004999936212313672,
|
|
"loss": 6.8659,
|
|
"mean_token_accuracy": 0.11461173072457313,
|
|
"num_tokens": 2358842.0,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"entropy": 6.8239977836608885,
|
|
"epoch": 0.11259272341928647,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999933818997943,
|
|
"loss": 6.7596,
|
|
"mean_token_accuracy": 0.12424605414271354,
|
|
"num_tokens": 2368650.0,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"entropy": 6.831825399398804,
|
|
"epoch": 0.1130342635111268,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999931381607292,
|
|
"loss": 6.8252,
|
|
"mean_token_accuracy": 0.12058763056993485,
|
|
"num_tokens": 2377916.0,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"entropy": 6.818245553970337,
|
|
"epoch": 0.11347580360296715,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999928900141764,
|
|
"loss": 6.7698,
|
|
"mean_token_accuracy": 0.12198482304811478,
|
|
"num_tokens": 2387507.0,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"entropy": 6.819052505493164,
|
|
"epoch": 0.1139173436948075,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000499992637460141,
|
|
"loss": 6.8052,
|
|
"mean_token_accuracy": 0.12523134648799897,
|
|
"num_tokens": 2396148.0,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"entropy": 6.816552209854126,
|
|
"epoch": 0.11435888378664782,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999923804986275,
|
|
"loss": 6.693,
|
|
"mean_token_accuracy": 0.11803872361779214,
|
|
"num_tokens": 2404891.0,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"entropy": 6.840794086456299,
|
|
"epoch": 0.11480042387848817,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999921191296415,
|
|
"loss": 6.7153,
|
|
"mean_token_accuracy": 0.12199744880199433,
|
|
"num_tokens": 2414406.0,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"entropy": 6.7452630519866945,
|
|
"epoch": 0.1152419639703285,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999918533531877,
|
|
"loss": 6.8046,
|
|
"mean_token_accuracy": 0.1228412576019764,
|
|
"num_tokens": 2424363.0,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"entropy": 6.852883148193359,
|
|
"epoch": 0.11568350406216885,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999915831692714,
|
|
"loss": 6.7419,
|
|
"mean_token_accuracy": 0.1251549780368805,
|
|
"num_tokens": 2433753.0,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"entropy": 6.7218766689300535,
|
|
"epoch": 0.11612504415400918,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999913085778981,
|
|
"loss": 6.7685,
|
|
"mean_token_accuracy": 0.1185051940381527,
|
|
"num_tokens": 2443275.0,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"entropy": 6.873374080657959,
|
|
"epoch": 0.11656658424584952,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999910295790729,
|
|
"loss": 6.7937,
|
|
"mean_token_accuracy": 0.11835979968309403,
|
|
"num_tokens": 2452510.0,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"entropy": 6.8684648990631105,
|
|
"epoch": 0.11700812433768987,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999907461728014,
|
|
"loss": 6.8746,
|
|
"mean_token_accuracy": 0.1169828750193119,
|
|
"num_tokens": 2462742.0,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"entropy": 6.740426445007325,
|
|
"epoch": 0.1174496644295302,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999904583590893,
|
|
"loss": 6.7434,
|
|
"mean_token_accuracy": 0.12029099017381668,
|
|
"num_tokens": 2471409.0,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"entropy": 6.839800691604614,
|
|
"epoch": 0.11789120452137054,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004999901661379418,
|
|
"loss": 6.6931,
|
|
"mean_token_accuracy": 0.12773663252592088,
|
|
"num_tokens": 2481011.0,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"entropy": 6.690527105331421,
|
|
"epoch": 0.11833274461321087,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004999898695093652,
|
|
"loss": 6.7866,
|
|
"mean_token_accuracy": 0.12104339599609375,
|
|
"num_tokens": 2490664.0,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"entropy": 6.818962049484253,
|
|
"epoch": 0.11877428470505122,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999895684733648,
|
|
"loss": 6.7279,
|
|
"mean_token_accuracy": 0.12824407517910003,
|
|
"num_tokens": 2499799.0,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"entropy": 6.8539299964904785,
|
|
"epoch": 0.11921582479689156,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004999892630299467,
|
|
"loss": 6.7045,
|
|
"mean_token_accuracy": 0.1257259279489517,
|
|
"num_tokens": 2508780.0,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"entropy": 6.706318616867065,
|
|
"epoch": 0.1196573648887319,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999889531791171,
|
|
"loss": 6.7138,
|
|
"mean_token_accuracy": 0.12127138078212737,
|
|
"num_tokens": 2517741.0,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"entropy": 6.766215896606445,
|
|
"epoch": 0.12009890498057224,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999886389208817,
|
|
"loss": 6.7972,
|
|
"mean_token_accuracy": 0.11826895922422409,
|
|
"num_tokens": 2528742.0,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"entropy": 6.717579746246338,
|
|
"epoch": 0.12054044507241257,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0004999883202552468,
|
|
"loss": 6.7345,
|
|
"mean_token_accuracy": 0.12455343306064606,
|
|
"num_tokens": 2538609.0,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"entropy": 6.882754898071289,
|
|
"epoch": 0.12098198516425292,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999879971822189,
|
|
"loss": 6.7157,
|
|
"mean_token_accuracy": 0.11966117843985558,
|
|
"num_tokens": 2547772.0,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"entropy": 6.69037971496582,
|
|
"epoch": 0.12142352525609325,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999876697018038,
|
|
"loss": 6.6897,
|
|
"mean_token_accuracy": 0.12769502475857736,
|
|
"num_tokens": 2556114.0,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"entropy": 6.8372406482696535,
|
|
"epoch": 0.1218650653479336,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999873378140085,
|
|
"loss": 6.7182,
|
|
"mean_token_accuracy": 0.12253274098038673,
|
|
"num_tokens": 2566814.0,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"entropy": 6.714360284805298,
|
|
"epoch": 0.12230660543977394,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999870015188389,
|
|
"loss": 6.6914,
|
|
"mean_token_accuracy": 0.12260655164718628,
|
|
"num_tokens": 2576030.0,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"entropy": 6.743939208984375,
|
|
"epoch": 0.12274814553161427,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999866608163021,
|
|
"loss": 6.7176,
|
|
"mean_token_accuracy": 0.1260794699192047,
|
|
"num_tokens": 2585756.0,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"entropy": 6.800289154052734,
|
|
"epoch": 0.12318968562345461,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999863157064045,
|
|
"loss": 6.7797,
|
|
"mean_token_accuracy": 0.1238692507147789,
|
|
"num_tokens": 2595676.0,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"entropy": 6.8109955310821535,
|
|
"epoch": 0.12363122571529495,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999859661891529,
|
|
"loss": 6.7624,
|
|
"mean_token_accuracy": 0.1246532566845417,
|
|
"num_tokens": 2606197.0,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"entropy": 6.76338677406311,
|
|
"epoch": 0.12407276580713529,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999856122645543,
|
|
"loss": 6.6854,
|
|
"mean_token_accuracy": 0.12518818601965903,
|
|
"num_tokens": 2615311.0,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"entropy": 6.725164937973022,
|
|
"epoch": 0.12451430589897562,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999852539326154,
|
|
"loss": 6.5931,
|
|
"mean_token_accuracy": 0.12619537115097046,
|
|
"num_tokens": 2624074.0,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"entropy": 6.681712102890015,
|
|
"epoch": 0.12495584599081597,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999848911933434,
|
|
"loss": 6.7411,
|
|
"mean_token_accuracy": 0.13295672461390495,
|
|
"num_tokens": 2633877.0,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"entropy": 6.7498420715332035,
|
|
"epoch": 0.1253973860826563,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0004999845240467453,
|
|
"loss": 6.6702,
|
|
"mean_token_accuracy": 0.12471742331981658,
|
|
"num_tokens": 2643330.0,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"entropy": 6.74535722732544,
|
|
"epoch": 0.12583892617449666,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999841524928282,
|
|
"loss": 6.6543,
|
|
"mean_token_accuracy": 0.12766205966472627,
|
|
"num_tokens": 2652070.0,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"entropy": 6.722480249404907,
|
|
"epoch": 0.126280466266337,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999837765315997,
|
|
"loss": 6.697,
|
|
"mean_token_accuracy": 0.1277957484126091,
|
|
"num_tokens": 2660546.0,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"entropy": 6.598982477188111,
|
|
"epoch": 0.12672200635817732,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0004999833961630669,
|
|
"loss": 6.5999,
|
|
"mean_token_accuracy": 0.1297621488571167,
|
|
"num_tokens": 2669938.0,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"entropy": 6.852873420715332,
|
|
"epoch": 0.12716354645001765,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0004999830113872374,
|
|
"loss": 6.7248,
|
|
"mean_token_accuracy": 0.12213384285569191,
|
|
"num_tokens": 2679814.0,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"entropy": 6.585267829895019,
|
|
"epoch": 0.127605086541858,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999826222041186,
|
|
"loss": 6.6355,
|
|
"mean_token_accuracy": 0.13383011817932128,
|
|
"num_tokens": 2688733.0,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"entropy": 6.706393527984619,
|
|
"epoch": 0.12804662663369834,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999822286137182,
|
|
"loss": 6.6188,
|
|
"mean_token_accuracy": 0.12774292454123498,
|
|
"num_tokens": 2697744.0,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"entropy": 6.633390617370606,
|
|
"epoch": 0.12848816672553867,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999818306160439,
|
|
"loss": 6.5827,
|
|
"mean_token_accuracy": 0.13278514444828032,
|
|
"num_tokens": 2707037.0,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"entropy": 6.808015394210815,
|
|
"epoch": 0.12892970681737903,
|
|
"grad_norm": 0.90625,
|
|
"learning_rate": 0.0004999814282111034,
|
|
"loss": 6.7453,
|
|
"mean_token_accuracy": 0.12486135885119438,
|
|
"num_tokens": 2717345.0,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"entropy": 6.7304778575897215,
|
|
"epoch": 0.12937124690921936,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999810213989047,
|
|
"loss": 6.7017,
|
|
"mean_token_accuracy": 0.12036777138710023,
|
|
"num_tokens": 2726892.0,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"entropy": 6.667453098297119,
|
|
"epoch": 0.1298127870010597,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999806101794558,
|
|
"loss": 6.6615,
|
|
"mean_token_accuracy": 0.12705308422446251,
|
|
"num_tokens": 2736479.0,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"entropy": 6.809973049163818,
|
|
"epoch": 0.13025432709290002,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999801945527648,
|
|
"loss": 6.7078,
|
|
"mean_token_accuracy": 0.12117967531085014,
|
|
"num_tokens": 2745998.0,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"entropy": 6.725075721740723,
|
|
"epoch": 0.13069586718474038,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999797745188395,
|
|
"loss": 6.6906,
|
|
"mean_token_accuracy": 0.12821464985609055,
|
|
"num_tokens": 2754346.0,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"entropy": 6.686868619918823,
|
|
"epoch": 0.13113740727658071,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999793500776886,
|
|
"loss": 6.6285,
|
|
"mean_token_accuracy": 0.1294437274336815,
|
|
"num_tokens": 2763391.0,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"entropy": 6.674964761734008,
|
|
"epoch": 0.13157894736842105,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999789212293201,
|
|
"loss": 6.6827,
|
|
"mean_token_accuracy": 0.12898893728852273,
|
|
"num_tokens": 2772764.0,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"entropy": 6.775622892379761,
|
|
"epoch": 0.1320204874602614,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004999784879737423,
|
|
"loss": 6.78,
|
|
"mean_token_accuracy": 0.12160285860300064,
|
|
"num_tokens": 2782312.0,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"entropy": 6.745266675949097,
|
|
"epoch": 0.13246202755210174,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999780503109642,
|
|
"loss": 6.6798,
|
|
"mean_token_accuracy": 0.1227384127676487,
|
|
"num_tokens": 2791159.0,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"entropy": 6.650141859054566,
|
|
"epoch": 0.13290356764394207,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999776082409939,
|
|
"loss": 6.5068,
|
|
"mean_token_accuracy": 0.13456878885626794,
|
|
"num_tokens": 2799319.0,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"entropy": 6.6320771217346195,
|
|
"epoch": 0.1333451077357824,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999771617638401,
|
|
"loss": 6.6316,
|
|
"mean_token_accuracy": 0.12712259590625763,
|
|
"num_tokens": 2807401.0,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"entropy": 6.758873414993286,
|
|
"epoch": 0.13378664782762276,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999767108795118,
|
|
"loss": 6.6961,
|
|
"mean_token_accuracy": 0.12330949455499648,
|
|
"num_tokens": 2817734.0,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"entropy": 6.644004774093628,
|
|
"epoch": 0.1342281879194631,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999762555880176,
|
|
"loss": 6.6783,
|
|
"mean_token_accuracy": 0.12152940481901169,
|
|
"num_tokens": 2828235.0,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"entropy": 6.743390846252441,
|
|
"epoch": 0.13466972801130342,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999757958893666,
|
|
"loss": 6.7124,
|
|
"mean_token_accuracy": 0.1237283930182457,
|
|
"num_tokens": 2837453.0,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"entropy": 6.60786280632019,
|
|
"epoch": 0.13511126810314378,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0004999753317835677,
|
|
"loss": 6.6795,
|
|
"mean_token_accuracy": 0.12087962031364441,
|
|
"num_tokens": 2847055.0,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"entropy": 6.7429241180419925,
|
|
"epoch": 0.1355528081949841,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999748632706299,
|
|
"loss": 6.6568,
|
|
"mean_token_accuracy": 0.13167096227407454,
|
|
"num_tokens": 2857101.0,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"entropy": 6.673015403747558,
|
|
"epoch": 0.13599434828682444,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999743903505626,
|
|
"loss": 6.553,
|
|
"mean_token_accuracy": 0.1336723633110523,
|
|
"num_tokens": 2866685.0,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"entropy": 6.676457214355469,
|
|
"epoch": 0.13643588837866477,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999739130233749,
|
|
"loss": 6.6731,
|
|
"mean_token_accuracy": 0.12713466510176658,
|
|
"num_tokens": 2876022.0,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"entropy": 6.783061361312866,
|
|
"epoch": 0.13687742847050513,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999734312890761,
|
|
"loss": 6.6062,
|
|
"mean_token_accuracy": 0.12626957073807715,
|
|
"num_tokens": 2885560.0,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"entropy": 6.59228024482727,
|
|
"epoch": 0.13731896856234546,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004999729451476757,
|
|
"loss": 6.6439,
|
|
"mean_token_accuracy": 0.12623701319098474,
|
|
"num_tokens": 2894686.0,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"entropy": 6.730476140975952,
|
|
"epoch": 0.1377605086541858,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999724545991835,
|
|
"loss": 6.6588,
|
|
"mean_token_accuracy": 0.13341889455914496,
|
|
"num_tokens": 2904390.0,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"entropy": 6.662082195281982,
|
|
"epoch": 0.13820204874602615,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999719596436086,
|
|
"loss": 6.6982,
|
|
"mean_token_accuracy": 0.12678939029574393,
|
|
"num_tokens": 2913311.0,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"entropy": 6.736054372787476,
|
|
"epoch": 0.13864358883786648,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004999714602809611,
|
|
"loss": 6.578,
|
|
"mean_token_accuracy": 0.13448369055986403,
|
|
"num_tokens": 2923196.0,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"entropy": 6.637621927261352,
|
|
"epoch": 0.1390851289297068,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999709565112506,
|
|
"loss": 6.6428,
|
|
"mean_token_accuracy": 0.12785085365176202,
|
|
"num_tokens": 2932813.0,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"entropy": 6.658017587661743,
|
|
"epoch": 0.13952666902154715,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.000499970448334487,
|
|
"loss": 6.5634,
|
|
"mean_token_accuracy": 0.119098000228405,
|
|
"num_tokens": 2942694.0,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"entropy": 6.5801304340362545,
|
|
"epoch": 0.1399682091133875,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999699357506803,
|
|
"loss": 6.5185,
|
|
"mean_token_accuracy": 0.1330326519906521,
|
|
"num_tokens": 2950932.0,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"entropy": 6.701116371154785,
|
|
"epoch": 0.14040974920522784,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999694187598406,
|
|
"loss": 6.6085,
|
|
"mean_token_accuracy": 0.1314128704369068,
|
|
"num_tokens": 2960102.0,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"entropy": 6.627189350128174,
|
|
"epoch": 0.14085128929706817,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004999688973619777,
|
|
"loss": 6.6088,
|
|
"mean_token_accuracy": 0.12604895159602164,
|
|
"num_tokens": 2969968.0,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"entropy": 6.588514184951782,
|
|
"epoch": 0.14129282938890853,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004999683715571022,
|
|
"loss": 6.5542,
|
|
"mean_token_accuracy": 0.1347218669950962,
|
|
"num_tokens": 2978880.0,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"entropy": 6.644829893112183,
|
|
"epoch": 0.14173436948074886,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004999678413452242,
|
|
"loss": 6.5863,
|
|
"mean_token_accuracy": 0.12890450209379195,
|
|
"num_tokens": 2988369.0,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"entropy": 6.667511320114135,
|
|
"epoch": 0.1421759095725892,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004999673067263542,
|
|
"loss": 6.6373,
|
|
"mean_token_accuracy": 0.12620072290301323,
|
|
"num_tokens": 2997070.0,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"entropy": 6.682367372512817,
|
|
"epoch": 0.14261744966442952,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999667677005026,
|
|
"loss": 6.5749,
|
|
"mean_token_accuracy": 0.13310741856694222,
|
|
"num_tokens": 3006547.0,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"entropy": 6.584122562408448,
|
|
"epoch": 0.14305898975626988,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004999662242676799,
|
|
"loss": 6.5986,
|
|
"mean_token_accuracy": 0.1310425490140915,
|
|
"num_tokens": 3015821.0,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"entropy": 6.665965223312378,
|
|
"epoch": 0.1435005298481102,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999656764278968,
|
|
"loss": 6.5655,
|
|
"mean_token_accuracy": 0.1309148021042347,
|
|
"num_tokens": 3024750.0,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"entropy": 6.623423147201538,
|
|
"epoch": 0.14394206993995054,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999651241811642,
|
|
"loss": 6.5389,
|
|
"mean_token_accuracy": 0.1308048278093338,
|
|
"num_tokens": 3033345.0,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"entropy": 6.59770941734314,
|
|
"epoch": 0.1443836100317909,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999645675274925,
|
|
"loss": 6.5209,
|
|
"mean_token_accuracy": 0.13443350791931152,
|
|
"num_tokens": 3042060.0,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"entropy": 6.660798358917236,
|
|
"epoch": 0.14482515012363123,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999640064668931,
|
|
"loss": 6.6684,
|
|
"mean_token_accuracy": 0.12563745751976968,
|
|
"num_tokens": 3052490.0,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"entropy": 6.726674222946167,
|
|
"epoch": 0.14526669021547156,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999634409993766,
|
|
"loss": 6.6441,
|
|
"mean_token_accuracy": 0.12772516757249833,
|
|
"num_tokens": 3061934.0,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"entropy": 6.556687307357788,
|
|
"epoch": 0.1457082303073119,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999628711249544,
|
|
"loss": 6.5591,
|
|
"mean_token_accuracy": 0.13611432090401648,
|
|
"num_tokens": 3070890.0,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"entropy": 6.703302097320557,
|
|
"epoch": 0.14614977039915225,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004999622968436373,
|
|
"loss": 6.5614,
|
|
"mean_token_accuracy": 0.12631918862462044,
|
|
"num_tokens": 3079933.0,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"entropy": 6.5565461158752445,
|
|
"epoch": 0.14659131049099258,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999617181554369,
|
|
"loss": 6.6078,
|
|
"mean_token_accuracy": 0.13254086449742317,
|
|
"num_tokens": 3089910.0,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"entropy": 6.718549633026123,
|
|
"epoch": 0.1470328505828329,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004999611350603643,
|
|
"loss": 6.5916,
|
|
"mean_token_accuracy": 0.13437702059745787,
|
|
"num_tokens": 3098676.0,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"entropy": 6.595283174514771,
|
|
"epoch": 0.14747439067467327,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000499960547558431,
|
|
"loss": 6.7159,
|
|
"mean_token_accuracy": 0.1256335400044918,
|
|
"num_tokens": 3109055.0,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"entropy": 6.624332904815674,
|
|
"epoch": 0.1479159307665136,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999599556496486,
|
|
"loss": 6.5472,
|
|
"mean_token_accuracy": 0.13295547068119049,
|
|
"num_tokens": 3117517.0,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"entropy": 6.573185825347901,
|
|
"epoch": 0.14835747085835393,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999593593340286,
|
|
"loss": 6.5544,
|
|
"mean_token_accuracy": 0.13249536529183387,
|
|
"num_tokens": 3126606.0,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"entropy": 6.704092359542846,
|
|
"epoch": 0.14879901095019427,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004999587586115826,
|
|
"loss": 6.5733,
|
|
"mean_token_accuracy": 0.12993996366858482,
|
|
"num_tokens": 3135444.0,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"entropy": 6.57456955909729,
|
|
"epoch": 0.14924055104203462,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0004999581534823226,
|
|
"loss": 6.4927,
|
|
"mean_token_accuracy": 0.14061653688549997,
|
|
"num_tokens": 3144967.0,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"entropy": 6.607442092895508,
|
|
"epoch": 0.14968209113387496,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999575439462601,
|
|
"loss": 6.522,
|
|
"mean_token_accuracy": 0.12690635845065118,
|
|
"num_tokens": 3153898.0,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"entropy": 6.536995601654053,
|
|
"epoch": 0.1501236312257153,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999569300034075,
|
|
"loss": 6.5921,
|
|
"mean_token_accuracy": 0.12649724259972572,
|
|
"num_tokens": 3162341.0,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"entropy": 6.60729718208313,
|
|
"epoch": 0.15056517131755565,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999563116537764,
|
|
"loss": 6.3826,
|
|
"mean_token_accuracy": 0.14100785404443741,
|
|
"num_tokens": 3171420.0,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"entropy": 6.54091591835022,
|
|
"epoch": 0.15100671140939598,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999556888973792,
|
|
"loss": 6.4366,
|
|
"mean_token_accuracy": 0.13033056780695915,
|
|
"num_tokens": 3180221.0,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"entropy": 6.6625391960144045,
|
|
"epoch": 0.1514482515012363,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999550617342279,
|
|
"loss": 6.587,
|
|
"mean_token_accuracy": 0.12798949852585792,
|
|
"num_tokens": 3189156.0,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"entropy": 6.539492607116699,
|
|
"epoch": 0.15188979159307664,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499954430164335,
|
|
"loss": 6.5149,
|
|
"mean_token_accuracy": 0.13213447630405425,
|
|
"num_tokens": 3199870.0,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"entropy": 6.559759998321534,
|
|
"epoch": 0.152331331684917,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999537941877127,
|
|
"loss": 6.4883,
|
|
"mean_token_accuracy": 0.13440720960497857,
|
|
"num_tokens": 3208815.0,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"entropy": 6.604632616043091,
|
|
"epoch": 0.15277287177675733,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0004999531538043735,
|
|
"loss": 6.5717,
|
|
"mean_token_accuracy": 0.13279605731368066,
|
|
"num_tokens": 3218692.0,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"entropy": 6.584544372558594,
|
|
"epoch": 0.15321441186859766,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999525090143298,
|
|
"loss": 6.5367,
|
|
"mean_token_accuracy": 0.14099612906575204,
|
|
"num_tokens": 3227604.0,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"entropy": 6.629879665374756,
|
|
"epoch": 0.15365595196043802,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999518598175946,
|
|
"loss": 6.5446,
|
|
"mean_token_accuracy": 0.14111834466457368,
|
|
"num_tokens": 3237912.0,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"entropy": 6.453984022140503,
|
|
"epoch": 0.15409749205227835,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999512062141805,
|
|
"loss": 6.4239,
|
|
"mean_token_accuracy": 0.13701695203781128,
|
|
"num_tokens": 3246003.0,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"entropy": 6.6436468124389645,
|
|
"epoch": 0.15453903214411868,
|
|
"grad_norm": 0.85546875,
|
|
"learning_rate": 0.0004999505482040999,
|
|
"loss": 6.5558,
|
|
"mean_token_accuracy": 0.13121686428785323,
|
|
"num_tokens": 3256363.0,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"entropy": 6.583334445953369,
|
|
"epoch": 0.154980572235959,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999498857873662,
|
|
"loss": 6.5518,
|
|
"mean_token_accuracy": 0.13320463374257088,
|
|
"num_tokens": 3265822.0,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"entropy": 6.478305387496948,
|
|
"epoch": 0.15542211232779937,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999492189639921,
|
|
"loss": 6.4224,
|
|
"mean_token_accuracy": 0.13528963476419448,
|
|
"num_tokens": 3274614.0,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"entropy": 6.61256685256958,
|
|
"epoch": 0.1558636524196397,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999485477339907,
|
|
"loss": 6.5201,
|
|
"mean_token_accuracy": 0.13317029252648355,
|
|
"num_tokens": 3283800.0,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"entropy": 6.526872968673706,
|
|
"epoch": 0.15630519251148003,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999478720973753,
|
|
"loss": 6.4319,
|
|
"mean_token_accuracy": 0.14009243845939637,
|
|
"num_tokens": 3293221.0,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"entropy": 6.5570995807647705,
|
|
"epoch": 0.1567467326033204,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.000499947192054159,
|
|
"loss": 6.6029,
|
|
"mean_token_accuracy": 0.1324251540005207,
|
|
"num_tokens": 3302852.0,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"entropy": 6.650819587707519,
|
|
"epoch": 0.15718827269516072,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000499946507604355,
|
|
"loss": 6.4641,
|
|
"mean_token_accuracy": 0.14009604677557946,
|
|
"num_tokens": 3311752.0,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"entropy": 6.473453664779663,
|
|
"epoch": 0.15762981278700106,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499945818747977,
|
|
"loss": 6.5492,
|
|
"mean_token_accuracy": 0.13722263872623444,
|
|
"num_tokens": 3321339.0,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"entropy": 6.671073293685913,
|
|
"epoch": 0.1580713528788414,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999451254850383,
|
|
"loss": 6.5514,
|
|
"mean_token_accuracy": 0.1324629843235016,
|
|
"num_tokens": 3330269.0,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"entropy": 6.550148677825928,
|
|
"epoch": 0.15851289297068175,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999444278155525,
|
|
"loss": 6.4576,
|
|
"mean_token_accuracy": 0.1376182422041893,
|
|
"num_tokens": 3340770.0,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"entropy": 6.544644594192505,
|
|
"epoch": 0.15895443306252208,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999437257395333,
|
|
"loss": 6.5753,
|
|
"mean_token_accuracy": 0.13476464301347732,
|
|
"num_tokens": 3349976.0,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"entropy": 6.597910165786743,
|
|
"epoch": 0.1593959731543624,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999430192569944,
|
|
"loss": 6.6158,
|
|
"mean_token_accuracy": 0.12608520165085793,
|
|
"num_tokens": 3359764.0,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"entropy": 6.539244031906128,
|
|
"epoch": 0.15983751324620277,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999423083679498,
|
|
"loss": 6.4868,
|
|
"mean_token_accuracy": 0.13115186169743537,
|
|
"num_tokens": 3369939.0,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"entropy": 6.639003801345825,
|
|
"epoch": 0.1602790533380431,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999415930724133,
|
|
"loss": 6.6346,
|
|
"mean_token_accuracy": 0.12994891554117202,
|
|
"num_tokens": 3381326.0,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"entropy": 6.659589242935181,
|
|
"epoch": 0.16072059342988343,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004999408733703988,
|
|
"loss": 6.5949,
|
|
"mean_token_accuracy": 0.12143847793340683,
|
|
"num_tokens": 3391016.0,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"entropy": 6.533053731918335,
|
|
"epoch": 0.16116213352172376,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004999401492619207,
|
|
"loss": 6.3366,
|
|
"mean_token_accuracy": 0.13945143148303032,
|
|
"num_tokens": 3400294.0,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"entropy": 6.587634897232055,
|
|
"epoch": 0.16160367361356412,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999394207469928,
|
|
"loss": 6.5447,
|
|
"mean_token_accuracy": 0.12779992073774338,
|
|
"num_tokens": 3409685.0,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"entropy": 6.636557579040527,
|
|
"epoch": 0.16204521370540445,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999386878256297,
|
|
"loss": 6.4611,
|
|
"mean_token_accuracy": 0.1320968374609947,
|
|
"num_tokens": 3418946.0,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"entropy": 6.423011112213135,
|
|
"epoch": 0.16248675379724478,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999379504978457,
|
|
"loss": 6.468,
|
|
"mean_token_accuracy": 0.13338307663798332,
|
|
"num_tokens": 3428245.0,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"entropy": 6.578200912475586,
|
|
"epoch": 0.16292829388908514,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000499937208763655,
|
|
"loss": 6.4741,
|
|
"mean_token_accuracy": 0.1402788795530796,
|
|
"num_tokens": 3437580.0,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"entropy": 6.617180633544922,
|
|
"epoch": 0.16336983398092547,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999364626230724,
|
|
"loss": 6.4472,
|
|
"mean_token_accuracy": 0.14007550328969956,
|
|
"num_tokens": 3446393.0,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"entropy": 6.451139450073242,
|
|
"epoch": 0.1638113740727658,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999357120761124,
|
|
"loss": 6.551,
|
|
"mean_token_accuracy": 0.13571736514568328,
|
|
"num_tokens": 3455956.0,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"entropy": 6.6568724632263185,
|
|
"epoch": 0.16425291416460613,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999349571227898,
|
|
"loss": 6.5495,
|
|
"mean_token_accuracy": 0.13430711701512338,
|
|
"num_tokens": 3465722.0,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"entropy": 6.55973629951477,
|
|
"epoch": 0.1646944542564465,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004999341977631193,
|
|
"loss": 6.52,
|
|
"mean_token_accuracy": 0.12885117009282113,
|
|
"num_tokens": 3475467.0,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"entropy": 6.513189268112183,
|
|
"epoch": 0.16513599434828682,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999334339971157,
|
|
"loss": 6.4162,
|
|
"mean_token_accuracy": 0.13661258816719055,
|
|
"num_tokens": 3484931.0,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"entropy": 6.572188520431519,
|
|
"epoch": 0.16557753444012716,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004999326658247942,
|
|
"loss": 6.5161,
|
|
"mean_token_accuracy": 0.13407543525099755,
|
|
"num_tokens": 3494001.0,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"entropy": 6.522925567626953,
|
|
"epoch": 0.16601907453196751,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999318932461696,
|
|
"loss": 6.4558,
|
|
"mean_token_accuracy": 0.1386028841137886,
|
|
"num_tokens": 3503021.0,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"entropy": 6.46082649230957,
|
|
"epoch": 0.16646061462380785,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004999311162612571,
|
|
"loss": 6.4843,
|
|
"mean_token_accuracy": 0.13625017702579498,
|
|
"num_tokens": 3513005.0,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"entropy": 6.661849021911621,
|
|
"epoch": 0.16690215471564818,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.000499930334870072,
|
|
"loss": 6.5629,
|
|
"mean_token_accuracy": 0.13853515014052392,
|
|
"num_tokens": 3523219.0,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"entropy": 6.584684419631958,
|
|
"epoch": 0.1673436948074885,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004999295490726296,
|
|
"loss": 6.51,
|
|
"mean_token_accuracy": 0.13359814062714576,
|
|
"num_tokens": 3532917.0,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"entropy": 6.513212633132935,
|
|
"epoch": 0.16778523489932887,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999287588689453,
|
|
"loss": 6.5245,
|
|
"mean_token_accuracy": 0.1389150969684124,
|
|
"num_tokens": 3542845.0,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"entropy": 6.599180459976196,
|
|
"epoch": 0.1682267749911692,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999279642590344,
|
|
"loss": 6.5438,
|
|
"mean_token_accuracy": 0.13201173320412635,
|
|
"num_tokens": 3552126.0,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"entropy": 6.5960267066955565,
|
|
"epoch": 0.16866831508300953,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999271652429127,
|
|
"loss": 6.5561,
|
|
"mean_token_accuracy": 0.12684691920876504,
|
|
"num_tokens": 3561254.0,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"entropy": 6.528163957595825,
|
|
"epoch": 0.1691098551748499,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999263618205958,
|
|
"loss": 6.3933,
|
|
"mean_token_accuracy": 0.1403527893126011,
|
|
"num_tokens": 3569781.0,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"entropy": 6.555511140823365,
|
|
"epoch": 0.16955139526669022,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999255539920993,
|
|
"loss": 6.4866,
|
|
"mean_token_accuracy": 0.13409090787172318,
|
|
"num_tokens": 3579664.0,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"entropy": 6.567546367645264,
|
|
"epoch": 0.16999293535853055,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999247417574391,
|
|
"loss": 6.5376,
|
|
"mean_token_accuracy": 0.13570395410060881,
|
|
"num_tokens": 3588671.0,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"entropy": 6.54329285621643,
|
|
"epoch": 0.17043447545037088,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004999239251166312,
|
|
"loss": 6.4281,
|
|
"mean_token_accuracy": 0.13799721151590347,
|
|
"num_tokens": 3597656.0,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"entropy": 6.480014228820801,
|
|
"epoch": 0.17087601554221124,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004999231040696914,
|
|
"loss": 6.4491,
|
|
"mean_token_accuracy": 0.13938046917319297,
|
|
"num_tokens": 3608017.0,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"entropy": 6.529258060455322,
|
|
"epoch": 0.17131755563405157,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999222786166361,
|
|
"loss": 6.5236,
|
|
"mean_token_accuracy": 0.1355483777821064,
|
|
"num_tokens": 3618189.0,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"entropy": 6.60916223526001,
|
|
"epoch": 0.1717590957258919,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999214487574812,
|
|
"loss": 6.4772,
|
|
"mean_token_accuracy": 0.13526797890663148,
|
|
"num_tokens": 3627211.0,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"entropy": 6.498088455200195,
|
|
"epoch": 0.17220063581773226,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999206144922431,
|
|
"loss": 6.4181,
|
|
"mean_token_accuracy": 0.13236208409070968,
|
|
"num_tokens": 3636781.0,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"entropy": 6.4924522876739506,
|
|
"epoch": 0.1726421759095726,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000499919775820938,
|
|
"loss": 6.4985,
|
|
"mean_token_accuracy": 0.13709900975227357,
|
|
"num_tokens": 3644891.0,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"entropy": 6.591422748565674,
|
|
"epoch": 0.17308371600141292,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999189327435825,
|
|
"loss": 6.5295,
|
|
"mean_token_accuracy": 0.13574066162109374,
|
|
"num_tokens": 3655477.0,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"entropy": 6.5104138374328615,
|
|
"epoch": 0.17352525609325326,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999180852601929,
|
|
"loss": 6.5386,
|
|
"mean_token_accuracy": 0.13653166219592094,
|
|
"num_tokens": 3664542.0,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"entropy": 6.577597141265869,
|
|
"epoch": 0.17396679618509361,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000499917233370786,
|
|
"loss": 6.4609,
|
|
"mean_token_accuracy": 0.1293606199324131,
|
|
"num_tokens": 3673806.0,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"entropy": 6.484398126602173,
|
|
"epoch": 0.17440833627693395,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999163770753784,
|
|
"loss": 6.4524,
|
|
"mean_token_accuracy": 0.13555625528097154,
|
|
"num_tokens": 3683238.0,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"entropy": 6.558987331390381,
|
|
"epoch": 0.17484987636877428,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004999155163739869,
|
|
"loss": 6.4372,
|
|
"mean_token_accuracy": 0.13345005139708518,
|
|
"num_tokens": 3692161.0,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"entropy": 6.510820627212524,
|
|
"epoch": 0.17529141646061464,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999146512666284,
|
|
"loss": 6.4535,
|
|
"mean_token_accuracy": 0.13496344164013863,
|
|
"num_tokens": 3701431.0,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"entropy": 6.530817365646362,
|
|
"epoch": 0.17573295655245497,
|
|
"grad_norm": 0.890625,
|
|
"learning_rate": 0.0004999137817533197,
|
|
"loss": 6.4293,
|
|
"mean_token_accuracy": 0.1400221474468708,
|
|
"num_tokens": 3710963.0,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"entropy": 6.48544750213623,
|
|
"epoch": 0.1761744966442953,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999129078340779,
|
|
"loss": 6.4221,
|
|
"mean_token_accuracy": 0.14005866199731826,
|
|
"num_tokens": 3720177.0,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"entropy": 6.465721511840821,
|
|
"epoch": 0.17661603673613563,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999120295089202,
|
|
"loss": 6.3795,
|
|
"mean_token_accuracy": 0.14514245688915253,
|
|
"num_tokens": 3728123.0,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"entropy": 6.571569442749023,
|
|
"epoch": 0.177057576827976,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999111467778639,
|
|
"loss": 6.4978,
|
|
"mean_token_accuracy": 0.1349009484052658,
|
|
"num_tokens": 3736869.0,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"entropy": 6.4819518566131595,
|
|
"epoch": 0.17749911691981632,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000499910259640926,
|
|
"loss": 6.3989,
|
|
"mean_token_accuracy": 0.1346891440451145,
|
|
"num_tokens": 3745830.0,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"entropy": 6.486936283111572,
|
|
"epoch": 0.17794065701165665,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.000499909368098124,
|
|
"loss": 6.4328,
|
|
"mean_token_accuracy": 0.13926308006048202,
|
|
"num_tokens": 3755019.0,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"entropy": 6.517996597290039,
|
|
"epoch": 0.178382197103497,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999084721494754,
|
|
"loss": 6.4076,
|
|
"mean_token_accuracy": 0.13306454047560692,
|
|
"num_tokens": 3764814.0,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"entropy": 6.465504217147827,
|
|
"epoch": 0.17882373719533734,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999075717949978,
|
|
"loss": 6.384,
|
|
"mean_token_accuracy": 0.14004691764712335,
|
|
"num_tokens": 3774258.0,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"entropy": 6.422070741653442,
|
|
"epoch": 0.17926527728717767,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004999066670347089,
|
|
"loss": 6.4155,
|
|
"mean_token_accuracy": 0.14247353076934816,
|
|
"num_tokens": 3783961.0,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"entropy": 6.416653776168824,
|
|
"epoch": 0.179706817379018,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004999057578686261,
|
|
"loss": 6.3804,
|
|
"mean_token_accuracy": 0.1390853337943554,
|
|
"num_tokens": 3792417.0,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"entropy": 6.496200704574585,
|
|
"epoch": 0.18014835747085836,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999048442967675,
|
|
"loss": 6.3592,
|
|
"mean_token_accuracy": 0.13822411596775055,
|
|
"num_tokens": 3801479.0,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"entropy": 6.456766891479492,
|
|
"epoch": 0.1805898975626987,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999039263191508,
|
|
"loss": 6.4305,
|
|
"mean_token_accuracy": 0.133939179033041,
|
|
"num_tokens": 3810799.0,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"entropy": 6.5264753818511965,
|
|
"epoch": 0.18103143765453902,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999030039357943,
|
|
"loss": 6.48,
|
|
"mean_token_accuracy": 0.1298608623445034,
|
|
"num_tokens": 3820966.0,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"entropy": 6.493787717819214,
|
|
"epoch": 0.18147297774637938,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999020771467158,
|
|
"loss": 6.4572,
|
|
"mean_token_accuracy": 0.1327923409640789,
|
|
"num_tokens": 3829247.0,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"entropy": 6.596197843551636,
|
|
"epoch": 0.1819145178382197,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999011459519335,
|
|
"loss": 6.3948,
|
|
"mean_token_accuracy": 0.13827238082885743,
|
|
"num_tokens": 3838114.0,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"entropy": 6.390679597854614,
|
|
"epoch": 0.18235605793006004,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999002103514655,
|
|
"loss": 6.4735,
|
|
"mean_token_accuracy": 0.13869670927524566,
|
|
"num_tokens": 3848075.0,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"entropy": 6.585737419128418,
|
|
"epoch": 0.18279759802190038,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998992703453304,
|
|
"loss": 6.5106,
|
|
"mean_token_accuracy": 0.13122646436095237,
|
|
"num_tokens": 3857934.0,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"entropy": 6.450649833679199,
|
|
"epoch": 0.18323913811374073,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004998983259335466,
|
|
"loss": 6.3519,
|
|
"mean_token_accuracy": 0.14236594662070273,
|
|
"num_tokens": 3866707.0,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"entropy": 6.58008508682251,
|
|
"epoch": 0.18368067820558107,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998973771161324,
|
|
"loss": 6.4244,
|
|
"mean_token_accuracy": 0.1374949462711811,
|
|
"num_tokens": 3875233.0,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"entropy": 6.389654541015625,
|
|
"epoch": 0.1841222182974214,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004998964238931065,
|
|
"loss": 6.4131,
|
|
"mean_token_accuracy": 0.1403422772884369,
|
|
"num_tokens": 3885173.0,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"entropy": 6.564711236953736,
|
|
"epoch": 0.18456375838926176,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998954662644876,
|
|
"loss": 6.3803,
|
|
"mean_token_accuracy": 0.13195990920066833,
|
|
"num_tokens": 3894198.0,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"entropy": 6.44653811454773,
|
|
"epoch": 0.1850052984811021,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998945042302943,
|
|
"loss": 6.382,
|
|
"mean_token_accuracy": 0.1373509407043457,
|
|
"num_tokens": 3904076.0,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"entropy": 6.4586262702941895,
|
|
"epoch": 0.18544683857294242,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998935377905457,
|
|
"loss": 6.4943,
|
|
"mean_token_accuracy": 0.13231708630919456,
|
|
"num_tokens": 3913204.0,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"entropy": 6.587808132171631,
|
|
"epoch": 0.18588837866478275,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004998925669452605,
|
|
"loss": 6.4565,
|
|
"mean_token_accuracy": 0.1339179016649723,
|
|
"num_tokens": 3922148.0,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"entropy": 6.401728963851928,
|
|
"epoch": 0.1863299187566231,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0004998915916944579,
|
|
"loss": 6.4234,
|
|
"mean_token_accuracy": 0.1407448723912239,
|
|
"num_tokens": 3931333.0,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"entropy": 6.545709466934204,
|
|
"epoch": 0.18677145884846344,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998906120381568,
|
|
"loss": 6.3789,
|
|
"mean_token_accuracy": 0.1448886923491955,
|
|
"num_tokens": 3941061.0,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"entropy": 6.505582046508789,
|
|
"epoch": 0.18721299894030377,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998896279763766,
|
|
"loss": 6.4761,
|
|
"mean_token_accuracy": 0.13257319629192352,
|
|
"num_tokens": 3950075.0,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"entropy": 6.455861520767212,
|
|
"epoch": 0.18765453903214413,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998886395091365,
|
|
"loss": 6.3345,
|
|
"mean_token_accuracy": 0.1409289576113224,
|
|
"num_tokens": 3958885.0,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"entropy": 6.417616128921509,
|
|
"epoch": 0.18809607912398446,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998876466364559,
|
|
"loss": 6.437,
|
|
"mean_token_accuracy": 0.13843559697270394,
|
|
"num_tokens": 3968218.0,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"entropy": 6.400978994369507,
|
|
"epoch": 0.1885376192158248,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998866493583541,
|
|
"loss": 6.364,
|
|
"mean_token_accuracy": 0.14499804973602295,
|
|
"num_tokens": 3977435.0,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"entropy": 6.388528203964233,
|
|
"epoch": 0.18897915930766512,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004998856476748509,
|
|
"loss": 6.349,
|
|
"mean_token_accuracy": 0.14326094537973405,
|
|
"num_tokens": 3986608.0,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"entropy": 6.440869951248169,
|
|
"epoch": 0.18942069939950548,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998846415859656,
|
|
"loss": 6.3602,
|
|
"mean_token_accuracy": 0.14130929261445999,
|
|
"num_tokens": 3996087.0,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"entropy": 6.476541662216187,
|
|
"epoch": 0.1898622394913458,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004998836310917182,
|
|
"loss": 6.4058,
|
|
"mean_token_accuracy": 0.13654726892709732,
|
|
"num_tokens": 4006257.0,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"entropy": 6.478487300872803,
|
|
"epoch": 0.19030377958318614,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004998826161921282,
|
|
"loss": 6.4072,
|
|
"mean_token_accuracy": 0.14593140706419944,
|
|
"num_tokens": 4015904.0,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"entropy": 6.427832221984863,
|
|
"epoch": 0.1907453196750265,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004998815968872157,
|
|
"loss": 6.4181,
|
|
"mean_token_accuracy": 0.13653010204434396,
|
|
"num_tokens": 4025417.0,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"entropy": 6.392728614807129,
|
|
"epoch": 0.19118685976686683,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004998805731770007,
|
|
"loss": 6.2628,
|
|
"mean_token_accuracy": 0.15659967064857483,
|
|
"num_tokens": 4035181.0,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"entropy": 6.4461814880371096,
|
|
"epoch": 0.19162839985870717,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.000499879545061503,
|
|
"loss": 6.4504,
|
|
"mean_token_accuracy": 0.13371687456965448,
|
|
"num_tokens": 4045112.0,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"entropy": 6.5883321285247805,
|
|
"epoch": 0.1920699399505475,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004998785125407432,
|
|
"loss": 6.5425,
|
|
"mean_token_accuracy": 0.12125966772437095,
|
|
"num_tokens": 4054566.0,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"entropy": 6.552111434936523,
|
|
"epoch": 0.19251148004238786,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000499877475614741,
|
|
"loss": 6.3758,
|
|
"mean_token_accuracy": 0.1340583384037018,
|
|
"num_tokens": 4063960.0,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"entropy": 6.3515486240386965,
|
|
"epoch": 0.1929530201342282,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004998764342835169,
|
|
"loss": 6.3157,
|
|
"mean_token_accuracy": 0.14617449343204497,
|
|
"num_tokens": 4072620.0,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"entropy": 6.406283712387085,
|
|
"epoch": 0.19339456022606852,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998753885470915,
|
|
"loss": 6.2789,
|
|
"mean_token_accuracy": 0.14062159806489943,
|
|
"num_tokens": 4081590.0,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"entropy": 6.425132322311401,
|
|
"epoch": 0.19383610031790888,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998743384054851,
|
|
"loss": 6.4202,
|
|
"mean_token_accuracy": 0.14044143706560136,
|
|
"num_tokens": 4090758.0,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"entropy": 6.3962568759918215,
|
|
"epoch": 0.1942776404097492,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004998732838587183,
|
|
"loss": 6.2458,
|
|
"mean_token_accuracy": 0.149021477997303,
|
|
"num_tokens": 4099281.0,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"entropy": 6.40768404006958,
|
|
"epoch": 0.19471918050158954,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998722249068118,
|
|
"loss": 6.3451,
|
|
"mean_token_accuracy": 0.1404854990541935,
|
|
"num_tokens": 4108953.0,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"entropy": 6.396564531326294,
|
|
"epoch": 0.19516072059342987,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004998711615497863,
|
|
"loss": 6.4535,
|
|
"mean_token_accuracy": 0.13740591406822206,
|
|
"num_tokens": 4118799.0,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"entropy": 6.579476261138916,
|
|
"epoch": 0.19560226068527023,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004998700937876626,
|
|
"loss": 6.4122,
|
|
"mean_token_accuracy": 0.13571444600820542,
|
|
"num_tokens": 4127862.0,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"entropy": 6.4028857231140135,
|
|
"epoch": 0.19604380077711056,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998690216204615,
|
|
"loss": 6.5068,
|
|
"mean_token_accuracy": 0.12704429849982263,
|
|
"num_tokens": 4139029.0,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"entropy": 6.5047478675842285,
|
|
"epoch": 0.1964853408689509,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998679450482043,
|
|
"loss": 6.3028,
|
|
"mean_token_accuracy": 0.14782762974500657,
|
|
"num_tokens": 4148257.0,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"entropy": 6.341253662109375,
|
|
"epoch": 0.19692688096079125,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.000499866864070912,
|
|
"loss": 6.2627,
|
|
"mean_token_accuracy": 0.14017492160201073,
|
|
"num_tokens": 4157626.0,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"entropy": 6.429310417175293,
|
|
"epoch": 0.19736842105263158,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004998657786886056,
|
|
"loss": 6.4713,
|
|
"mean_token_accuracy": 0.13993202298879623,
|
|
"num_tokens": 4166804.0,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"entropy": 6.554042720794678,
|
|
"epoch": 0.1978099611444719,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004998646889013066,
|
|
"loss": 6.3607,
|
|
"mean_token_accuracy": 0.14240839183330536,
|
|
"num_tokens": 4175701.0,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"entropy": 6.4098762512207035,
|
|
"epoch": 0.19825150123631224,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004998635947090362,
|
|
"loss": 6.4425,
|
|
"mean_token_accuracy": 0.14030273035168647,
|
|
"num_tokens": 4184711.0,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"entropy": 6.475190782546997,
|
|
"epoch": 0.1986930413281526,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004998624961118158,
|
|
"loss": 6.4017,
|
|
"mean_token_accuracy": 0.14099944159388542,
|
|
"num_tokens": 4193931.0,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"entropy": 6.44319109916687,
|
|
"epoch": 0.19913458141999293,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499861393109667,
|
|
"loss": 6.2899,
|
|
"mean_token_accuracy": 0.14828752726316452,
|
|
"num_tokens": 4203000.0,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"entropy": 6.274944400787353,
|
|
"epoch": 0.19957612151183327,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004998602857026114,
|
|
"loss": 6.2991,
|
|
"mean_token_accuracy": 0.1458234503865242,
|
|
"num_tokens": 4211977.0,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"entropy": 6.472422647476196,
|
|
"epoch": 0.20001766160367362,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998591738906708,
|
|
"loss": 6.3814,
|
|
"mean_token_accuracy": 0.15182094275951385,
|
|
"num_tokens": 4220375.0,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"entropy": 6.494894886016846,
|
|
"epoch": 0.20045920169551396,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998580576738668,
|
|
"loss": 6.3793,
|
|
"mean_token_accuracy": 0.1367909237742424,
|
|
"num_tokens": 4230506.0,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"entropy": 6.404332447052002,
|
|
"epoch": 0.2009007417873543,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998569370522213,
|
|
"loss": 6.3524,
|
|
"mean_token_accuracy": 0.13977290093898773,
|
|
"num_tokens": 4240235.0,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"entropy": 6.448247480392456,
|
|
"epoch": 0.20134228187919462,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004998558120257563,
|
|
"loss": 6.4189,
|
|
"mean_token_accuracy": 0.14339498728513717,
|
|
"num_tokens": 4249775.0,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"entropy": 6.419510173797607,
|
|
"epoch": 0.20178382197103498,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998546825944938,
|
|
"loss": 6.3504,
|
|
"mean_token_accuracy": 0.141066338121891,
|
|
"num_tokens": 4258523.0,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"entropy": 6.419132709503174,
|
|
"epoch": 0.2022253620628753,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000499853548758456,
|
|
"loss": 6.358,
|
|
"mean_token_accuracy": 0.13891511633992196,
|
|
"num_tokens": 4267683.0,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"entropy": 6.495290946960449,
|
|
"epoch": 0.20266690215471564,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.000499852410517665,
|
|
"loss": 6.417,
|
|
"mean_token_accuracy": 0.13713881745934486,
|
|
"num_tokens": 4276903.0,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"entropy": 6.426017999649048,
|
|
"epoch": 0.203108442246556,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004998512678721431,
|
|
"loss": 6.3661,
|
|
"mean_token_accuracy": 0.135909353941679,
|
|
"num_tokens": 4287257.0,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"entropy": 6.447468948364258,
|
|
"epoch": 0.20354998233839633,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000499850120821913,
|
|
"loss": 6.3875,
|
|
"mean_token_accuracy": 0.13645304143428802,
|
|
"num_tokens": 4297345.0,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"entropy": 6.528428220748902,
|
|
"epoch": 0.20399152243023666,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004998489693669967,
|
|
"loss": 6.3613,
|
|
"mean_token_accuracy": 0.13837311565876007,
|
|
"num_tokens": 4306533.0,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"entropy": 6.392482471466065,
|
|
"epoch": 0.204433062522077,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000499847813507417,
|
|
"loss": 6.4023,
|
|
"mean_token_accuracy": 0.14094773977994918,
|
|
"num_tokens": 4316338.0,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"entropy": 6.399602174758911,
|
|
"epoch": 0.20487460261391735,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004998466532431966,
|
|
"loss": 6.3549,
|
|
"mean_token_accuracy": 0.13948202207684518,
|
|
"num_tokens": 4326585.0,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"entropy": 6.544098567962647,
|
|
"epoch": 0.20531614270575768,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0004998454885743581,
|
|
"loss": 6.4795,
|
|
"mean_token_accuracy": 0.1351695440709591,
|
|
"num_tokens": 4336490.0,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"entropy": 6.363640975952149,
|
|
"epoch": 0.205757682797598,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998443195009242,
|
|
"loss": 6.3589,
|
|
"mean_token_accuracy": 0.1348782531917095,
|
|
"num_tokens": 4346264.0,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"entropy": 6.461185693740845,
|
|
"epoch": 0.20619922288943837,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004998431460229182,
|
|
"loss": 6.3318,
|
|
"mean_token_accuracy": 0.1448797807097435,
|
|
"num_tokens": 4355102.0,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"entropy": 6.482104396820068,
|
|
"epoch": 0.2066407629812787,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004998419681403627,
|
|
"loss": 6.5133,
|
|
"mean_token_accuracy": 0.13086750581860543,
|
|
"num_tokens": 4365569.0,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"entropy": 6.507464361190796,
|
|
"epoch": 0.20708230307311903,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0004998407858532809,
|
|
"loss": 6.36,
|
|
"mean_token_accuracy": 0.14296017587184906,
|
|
"num_tokens": 4375437.0,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"entropy": 6.37006025314331,
|
|
"epoch": 0.20752384316495937,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000499839599161696,
|
|
"loss": 6.3389,
|
|
"mean_token_accuracy": 0.1450774312019348,
|
|
"num_tokens": 4384420.0,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"entropy": 6.460700035095215,
|
|
"epoch": 0.20796538325679972,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998384080656314,
|
|
"loss": 6.3106,
|
|
"mean_token_accuracy": 0.14050144031643869,
|
|
"num_tokens": 4393730.0,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"entropy": 6.332716035842895,
|
|
"epoch": 0.20840692334864006,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00049983721256511,
|
|
"loss": 6.2798,
|
|
"mean_token_accuracy": 0.14351205080747603,
|
|
"num_tokens": 4402731.0,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"entropy": 6.3846518993377686,
|
|
"epoch": 0.2088484634404804,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998360126601556,
|
|
"loss": 6.3369,
|
|
"mean_token_accuracy": 0.14289727210998535,
|
|
"num_tokens": 4411606.0,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"entropy": 6.3337644100189205,
|
|
"epoch": 0.20929000353232075,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998348083507916,
|
|
"loss": 6.4062,
|
|
"mean_token_accuracy": 0.14081210866570473,
|
|
"num_tokens": 4421685.0,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"entropy": 6.558507633209229,
|
|
"epoch": 0.20973154362416108,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998335996370416,
|
|
"loss": 6.3782,
|
|
"mean_token_accuracy": 0.14399294778704644,
|
|
"num_tokens": 4431765.0,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"entropy": 6.321137380599976,
|
|
"epoch": 0.2101730837160014,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998323865189291,
|
|
"loss": 6.3523,
|
|
"mean_token_accuracy": 0.13661579713225364,
|
|
"num_tokens": 4441191.0,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"entropy": 6.481552457809448,
|
|
"epoch": 0.21061462380784174,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998311689964781,
|
|
"loss": 6.4322,
|
|
"mean_token_accuracy": 0.13680859059095382,
|
|
"num_tokens": 4450156.0,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"entropy": 6.4807047843933105,
|
|
"epoch": 0.2110561638996821,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998299470697125,
|
|
"loss": 6.4163,
|
|
"mean_token_accuracy": 0.14260546639561653,
|
|
"num_tokens": 4459466.0,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"entropy": 6.461113500595093,
|
|
"epoch": 0.21149770399152243,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004998287207386559,
|
|
"loss": 6.4156,
|
|
"mean_token_accuracy": 0.14247968047857285,
|
|
"num_tokens": 4468539.0,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"entropy": 6.525749444961548,
|
|
"epoch": 0.21193924408336276,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998274900033326,
|
|
"loss": 6.315,
|
|
"mean_token_accuracy": 0.14835015684366226,
|
|
"num_tokens": 4477579.0,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"entropy": 6.236664247512818,
|
|
"epoch": 0.21238078417520312,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004998262548637667,
|
|
"loss": 6.2842,
|
|
"mean_token_accuracy": 0.14991160482168198,
|
|
"num_tokens": 4486800.0,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"entropy": 6.444223546981812,
|
|
"epoch": 0.21282232426704345,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998250153199822,
|
|
"loss": 6.2465,
|
|
"mean_token_accuracy": 0.14562757611274718,
|
|
"num_tokens": 4495985.0,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"entropy": 6.417825555801391,
|
|
"epoch": 0.21326386435888378,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004998237713720036,
|
|
"loss": 6.4031,
|
|
"mean_token_accuracy": 0.14113787487149237,
|
|
"num_tokens": 4504944.0,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"entropy": 6.344033908843994,
|
|
"epoch": 0.2137054044507241,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004998225230198552,
|
|
"loss": 6.2875,
|
|
"mean_token_accuracy": 0.14928205609321593,
|
|
"num_tokens": 4515402.0,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"entropy": 6.412223052978516,
|
|
"epoch": 0.21414694454256447,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004998212702635614,
|
|
"loss": 6.355,
|
|
"mean_token_accuracy": 0.14224686175584794,
|
|
"num_tokens": 4525009.0,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"entropy": 6.537710332870484,
|
|
"epoch": 0.2145884846344048,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004998200131031469,
|
|
"loss": 6.4066,
|
|
"mean_token_accuracy": 0.142412006855011,
|
|
"num_tokens": 4534460.0,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"entropy": 6.355252885818482,
|
|
"epoch": 0.21503002472624513,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004998187515386361,
|
|
"loss": 6.2239,
|
|
"mean_token_accuracy": 0.15292632952332497,
|
|
"num_tokens": 4543748.0,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"entropy": 6.32159743309021,
|
|
"epoch": 0.2154715648180855,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998174855700538,
|
|
"loss": 6.3515,
|
|
"mean_token_accuracy": 0.14536840543150903,
|
|
"num_tokens": 4552722.0,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"entropy": 6.455835342407227,
|
|
"epoch": 0.21591310490992582,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004998162151974248,
|
|
"loss": 6.2244,
|
|
"mean_token_accuracy": 0.1433302193880081,
|
|
"num_tokens": 4561607.0,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"entropy": 6.376346635818481,
|
|
"epoch": 0.21635464500176615,
|
|
"grad_norm": 0.89453125,
|
|
"learning_rate": 0.000499814940420774,
|
|
"loss": 6.4926,
|
|
"mean_token_accuracy": 0.13043315410614015,
|
|
"num_tokens": 4572524.0,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"entropy": 6.473890399932861,
|
|
"epoch": 0.21679618509360649,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004998136612401266,
|
|
"loss": 6.306,
|
|
"mean_token_accuracy": 0.13808697760105132,
|
|
"num_tokens": 4581601.0,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"entropy": 6.362298917770386,
|
|
"epoch": 0.21723772518544684,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998123776555071,
|
|
"loss": 6.355,
|
|
"mean_token_accuracy": 0.13434374257922171,
|
|
"num_tokens": 4591795.0,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"entropy": 6.4316198348999025,
|
|
"epoch": 0.21767926527728718,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998110896669412,
|
|
"loss": 6.3127,
|
|
"mean_token_accuracy": 0.14232389852404595,
|
|
"num_tokens": 4600745.0,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"entropy": 6.403714847564697,
|
|
"epoch": 0.2181208053691275,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004998097972744539,
|
|
"loss": 6.3668,
|
|
"mean_token_accuracy": 0.13958390951156616,
|
|
"num_tokens": 4610490.0,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"entropy": 6.421542739868164,
|
|
"epoch": 0.21856234546096787,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998085004780705,
|
|
"loss": 6.3027,
|
|
"mean_token_accuracy": 0.14644000679254532,
|
|
"num_tokens": 4619511.0,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"entropy": 6.411759996414185,
|
|
"epoch": 0.2190038855528082,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998071992778164,
|
|
"loss": 6.3536,
|
|
"mean_token_accuracy": 0.13926490917801856,
|
|
"num_tokens": 4628186.0,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"entropy": 6.416973400115967,
|
|
"epoch": 0.21944542564464853,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499805893673717,
|
|
"loss": 6.3574,
|
|
"mean_token_accuracy": 0.14341954439878463,
|
|
"num_tokens": 4637431.0,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"entropy": 6.347478723526001,
|
|
"epoch": 0.21988696573648886,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004998045836657982,
|
|
"loss": 6.2093,
|
|
"mean_token_accuracy": 0.14213306605815887,
|
|
"num_tokens": 4646627.0,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"entropy": 6.368200349807739,
|
|
"epoch": 0.22032850582832922,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004998032692540853,
|
|
"loss": 6.3012,
|
|
"mean_token_accuracy": 0.14045739471912383,
|
|
"num_tokens": 4656095.0,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"entropy": 6.476345205307007,
|
|
"epoch": 0.22077004592016955,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998019504386044,
|
|
"loss": 6.3876,
|
|
"mean_token_accuracy": 0.13640450164675713,
|
|
"num_tokens": 4665807.0,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"entropy": 6.444914436340332,
|
|
"epoch": 0.22121158601200988,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004998006272193809,
|
|
"loss": 6.3493,
|
|
"mean_token_accuracy": 0.13855071663856505,
|
|
"num_tokens": 4674459.0,
|
|
"step": 2505
|
|
},
|
|
{
|
|
"entropy": 6.361084604263306,
|
|
"epoch": 0.22165312610385024,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0004997992995964412,
|
|
"loss": 6.4325,
|
|
"mean_token_accuracy": 0.13779560700058938,
|
|
"num_tokens": 4684063.0,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"entropy": 6.45024299621582,
|
|
"epoch": 0.22209466619569057,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997979675698109,
|
|
"loss": 6.3029,
|
|
"mean_token_accuracy": 0.14212062656879426,
|
|
"num_tokens": 4692807.0,
|
|
"step": 2515
|
|
},
|
|
{
|
|
"entropy": 6.438331031799317,
|
|
"epoch": 0.2225362062875309,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004997966311395164,
|
|
"loss": 6.2422,
|
|
"mean_token_accuracy": 0.14746622294187545,
|
|
"num_tokens": 4701100.0,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"entropy": 6.284497547149658,
|
|
"epoch": 0.22297774637937123,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004997952903055836,
|
|
"loss": 6.3071,
|
|
"mean_token_accuracy": 0.13971827551722527,
|
|
"num_tokens": 4710697.0,
|
|
"step": 2525
|
|
},
|
|
{
|
|
"entropy": 6.4242840766906735,
|
|
"epoch": 0.2234192864712116,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000499793945068039,
|
|
"loss": 6.2875,
|
|
"mean_token_accuracy": 0.14202770590782166,
|
|
"num_tokens": 4718745.0,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"entropy": 6.358328342437744,
|
|
"epoch": 0.22386082656305192,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004997925954269088,
|
|
"loss": 6.2493,
|
|
"mean_token_accuracy": 0.15010830983519555,
|
|
"num_tokens": 4728056.0,
|
|
"step": 2535
|
|
},
|
|
{
|
|
"entropy": 6.390234851837159,
|
|
"epoch": 0.22430236665489225,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004997912413822196,
|
|
"loss": 6.3892,
|
|
"mean_token_accuracy": 0.14226726815104485,
|
|
"num_tokens": 4737605.0,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"entropy": 6.333422613143921,
|
|
"epoch": 0.2247439067467326,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997898829339979,
|
|
"loss": 6.216,
|
|
"mean_token_accuracy": 0.15019772350788116,
|
|
"num_tokens": 4746168.0,
|
|
"step": 2545
|
|
},
|
|
{
|
|
"entropy": 6.414393615722656,
|
|
"epoch": 0.22518544683857294,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00049978852008227,
|
|
"loss": 6.2827,
|
|
"mean_token_accuracy": 0.14390757903456688,
|
|
"num_tokens": 4755072.0,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"entropy": 6.292112064361572,
|
|
"epoch": 0.22562698693041328,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.000499787152827063,
|
|
"loss": 6.2887,
|
|
"mean_token_accuracy": 0.14927180036902427,
|
|
"num_tokens": 4764580.0,
|
|
"step": 2555
|
|
},
|
|
{
|
|
"entropy": 6.362084197998047,
|
|
"epoch": 0.2260685270222536,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004997857811684035,
|
|
"loss": 6.3109,
|
|
"mean_token_accuracy": 0.14619807451963424,
|
|
"num_tokens": 4774135.0,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"entropy": 6.443705081939697,
|
|
"epoch": 0.22651006711409397,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004997844051063183,
|
|
"loss": 6.3931,
|
|
"mean_token_accuracy": 0.1407366193830967,
|
|
"num_tokens": 4784733.0,
|
|
"step": 2565
|
|
},
|
|
{
|
|
"entropy": 6.393408250808716,
|
|
"epoch": 0.2269516072059343,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.0004997830246408346,
|
|
"loss": 6.3304,
|
|
"mean_token_accuracy": 0.14791636019945145,
|
|
"num_tokens": 4795327.0,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"entropy": 6.342170095443725,
|
|
"epoch": 0.22739314729777463,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004997816397719791,
|
|
"loss": 6.3028,
|
|
"mean_token_accuracy": 0.14114121049642564,
|
|
"num_tokens": 4804314.0,
|
|
"step": 2575
|
|
},
|
|
{
|
|
"entropy": 6.458865118026734,
|
|
"epoch": 0.227834687389615,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997802504997792,
|
|
"loss": 6.3913,
|
|
"mean_token_accuracy": 0.13652418628335,
|
|
"num_tokens": 4813637.0,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"entropy": 6.382553291320801,
|
|
"epoch": 0.22827622748145532,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004997788568242621,
|
|
"loss": 6.2591,
|
|
"mean_token_accuracy": 0.14338775500655174,
|
|
"num_tokens": 4823094.0,
|
|
"step": 2585
|
|
},
|
|
{
|
|
"entropy": 6.327674865722656,
|
|
"epoch": 0.22871776757329565,
|
|
"grad_norm": 0.8984375,
|
|
"learning_rate": 0.000499777458745455,
|
|
"loss": 6.1969,
|
|
"mean_token_accuracy": 0.1477431207895279,
|
|
"num_tokens": 4833199.0,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"entropy": 6.392719554901123,
|
|
"epoch": 0.22915930766513598,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004997760562633853,
|
|
"loss": 6.2909,
|
|
"mean_token_accuracy": 0.14219107255339622,
|
|
"num_tokens": 4842970.0,
|
|
"step": 2595
|
|
},
|
|
{
|
|
"entropy": 6.394578695297241,
|
|
"epoch": 0.22960084775697634,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004997746493780804,
|
|
"loss": 6.3788,
|
|
"mean_token_accuracy": 0.13738251477479935,
|
|
"num_tokens": 4852043.0,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"entropy": 6.376160097122193,
|
|
"epoch": 0.23004238784881667,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499773238089568,
|
|
"loss": 6.2693,
|
|
"mean_token_accuracy": 0.13766007199883462,
|
|
"num_tokens": 4862232.0,
|
|
"step": 2605
|
|
},
|
|
{
|
|
"entropy": 6.345012950897217,
|
|
"epoch": 0.230483927940657,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004997718223978758,
|
|
"loss": 6.2081,
|
|
"mean_token_accuracy": 0.1466532751917839,
|
|
"num_tokens": 4871186.0,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"entropy": 6.343753385543823,
|
|
"epoch": 0.23092546803249736,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997704023030315,
|
|
"loss": 6.3059,
|
|
"mean_token_accuracy": 0.150559451431036,
|
|
"num_tokens": 4879974.0,
|
|
"step": 2615
|
|
},
|
|
{
|
|
"entropy": 6.393126726150513,
|
|
"epoch": 0.2313670081243377,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997689778050627,
|
|
"loss": 6.3617,
|
|
"mean_token_accuracy": 0.14292784333229064,
|
|
"num_tokens": 4890300.0,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"entropy": 6.369229030609131,
|
|
"epoch": 0.23180854821617802,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004997675489039975,
|
|
"loss": 6.3301,
|
|
"mean_token_accuracy": 0.1426799289882183,
|
|
"num_tokens": 4900428.0,
|
|
"step": 2625
|
|
},
|
|
{
|
|
"entropy": 6.3974145412445065,
|
|
"epoch": 0.23225008830801835,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004997661155998638,
|
|
"loss": 6.3245,
|
|
"mean_token_accuracy": 0.1442883849143982,
|
|
"num_tokens": 4910092.0,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"entropy": 6.384046411514282,
|
|
"epoch": 0.2326916283998587,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004997646778926898,
|
|
"loss": 6.3247,
|
|
"mean_token_accuracy": 0.13738622814416884,
|
|
"num_tokens": 4919593.0,
|
|
"step": 2635
|
|
},
|
|
{
|
|
"entropy": 6.373585891723633,
|
|
"epoch": 0.23313316849169904,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004997632357825035,
|
|
"loss": 6.3001,
|
|
"mean_token_accuracy": 0.14202155098319053,
|
|
"num_tokens": 4929098.0,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"entropy": 6.43012547492981,
|
|
"epoch": 0.23357470858353938,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004997617892693333,
|
|
"loss": 6.3657,
|
|
"mean_token_accuracy": 0.1421157017350197,
|
|
"num_tokens": 4938265.0,
|
|
"step": 2645
|
|
},
|
|
{
|
|
"entropy": 6.394748878479004,
|
|
"epoch": 0.23401624867537973,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004997603383532075,
|
|
"loss": 6.3016,
|
|
"mean_token_accuracy": 0.14679210409522056,
|
|
"num_tokens": 4946694.0,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"entropy": 6.368328046798706,
|
|
"epoch": 0.23445778876722007,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004997588830341545,
|
|
"loss": 6.3132,
|
|
"mean_token_accuracy": 0.1434150867164135,
|
|
"num_tokens": 4955296.0,
|
|
"step": 2655
|
|
},
|
|
{
|
|
"entropy": 6.32787938117981,
|
|
"epoch": 0.2348993288590604,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004997574233122028,
|
|
"loss": 6.2759,
|
|
"mean_token_accuracy": 0.14597226828336715,
|
|
"num_tokens": 4964409.0,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"entropy": 6.3884584426879885,
|
|
"epoch": 0.23534086895090076,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004997559591873809,
|
|
"loss": 6.284,
|
|
"mean_token_accuracy": 0.1481903851032257,
|
|
"num_tokens": 4973449.0,
|
|
"step": 2665
|
|
},
|
|
{
|
|
"entropy": 6.345009279251099,
|
|
"epoch": 0.2357824090427411,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004997544906597178,
|
|
"loss": 6.2779,
|
|
"mean_token_accuracy": 0.1470661997795105,
|
|
"num_tokens": 4983057.0,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"entropy": 6.2613893985748295,
|
|
"epoch": 0.23622394913458142,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004997530177292418,
|
|
"loss": 6.3532,
|
|
"mean_token_accuracy": 0.13861697241663934,
|
|
"num_tokens": 4991950.0,
|
|
"step": 2675
|
|
},
|
|
{
|
|
"entropy": 6.461032247543335,
|
|
"epoch": 0.23666548922642175,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004997515403959823,
|
|
"loss": 6.2857,
|
|
"mean_token_accuracy": 0.14603266417980193,
|
|
"num_tokens": 5001042.0,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"entropy": 6.356680679321289,
|
|
"epoch": 0.2371070293182621,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004997500586599677,
|
|
"loss": 6.2198,
|
|
"mean_token_accuracy": 0.15022996366024016,
|
|
"num_tokens": 5009827.0,
|
|
"step": 2685
|
|
},
|
|
{
|
|
"entropy": 6.292784547805786,
|
|
"epoch": 0.23754856941010244,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004997485725212274,
|
|
"loss": 6.2662,
|
|
"mean_token_accuracy": 0.1465997129678726,
|
|
"num_tokens": 5018708.0,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"entropy": 6.334398937225342,
|
|
"epoch": 0.23799010950194277,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004997470819797903,
|
|
"loss": 6.1678,
|
|
"mean_token_accuracy": 0.149826068431139,
|
|
"num_tokens": 5027522.0,
|
|
"step": 2695
|
|
},
|
|
{
|
|
"entropy": 6.312096786499024,
|
|
"epoch": 0.23843164959378313,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997455870356857,
|
|
"loss": 6.2858,
|
|
"mean_token_accuracy": 0.14754335582256317,
|
|
"num_tokens": 5035755.0,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"entropy": 6.38718318939209,
|
|
"epoch": 0.23887318968562346,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004997440876889429,
|
|
"loss": 6.2373,
|
|
"mean_token_accuracy": 0.14902258217334746,
|
|
"num_tokens": 5045289.0,
|
|
"step": 2705
|
|
},
|
|
{
|
|
"entropy": 6.2602025985717775,
|
|
"epoch": 0.2393147297774638,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004997425839395913,
|
|
"loss": 6.2623,
|
|
"mean_token_accuracy": 0.14851141721010208,
|
|
"num_tokens": 5053774.0,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"entropy": 6.4446056365966795,
|
|
"epoch": 0.23975626986930412,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004997410757876602,
|
|
"loss": 6.3368,
|
|
"mean_token_accuracy": 0.13768139705061913,
|
|
"num_tokens": 5062911.0,
|
|
"step": 2715
|
|
},
|
|
{
|
|
"entropy": 6.34313178062439,
|
|
"epoch": 0.24019780996114448,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004997395632331793,
|
|
"loss": 6.1974,
|
|
"mean_token_accuracy": 0.15056394785642624,
|
|
"num_tokens": 5072107.0,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"entropy": 6.232982730865478,
|
|
"epoch": 0.2406393500529848,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004997380462761781,
|
|
"loss": 6.1744,
|
|
"mean_token_accuracy": 0.15013156086206436,
|
|
"num_tokens": 5080588.0,
|
|
"step": 2725
|
|
},
|
|
{
|
|
"entropy": 6.360136985778809,
|
|
"epoch": 0.24108089014482514,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004997365249166864,
|
|
"loss": 6.3571,
|
|
"mean_token_accuracy": 0.1455472856760025,
|
|
"num_tokens": 5090262.0,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"entropy": 6.398047304153442,
|
|
"epoch": 0.2415224302366655,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997349991547342,
|
|
"loss": 6.2776,
|
|
"mean_token_accuracy": 0.15021264627575875,
|
|
"num_tokens": 5099285.0,
|
|
"step": 2735
|
|
},
|
|
{
|
|
"entropy": 6.356108903884888,
|
|
"epoch": 0.24196397032850583,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004997334689903509,
|
|
"loss": 6.3226,
|
|
"mean_token_accuracy": 0.14855852872133254,
|
|
"num_tokens": 5109115.0,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"entropy": 6.388672256469727,
|
|
"epoch": 0.24240551042034617,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004997319344235668,
|
|
"loss": 6.3429,
|
|
"mean_token_accuracy": 0.14180680066347123,
|
|
"num_tokens": 5117977.0,
|
|
"step": 2745
|
|
},
|
|
{
|
|
"entropy": 6.393019914627075,
|
|
"epoch": 0.2428470505121865,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499730395454412,
|
|
"loss": 6.3025,
|
|
"mean_token_accuracy": 0.1451731264591217,
|
|
"num_tokens": 5127457.0,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"entropy": 6.300089502334595,
|
|
"epoch": 0.24328859060402686,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004997288520829166,
|
|
"loss": 6.3466,
|
|
"mean_token_accuracy": 0.14065672382712363,
|
|
"num_tokens": 5137310.0,
|
|
"step": 2755
|
|
},
|
|
{
|
|
"entropy": 6.394161605834961,
|
|
"epoch": 0.2437301306958672,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004997273043091107,
|
|
"loss": 6.2725,
|
|
"mean_token_accuracy": 0.14155926927924156,
|
|
"num_tokens": 5146963.0,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"entropy": 6.31079797744751,
|
|
"epoch": 0.24417167078770752,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004997257521330248,
|
|
"loss": 6.2601,
|
|
"mean_token_accuracy": 0.14022860154509545,
|
|
"num_tokens": 5155521.0,
|
|
"step": 2765
|
|
},
|
|
{
|
|
"entropy": 6.40408935546875,
|
|
"epoch": 0.24461321087954788,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004997241955546892,
|
|
"loss": 6.237,
|
|
"mean_token_accuracy": 0.14580907300114632,
|
|
"num_tokens": 5165182.0,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"entropy": 6.289978647232056,
|
|
"epoch": 0.2450547509713882,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004997226345741343,
|
|
"loss": 6.2649,
|
|
"mean_token_accuracy": 0.13975519686937332,
|
|
"num_tokens": 5175511.0,
|
|
"step": 2775
|
|
},
|
|
{
|
|
"entropy": 6.354159593582153,
|
|
"epoch": 0.24549629106322854,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000499721069191391,
|
|
"loss": 6.2126,
|
|
"mean_token_accuracy": 0.14699607565999032,
|
|
"num_tokens": 5184772.0,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"entropy": 6.288126516342163,
|
|
"epoch": 0.24593783115506887,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004997194994064896,
|
|
"loss": 6.3014,
|
|
"mean_token_accuracy": 0.14003223031759263,
|
|
"num_tokens": 5195136.0,
|
|
"step": 2785
|
|
},
|
|
{
|
|
"entropy": 6.391946315765381,
|
|
"epoch": 0.24637937124690923,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000499717925219461,
|
|
"loss": 6.2279,
|
|
"mean_token_accuracy": 0.15093553364276885,
|
|
"num_tokens": 5203163.0,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"entropy": 6.346267318725586,
|
|
"epoch": 0.24682091133874956,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004997163466303362,
|
|
"loss": 6.3224,
|
|
"mean_token_accuracy": 0.14383373707532882,
|
|
"num_tokens": 5213233.0,
|
|
"step": 2795
|
|
},
|
|
{
|
|
"entropy": 6.329722881317139,
|
|
"epoch": 0.2472624514305899,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.000499714763639146,
|
|
"loss": 6.2346,
|
|
"mean_token_accuracy": 0.14386766105890275,
|
|
"num_tokens": 5222940.0,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"entropy": 6.36411566734314,
|
|
"epoch": 0.24770399152243025,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004997131762459211,
|
|
"loss": 6.2596,
|
|
"mean_token_accuracy": 0.14432956129312516,
|
|
"num_tokens": 5232263.0,
|
|
"step": 2805
|
|
},
|
|
{
|
|
"entropy": 6.369926023483276,
|
|
"epoch": 0.24814553161427058,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004997115844506932,
|
|
"loss": 6.2334,
|
|
"mean_token_accuracy": 0.14295720756053926,
|
|
"num_tokens": 5241536.0,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"entropy": 6.348195028305054,
|
|
"epoch": 0.2485870717061109,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004997099882534929,
|
|
"loss": 6.2732,
|
|
"mean_token_accuracy": 0.14211497604846954,
|
|
"num_tokens": 5250702.0,
|
|
"step": 2815
|
|
},
|
|
{
|
|
"entropy": 6.359058141708374,
|
|
"epoch": 0.24902861179795124,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004997083876543519,
|
|
"loss": 6.2763,
|
|
"mean_token_accuracy": 0.14498503208160402,
|
|
"num_tokens": 5259811.0,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"entropy": 6.397128582000732,
|
|
"epoch": 0.2494701518897916,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004997067826533014,
|
|
"loss": 6.3615,
|
|
"mean_token_accuracy": 0.13723283037543296,
|
|
"num_tokens": 5270518.0,
|
|
"step": 2825
|
|
},
|
|
{
|
|
"entropy": 6.375804328918457,
|
|
"epoch": 0.24991169198163193,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004997051732503726,
|
|
"loss": 6.2458,
|
|
"mean_token_accuracy": 0.14747673273086548,
|
|
"num_tokens": 5279538.0,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"entropy": 6.313772678375244,
|
|
"epoch": 0.25035323207347226,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004997035594455975,
|
|
"loss": 6.2702,
|
|
"mean_token_accuracy": 0.13872402533888817,
|
|
"num_tokens": 5289633.0,
|
|
"step": 2835
|
|
},
|
|
{
|
|
"entropy": 6.370833015441894,
|
|
"epoch": 0.2507947721653126,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004997019412390074,
|
|
"loss": 6.3603,
|
|
"mean_token_accuracy": 0.1444901555776596,
|
|
"num_tokens": 5299148.0,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"entropy": 6.36065092086792,
|
|
"epoch": 0.2512363122571529,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499700318630634,
|
|
"loss": 6.2527,
|
|
"mean_token_accuracy": 0.1457889422774315,
|
|
"num_tokens": 5309090.0,
|
|
"step": 2845
|
|
},
|
|
{
|
|
"entropy": 6.384716939926148,
|
|
"epoch": 0.2516778523489933,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004996986916205092,
|
|
"loss": 6.3297,
|
|
"mean_token_accuracy": 0.14129810705780982,
|
|
"num_tokens": 5318798.0,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"entropy": 6.313976621627807,
|
|
"epoch": 0.25211939244083365,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004996970602086648,
|
|
"loss": 6.1848,
|
|
"mean_token_accuracy": 0.15023760497570038,
|
|
"num_tokens": 5327915.0,
|
|
"step": 2855
|
|
},
|
|
{
|
|
"entropy": 6.251888847351074,
|
|
"epoch": 0.252560932532674,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004996954243951327,
|
|
"loss": 6.2192,
|
|
"mean_token_accuracy": 0.15385923832654952,
|
|
"num_tokens": 5336970.0,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"entropy": 6.299870014190674,
|
|
"epoch": 0.2530024726245143,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004996937841799451,
|
|
"loss": 6.1821,
|
|
"mean_token_accuracy": 0.15226729065179825,
|
|
"num_tokens": 5345167.0,
|
|
"step": 2865
|
|
},
|
|
{
|
|
"entropy": 6.1627833366394045,
|
|
"epoch": 0.25344401271635464,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004996921395631342,
|
|
"loss": 6.1711,
|
|
"mean_token_accuracy": 0.14804726019501685,
|
|
"num_tokens": 5353399.0,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"entropy": 6.380198335647583,
|
|
"epoch": 0.25388555280819497,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000499690490544732,
|
|
"loss": 6.285,
|
|
"mean_token_accuracy": 0.146748573333025,
|
|
"num_tokens": 5363224.0,
|
|
"step": 2875
|
|
},
|
|
{
|
|
"entropy": 6.372404766082764,
|
|
"epoch": 0.2543270929000353,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004996888371247707,
|
|
"loss": 6.2862,
|
|
"mean_token_accuracy": 0.14108646661043167,
|
|
"num_tokens": 5372274.0,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"entropy": 6.302175998687744,
|
|
"epoch": 0.2547686329918757,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499687179303283,
|
|
"loss": 6.2746,
|
|
"mean_token_accuracy": 0.15104661732912064,
|
|
"num_tokens": 5380240.0,
|
|
"step": 2885
|
|
},
|
|
{
|
|
"entropy": 6.269204902648926,
|
|
"epoch": 0.255210173083716,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004996855170803012,
|
|
"loss": 6.138,
|
|
"mean_token_accuracy": 0.15041064321994782,
|
|
"num_tokens": 5389390.0,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"entropy": 6.343308639526367,
|
|
"epoch": 0.25565171317555635,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004996838504558581,
|
|
"loss": 6.2986,
|
|
"mean_token_accuracy": 0.14492825120687486,
|
|
"num_tokens": 5399425.0,
|
|
"step": 2895
|
|
},
|
|
{
|
|
"entropy": 6.379653215408325,
|
|
"epoch": 0.2560932532673967,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000499682179429986,
|
|
"loss": 6.3089,
|
|
"mean_token_accuracy": 0.14139388352632523,
|
|
"num_tokens": 5408717.0,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"entropy": 6.292103576660156,
|
|
"epoch": 0.256534793359237,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004996805040027178,
|
|
"loss": 6.2399,
|
|
"mean_token_accuracy": 0.1403766691684723,
|
|
"num_tokens": 5418475.0,
|
|
"step": 2905
|
|
},
|
|
{
|
|
"entropy": 6.395513296127319,
|
|
"epoch": 0.25697633345107734,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004996788241740863,
|
|
"loss": 6.2884,
|
|
"mean_token_accuracy": 0.143946073949337,
|
|
"num_tokens": 5428403.0,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"entropy": 6.366812467575073,
|
|
"epoch": 0.2574178735429177,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004996771399441243,
|
|
"loss": 6.3188,
|
|
"mean_token_accuracy": 0.14125285297632217,
|
|
"num_tokens": 5437347.0,
|
|
"step": 2915
|
|
},
|
|
{
|
|
"entropy": 6.395107555389404,
|
|
"epoch": 0.25785941363475806,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004996754513128652,
|
|
"loss": 6.2216,
|
|
"mean_token_accuracy": 0.1553879424929619,
|
|
"num_tokens": 5446804.0,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"entropy": 6.245992279052734,
|
|
"epoch": 0.2583009537265984,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004996737582803416,
|
|
"loss": 6.1701,
|
|
"mean_token_accuracy": 0.14774591475725174,
|
|
"num_tokens": 5455888.0,
|
|
"step": 2925
|
|
},
|
|
{
|
|
"entropy": 6.349690961837768,
|
|
"epoch": 0.2587424938184387,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004996720608465868,
|
|
"loss": 6.1785,
|
|
"mean_token_accuracy": 0.14454589933156967,
|
|
"num_tokens": 5463977.0,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"entropy": 6.251695680618286,
|
|
"epoch": 0.25918403391027905,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004996703590116342,
|
|
"loss": 6.2901,
|
|
"mean_token_accuracy": 0.1413638859987259,
|
|
"num_tokens": 5473780.0,
|
|
"step": 2935
|
|
},
|
|
{
|
|
"entropy": 6.343139219284057,
|
|
"epoch": 0.2596255740021194,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004996686527755171,
|
|
"loss": 6.1747,
|
|
"mean_token_accuracy": 0.15054369121789932,
|
|
"num_tokens": 5482151.0,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"entropy": 6.287330961227417,
|
|
"epoch": 0.2600671140939597,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004996669421382687,
|
|
"loss": 6.181,
|
|
"mean_token_accuracy": 0.15408090725541115,
|
|
"num_tokens": 5491103.0,
|
|
"step": 2945
|
|
},
|
|
{
|
|
"entropy": 6.23843822479248,
|
|
"epoch": 0.26050865418580005,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004996652270999228,
|
|
"loss": 6.2051,
|
|
"mean_token_accuracy": 0.1455566719174385,
|
|
"num_tokens": 5500367.0,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"entropy": 6.401996898651123,
|
|
"epoch": 0.26095019427764043,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004996635076605128,
|
|
"loss": 6.2392,
|
|
"mean_token_accuracy": 0.1509515941143036,
|
|
"num_tokens": 5509631.0,
|
|
"step": 2955
|
|
},
|
|
{
|
|
"entropy": 6.3384003162384035,
|
|
"epoch": 0.26139173436948077,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004996617838200725,
|
|
"loss": 6.2572,
|
|
"mean_token_accuracy": 0.14331620335578918,
|
|
"num_tokens": 5518635.0,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"entropy": 6.241027069091797,
|
|
"epoch": 0.2618332744613211,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004996600555786357,
|
|
"loss": 6.2142,
|
|
"mean_token_accuracy": 0.1464727446436882,
|
|
"num_tokens": 5527696.0,
|
|
"step": 2965
|
|
},
|
|
{
|
|
"entropy": 6.348132085800171,
|
|
"epoch": 0.26227481455316143,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004996583229362362,
|
|
"loss": 6.1834,
|
|
"mean_token_accuracy": 0.14780823439359664,
|
|
"num_tokens": 5536632.0,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"entropy": 6.378821849822998,
|
|
"epoch": 0.26271635464500176,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004996565858929078,
|
|
"loss": 6.2627,
|
|
"mean_token_accuracy": 0.14528179541230202,
|
|
"num_tokens": 5545825.0,
|
|
"step": 2975
|
|
},
|
|
{
|
|
"entropy": 6.259585618972778,
|
|
"epoch": 0.2631578947368421,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004996548444486847,
|
|
"loss": 6.1389,
|
|
"mean_token_accuracy": 0.15060140788555146,
|
|
"num_tokens": 5555158.0,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"entropy": 6.116889953613281,
|
|
"epoch": 0.2635994348286824,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004996530986036008,
|
|
"loss": 6.0795,
|
|
"mean_token_accuracy": 0.15272270664572715,
|
|
"num_tokens": 5564218.0,
|
|
"step": 2985
|
|
},
|
|
{
|
|
"entropy": 6.255494451522827,
|
|
"epoch": 0.2640409749205228,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004996513483576907,
|
|
"loss": 6.2219,
|
|
"mean_token_accuracy": 0.14951637461781503,
|
|
"num_tokens": 5572760.0,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"entropy": 6.423755645751953,
|
|
"epoch": 0.26448251501236314,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996495937109884,
|
|
"loss": 6.2825,
|
|
"mean_token_accuracy": 0.14191085398197173,
|
|
"num_tokens": 5581660.0,
|
|
"step": 2995
|
|
},
|
|
{
|
|
"entropy": 6.250067615509034,
|
|
"epoch": 0.26492405510420347,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004996478346635283,
|
|
"loss": 6.1968,
|
|
"mean_token_accuracy": 0.1436440147459507,
|
|
"num_tokens": 5590664.0,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.26492405510420347,
|
|
"eval_entropy": 6.077911184598204,
|
|
"eval_loss": 6.2711029052734375,
|
|
"eval_mean_token_accuracy": 0.15016848111384373,
|
|
"eval_num_tokens": 5590664.0,
|
|
"eval_runtime": 26.2453,
|
|
"eval_samples_per_second": 1345.574,
|
|
"eval_steps_per_second": 168.221,
|
|
"step": 3000
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 113230,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 10,
|
|
"save_steps": 3000,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 8221684432896000.0,
|
|
"train_batch_size": 16,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|