Model: fpadovani/eus-latn-100mb-after-ppt-shuff-dyck-100mb-ckpt500_seed3407 Source: Original Platform
24079 lines
660 KiB
JSON
24079 lines
660 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.0596962204168139,
|
|
"eval_steps": 3000,
|
|
"global_step": 12000,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 4.790674829483033,
|
|
"epoch": 0.0004415400918403391,
|
|
"grad_norm": 13.0625,
|
|
"learning_rate": 2e-06,
|
|
"loss": 14.4349,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 9390.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 4.818728256225586,
|
|
"epoch": 0.0008830801836806782,
|
|
"grad_norm": 14.1875,
|
|
"learning_rate": 4.5e-06,
|
|
"loss": 14.4117,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 18671.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 4.857943296432495,
|
|
"epoch": 0.0013246202755210173,
|
|
"grad_norm": 16.25,
|
|
"learning_rate": 7e-06,
|
|
"loss": 14.1693,
|
|
"mean_token_accuracy": 0.00014005602570250631,
|
|
"num_tokens": 27614.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 5.027469444274902,
|
|
"epoch": 0.0017661603673613563,
|
|
"grad_norm": 25.875,
|
|
"learning_rate": 9.5e-06,
|
|
"loss": 13.7713,
|
|
"mean_token_accuracy": 8.547008619643747e-05,
|
|
"num_tokens": 37850.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 6.5380439281463625,
|
|
"epoch": 0.0022077004592016957,
|
|
"grad_norm": 26.375,
|
|
"learning_rate": 1.2e-05,
|
|
"loss": 12.2509,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 47166.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 9.970003509521485,
|
|
"epoch": 0.0026492405510420347,
|
|
"grad_norm": 3.4375,
|
|
"learning_rate": 1.4500000000000002e-05,
|
|
"loss": 10.9371,
|
|
"mean_token_accuracy": 0.00022374301915988325,
|
|
"num_tokens": 55500.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 10.680026626586914,
|
|
"epoch": 0.0030907806428823736,
|
|
"grad_norm": 3.15625,
|
|
"learning_rate": 1.7000000000000003e-05,
|
|
"loss": 10.6238,
|
|
"mean_token_accuracy": 0.009453117521479726,
|
|
"num_tokens": 63851.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 10.703511428833007,
|
|
"epoch": 0.0035323207347227126,
|
|
"grad_norm": 3.21875,
|
|
"learning_rate": 1.95e-05,
|
|
"loss": 10.3602,
|
|
"mean_token_accuracy": 0.029037438705563544,
|
|
"num_tokens": 73697.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 10.651962184906006,
|
|
"epoch": 0.003973860826563052,
|
|
"grad_norm": 2.625,
|
|
"learning_rate": 2.2e-05,
|
|
"loss": 10.0115,
|
|
"mean_token_accuracy": 0.05894971713423729,
|
|
"num_tokens": 83000.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 10.439279747009277,
|
|
"epoch": 0.004415400918403391,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 2.4500000000000003e-05,
|
|
"loss": 9.8132,
|
|
"mean_token_accuracy": 0.05815875120460987,
|
|
"num_tokens": 92982.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 10.318083763122559,
|
|
"epoch": 0.00485694101024373,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 2.7e-05,
|
|
"loss": 9.6231,
|
|
"mean_token_accuracy": 0.05530005097389221,
|
|
"num_tokens": 101455.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 10.381121063232422,
|
|
"epoch": 0.005298481102084069,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 2.95e-05,
|
|
"loss": 9.5438,
|
|
"mean_token_accuracy": 0.05805549845099449,
|
|
"num_tokens": 110782.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 10.360444736480712,
|
|
"epoch": 0.005740021193924408,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 3.2e-05,
|
|
"loss": 9.4168,
|
|
"mean_token_accuracy": 0.060499183088541034,
|
|
"num_tokens": 119241.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 10.300647163391114,
|
|
"epoch": 0.006181561285764747,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 3.4500000000000005e-05,
|
|
"loss": 9.4178,
|
|
"mean_token_accuracy": 0.055320289358496665,
|
|
"num_tokens": 127903.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 10.332123184204102,
|
|
"epoch": 0.006623101377605086,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 3.7e-05,
|
|
"loss": 9.3721,
|
|
"mean_token_accuracy": 0.05736841931939125,
|
|
"num_tokens": 137370.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 10.290982055664063,
|
|
"epoch": 0.007064641469445425,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 3.95e-05,
|
|
"loss": 9.2214,
|
|
"mean_token_accuracy": 0.06618293710052967,
|
|
"num_tokens": 146582.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 10.196907424926758,
|
|
"epoch": 0.007506181561285765,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 4.2000000000000004e-05,
|
|
"loss": 9.1585,
|
|
"mean_token_accuracy": 0.05961471572518349,
|
|
"num_tokens": 154933.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 10.205323791503906,
|
|
"epoch": 0.007947721653126103,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 4.45e-05,
|
|
"loss": 9.1026,
|
|
"mean_token_accuracy": 0.072137650847435,
|
|
"num_tokens": 165157.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 10.10411615371704,
|
|
"epoch": 0.008389261744966443,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 4.7000000000000004e-05,
|
|
"loss": 8.9848,
|
|
"mean_token_accuracy": 0.0728946004062891,
|
|
"num_tokens": 174958.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 10.01873140335083,
|
|
"epoch": 0.008830801836806783,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 4.9500000000000004e-05,
|
|
"loss": 8.8889,
|
|
"mean_token_accuracy": 0.07516518756747245,
|
|
"num_tokens": 184256.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 9.956882572174072,
|
|
"epoch": 0.009272341928647121,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 5.2e-05,
|
|
"loss": 8.7839,
|
|
"mean_token_accuracy": 0.067950439453125,
|
|
"num_tokens": 192894.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 9.884513092041015,
|
|
"epoch": 0.00971388202048746,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 5.45e-05,
|
|
"loss": 8.6868,
|
|
"mean_token_accuracy": 0.07383731976151467,
|
|
"num_tokens": 202675.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 9.810705184936523,
|
|
"epoch": 0.010155422112327799,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 5.7e-05,
|
|
"loss": 8.624,
|
|
"mean_token_accuracy": 0.07006355635821819,
|
|
"num_tokens": 212261.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 9.743825721740723,
|
|
"epoch": 0.010596962204168139,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 5.9499999999999996e-05,
|
|
"loss": 8.599,
|
|
"mean_token_accuracy": 0.06874018795788288,
|
|
"num_tokens": 222329.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 9.528209781646728,
|
|
"epoch": 0.011038502296008477,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 6.2e-05,
|
|
"loss": 8.4477,
|
|
"mean_token_accuracy": 0.06682575456798076,
|
|
"num_tokens": 231247.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 9.442446994781495,
|
|
"epoch": 0.011480042387848817,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 6.450000000000001e-05,
|
|
"loss": 8.3412,
|
|
"mean_token_accuracy": 0.06921537183225154,
|
|
"num_tokens": 239978.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 9.272939491271973,
|
|
"epoch": 0.011921582479689156,
|
|
"grad_norm": 0.90625,
|
|
"learning_rate": 6.7e-05,
|
|
"loss": 8.3,
|
|
"mean_token_accuracy": 0.06850462295114994,
|
|
"num_tokens": 249735.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 9.222266483306885,
|
|
"epoch": 0.012363122571529495,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 6.950000000000001e-05,
|
|
"loss": 8.2344,
|
|
"mean_token_accuracy": 0.06959122642874718,
|
|
"num_tokens": 259369.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 8.956540203094482,
|
|
"epoch": 0.012804662663369834,
|
|
"grad_norm": 0.81640625,
|
|
"learning_rate": 7.2e-05,
|
|
"loss": 8.2305,
|
|
"mean_token_accuracy": 0.06539506763219834,
|
|
"num_tokens": 268645.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 8.88605546951294,
|
|
"epoch": 0.013246202755210172,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 7.45e-05,
|
|
"loss": 8.0685,
|
|
"mean_token_accuracy": 0.07155903875827789,
|
|
"num_tokens": 276667.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 8.623716259002686,
|
|
"epoch": 0.013687742847050512,
|
|
"grad_norm": 0.703125,
|
|
"learning_rate": 7.7e-05,
|
|
"loss": 8.0682,
|
|
"mean_token_accuracy": 0.07539896108210087,
|
|
"num_tokens": 286017.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 8.590844440460206,
|
|
"epoch": 0.01412928293889085,
|
|
"grad_norm": 0.83984375,
|
|
"learning_rate": 7.950000000000001e-05,
|
|
"loss": 8.0571,
|
|
"mean_token_accuracy": 0.07278457470238209,
|
|
"num_tokens": 295631.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 8.562520027160645,
|
|
"epoch": 0.01457082303073119,
|
|
"grad_norm": 0.86328125,
|
|
"learning_rate": 8.2e-05,
|
|
"loss": 8.0486,
|
|
"mean_token_accuracy": 0.06986252851784229,
|
|
"num_tokens": 304704.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 8.490843200683594,
|
|
"epoch": 0.01501236312257153,
|
|
"grad_norm": 0.7734375,
|
|
"learning_rate": 8.450000000000001e-05,
|
|
"loss": 8.0665,
|
|
"mean_token_accuracy": 0.07160350978374481,
|
|
"num_tokens": 314195.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 8.416227722167969,
|
|
"epoch": 0.015453903214411868,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 8.7e-05,
|
|
"loss": 8.0632,
|
|
"mean_token_accuracy": 0.07028085552155972,
|
|
"num_tokens": 323379.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 8.398184299468994,
|
|
"epoch": 0.015895443306252206,
|
|
"grad_norm": 0.734375,
|
|
"learning_rate": 8.95e-05,
|
|
"loss": 7.9637,
|
|
"mean_token_accuracy": 0.08185541778802871,
|
|
"num_tokens": 332322.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 8.336036014556885,
|
|
"epoch": 0.016336983398092548,
|
|
"grad_norm": 0.79296875,
|
|
"learning_rate": 9.2e-05,
|
|
"loss": 7.9427,
|
|
"mean_token_accuracy": 0.08073886930942535,
|
|
"num_tokens": 341735.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 8.337114715576172,
|
|
"epoch": 0.016778523489932886,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 9.45e-05,
|
|
"loss": 8.0349,
|
|
"mean_token_accuracy": 0.06938613168895244,
|
|
"num_tokens": 351209.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 8.39198350906372,
|
|
"epoch": 0.017220063581773224,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 9.7e-05,
|
|
"loss": 7.9792,
|
|
"mean_token_accuracy": 0.07559169828891754,
|
|
"num_tokens": 360467.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 8.235328102111817,
|
|
"epoch": 0.017661603673613566,
|
|
"grad_norm": 0.875,
|
|
"learning_rate": 9.95e-05,
|
|
"loss": 7.9423,
|
|
"mean_token_accuracy": 0.07669526152312756,
|
|
"num_tokens": 370361.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 8.374059581756592,
|
|
"epoch": 0.018103143765453904,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000102,
|
|
"loss": 8.0075,
|
|
"mean_token_accuracy": 0.07567069008946418,
|
|
"num_tokens": 380366.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 8.206629276275635,
|
|
"epoch": 0.018544683857294242,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00010449999999999999,
|
|
"loss": 7.9185,
|
|
"mean_token_accuracy": 0.08159190192818641,
|
|
"num_tokens": 390690.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 8.24603796005249,
|
|
"epoch": 0.01898622394913458,
|
|
"grad_norm": 0.87109375,
|
|
"learning_rate": 0.000107,
|
|
"loss": 7.9601,
|
|
"mean_token_accuracy": 0.0793293446302414,
|
|
"num_tokens": 400722.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 8.157498931884765,
|
|
"epoch": 0.01942776404097492,
|
|
"grad_norm": 0.84375,
|
|
"learning_rate": 0.0001095,
|
|
"loss": 7.8501,
|
|
"mean_token_accuracy": 0.08334142193198205,
|
|
"num_tokens": 410223.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 8.205572509765625,
|
|
"epoch": 0.01986930413281526,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000112,
|
|
"loss": 7.9021,
|
|
"mean_token_accuracy": 0.07716193869709968,
|
|
"num_tokens": 420214.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 8.172825717926026,
|
|
"epoch": 0.020310844224655598,
|
|
"grad_norm": 0.88671875,
|
|
"learning_rate": 0.0001145,
|
|
"loss": 7.8564,
|
|
"mean_token_accuracy": 0.08035471551120281,
|
|
"num_tokens": 429407.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 8.162760925292968,
|
|
"epoch": 0.02075238431649594,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00011700000000000001,
|
|
"loss": 7.8246,
|
|
"mean_token_accuracy": 0.07542734369635581,
|
|
"num_tokens": 438403.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 8.177341651916503,
|
|
"epoch": 0.021193924408336277,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00011949999999999999,
|
|
"loss": 7.8545,
|
|
"mean_token_accuracy": 0.08535856604576111,
|
|
"num_tokens": 447466.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 8.069422197341918,
|
|
"epoch": 0.021635464500176615,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.000122,
|
|
"loss": 7.9366,
|
|
"mean_token_accuracy": 0.07459555268287658,
|
|
"num_tokens": 457141.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 8.160084056854249,
|
|
"epoch": 0.022077004592016954,
|
|
"grad_norm": 0.86328125,
|
|
"learning_rate": 0.0001245,
|
|
"loss": 7.812,
|
|
"mean_token_accuracy": 0.08223466873168946,
|
|
"num_tokens": 465708.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 8.142998504638673,
|
|
"epoch": 0.022518544683857295,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000127,
|
|
"loss": 7.8339,
|
|
"mean_token_accuracy": 0.07565066292881965,
|
|
"num_tokens": 475369.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 8.075135421752929,
|
|
"epoch": 0.022960084775697633,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0001295,
|
|
"loss": 7.7972,
|
|
"mean_token_accuracy": 0.08645984381437302,
|
|
"num_tokens": 484249.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 8.122587871551513,
|
|
"epoch": 0.02340162486753797,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000132,
|
|
"loss": 7.8872,
|
|
"mean_token_accuracy": 0.07687325775623322,
|
|
"num_tokens": 493303.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 8.101485538482667,
|
|
"epoch": 0.023843164959378313,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00013450000000000002,
|
|
"loss": 7.8664,
|
|
"mean_token_accuracy": 0.0807331919670105,
|
|
"num_tokens": 501503.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 8.036290693283082,
|
|
"epoch": 0.02428470505121865,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00013700000000000002,
|
|
"loss": 7.8074,
|
|
"mean_token_accuracy": 0.08591768592596054,
|
|
"num_tokens": 509661.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 8.045488977432251,
|
|
"epoch": 0.02472624514305899,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0001395,
|
|
"loss": 7.7904,
|
|
"mean_token_accuracy": 0.08441019728779793,
|
|
"num_tokens": 519464.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 8.107398653030396,
|
|
"epoch": 0.025167785234899327,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00014199999999999998,
|
|
"loss": 7.7489,
|
|
"mean_token_accuracy": 0.08773190379142762,
|
|
"num_tokens": 527968.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 8.081705808639526,
|
|
"epoch": 0.02560932532673967,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0001445,
|
|
"loss": 7.7768,
|
|
"mean_token_accuracy": 0.0868467777967453,
|
|
"num_tokens": 537234.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 7.99565052986145,
|
|
"epoch": 0.026050865418580007,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000147,
|
|
"loss": 7.7747,
|
|
"mean_token_accuracy": 0.08527034223079681,
|
|
"num_tokens": 546398.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 8.011523675918578,
|
|
"epoch": 0.026492405510420345,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0001495,
|
|
"loss": 7.7616,
|
|
"mean_token_accuracy": 0.08982880860567093,
|
|
"num_tokens": 555362.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 8.107937812805176,
|
|
"epoch": 0.026933945602260687,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000152,
|
|
"loss": 7.8221,
|
|
"mean_token_accuracy": 0.07775180079042912,
|
|
"num_tokens": 564575.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 8.133016395568848,
|
|
"epoch": 0.027375485694101025,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00015450000000000001,
|
|
"loss": 7.8384,
|
|
"mean_token_accuracy": 0.08304800540208816,
|
|
"num_tokens": 573915.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 8.016209363937378,
|
|
"epoch": 0.027817025785941363,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000157,
|
|
"loss": 7.7322,
|
|
"mean_token_accuracy": 0.08581754639744758,
|
|
"num_tokens": 583216.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 7.982406425476074,
|
|
"epoch": 0.0282585658777817,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0001595,
|
|
"loss": 7.7553,
|
|
"mean_token_accuracy": 0.08679840788245201,
|
|
"num_tokens": 591955.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 7.9430736064910885,
|
|
"epoch": 0.028700105969622042,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000162,
|
|
"loss": 7.7588,
|
|
"mean_token_accuracy": 0.08934888392686843,
|
|
"num_tokens": 600999.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 8.070584297180176,
|
|
"epoch": 0.02914164606146238,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00016450000000000001,
|
|
"loss": 7.6563,
|
|
"mean_token_accuracy": 0.09217674285173416,
|
|
"num_tokens": 609478.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 7.987708568572998,
|
|
"epoch": 0.02958318615330272,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00016700000000000002,
|
|
"loss": 7.7225,
|
|
"mean_token_accuracy": 0.08663035854697228,
|
|
"num_tokens": 618348.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 7.911137056350708,
|
|
"epoch": 0.03002472624514306,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00016950000000000003,
|
|
"loss": 7.7691,
|
|
"mean_token_accuracy": 0.083287762850523,
|
|
"num_tokens": 628548.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 8.057271575927734,
|
|
"epoch": 0.0304662663369834,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00017199999999999998,
|
|
"loss": 7.7102,
|
|
"mean_token_accuracy": 0.08196588605642319,
|
|
"num_tokens": 637489.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 7.939978122711182,
|
|
"epoch": 0.030907806428823736,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.00017449999999999999,
|
|
"loss": 7.689,
|
|
"mean_token_accuracy": 0.0814521424472332,
|
|
"num_tokens": 646715.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 7.897878551483155,
|
|
"epoch": 0.031349346520664075,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.000177,
|
|
"loss": 7.6894,
|
|
"mean_token_accuracy": 0.08998864293098449,
|
|
"num_tokens": 656858.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 8.019395637512208,
|
|
"epoch": 0.03179088661250441,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0001795,
|
|
"loss": 7.6755,
|
|
"mean_token_accuracy": 0.08710955381393433,
|
|
"num_tokens": 665968.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 8.001319217681885,
|
|
"epoch": 0.03223242670434476,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.000182,
|
|
"loss": 7.6655,
|
|
"mean_token_accuracy": 0.08621685430407525,
|
|
"num_tokens": 674295.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 7.810992002487183,
|
|
"epoch": 0.032673966796185096,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0001845,
|
|
"loss": 7.591,
|
|
"mean_token_accuracy": 0.08370614722371102,
|
|
"num_tokens": 683559.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 7.816927337646485,
|
|
"epoch": 0.033115506888025434,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000187,
|
|
"loss": 7.6253,
|
|
"mean_token_accuracy": 0.08996079638600349,
|
|
"num_tokens": 692402.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 7.967683601379394,
|
|
"epoch": 0.03355704697986577,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0001895,
|
|
"loss": 7.7304,
|
|
"mean_token_accuracy": 0.08065761215984821,
|
|
"num_tokens": 702052.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 8.058749055862426,
|
|
"epoch": 0.03399858707170611,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.000192,
|
|
"loss": 7.6528,
|
|
"mean_token_accuracy": 0.08705045655369759,
|
|
"num_tokens": 711926.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 7.8771873950958256,
|
|
"epoch": 0.03444012716354645,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0001945,
|
|
"loss": 7.612,
|
|
"mean_token_accuracy": 0.08773906156420708,
|
|
"num_tokens": 720948.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 7.893786334991455,
|
|
"epoch": 0.034881667255386786,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00019700000000000002,
|
|
"loss": 7.6301,
|
|
"mean_token_accuracy": 0.09444142654538154,
|
|
"num_tokens": 729611.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 7.892533588409424,
|
|
"epoch": 0.03532320734722713,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.00019950000000000002,
|
|
"loss": 7.6187,
|
|
"mean_token_accuracy": 0.08193654045462609,
|
|
"num_tokens": 738433.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 7.945340347290039,
|
|
"epoch": 0.03576474743906747,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000202,
|
|
"loss": 7.6397,
|
|
"mean_token_accuracy": 0.08668759167194366,
|
|
"num_tokens": 747310.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 7.854477500915527,
|
|
"epoch": 0.03620628753090781,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.00020449999999999998,
|
|
"loss": 7.5994,
|
|
"mean_token_accuracy": 0.09020926207304,
|
|
"num_tokens": 756362.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 7.90778489112854,
|
|
"epoch": 0.036647827622748146,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.000207,
|
|
"loss": 7.6034,
|
|
"mean_token_accuracy": 0.08586042672395706,
|
|
"num_tokens": 764978.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 7.87300386428833,
|
|
"epoch": 0.037089367714588484,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0002095,
|
|
"loss": 7.5707,
|
|
"mean_token_accuracy": 0.09018185958266259,
|
|
"num_tokens": 774058.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 7.795767593383789,
|
|
"epoch": 0.03753090780642882,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000212,
|
|
"loss": 7.5492,
|
|
"mean_token_accuracy": 0.0897379383444786,
|
|
"num_tokens": 783332.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 7.853004789352417,
|
|
"epoch": 0.03797244789826916,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0002145,
|
|
"loss": 7.6298,
|
|
"mean_token_accuracy": 0.08684360906481743,
|
|
"num_tokens": 792481.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 7.766995525360107,
|
|
"epoch": 0.038413987990109505,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00021700000000000002,
|
|
"loss": 7.5212,
|
|
"mean_token_accuracy": 0.09301207512617111,
|
|
"num_tokens": 801396.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 7.8428326606750485,
|
|
"epoch": 0.03885552808194984,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0002195,
|
|
"loss": 7.5865,
|
|
"mean_token_accuracy": 0.08940735682845116,
|
|
"num_tokens": 809903.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 7.828377294540405,
|
|
"epoch": 0.03929706817379018,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.000222,
|
|
"loss": 7.5389,
|
|
"mean_token_accuracy": 0.0962544821202755,
|
|
"num_tokens": 819144.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 7.7183678150177,
|
|
"epoch": 0.03973860826563052,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0002245,
|
|
"loss": 7.5608,
|
|
"mean_token_accuracy": 0.08849129751324654,
|
|
"num_tokens": 828881.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 7.764478397369385,
|
|
"epoch": 0.04018014835747086,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00022700000000000002,
|
|
"loss": 7.4877,
|
|
"mean_token_accuracy": 0.08765893578529357,
|
|
"num_tokens": 837588.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 7.767373847961426,
|
|
"epoch": 0.040621688449311195,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00022950000000000002,
|
|
"loss": 7.4849,
|
|
"mean_token_accuracy": 0.09265839084982871,
|
|
"num_tokens": 847002.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 7.726333475112915,
|
|
"epoch": 0.041063228541151534,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00023200000000000003,
|
|
"loss": 7.4945,
|
|
"mean_token_accuracy": 0.09333177357912063,
|
|
"num_tokens": 855791.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 7.7462080955505375,
|
|
"epoch": 0.04150476863299188,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00023449999999999998,
|
|
"loss": 7.5242,
|
|
"mean_token_accuracy": 0.0911882683634758,
|
|
"num_tokens": 865392.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 7.736569499969482,
|
|
"epoch": 0.04194630872483222,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000237,
|
|
"loss": 7.5873,
|
|
"mean_token_accuracy": 0.08873779252171517,
|
|
"num_tokens": 874807.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 7.758917284011841,
|
|
"epoch": 0.042387848816672555,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0002395,
|
|
"loss": 7.5409,
|
|
"mean_token_accuracy": 0.09495326653122901,
|
|
"num_tokens": 883928.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 7.777913904190063,
|
|
"epoch": 0.04282938890851289,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000242,
|
|
"loss": 7.4436,
|
|
"mean_token_accuracy": 0.09124857932329178,
|
|
"num_tokens": 893047.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 7.662859010696411,
|
|
"epoch": 0.04327092900035323,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0002445,
|
|
"loss": 7.4593,
|
|
"mean_token_accuracy": 0.09315531030297279,
|
|
"num_tokens": 901645.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 7.743328475952149,
|
|
"epoch": 0.04371246909219357,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000247,
|
|
"loss": 7.4727,
|
|
"mean_token_accuracy": 0.09244368895888329,
|
|
"num_tokens": 911169.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 7.7239625453948975,
|
|
"epoch": 0.04415400918403391,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0002495,
|
|
"loss": 7.4748,
|
|
"mean_token_accuracy": 0.08498905003070831,
|
|
"num_tokens": 921382.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 7.544922304153443,
|
|
"epoch": 0.04459554927587425,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000252,
|
|
"loss": 7.4121,
|
|
"mean_token_accuracy": 0.09429771155118942,
|
|
"num_tokens": 930409.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 7.67856912612915,
|
|
"epoch": 0.04503708936771459,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0002545,
|
|
"loss": 7.3879,
|
|
"mean_token_accuracy": 0.09879431128501892,
|
|
"num_tokens": 939049.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 7.718625736236572,
|
|
"epoch": 0.04547862945955493,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000257,
|
|
"loss": 7.395,
|
|
"mean_token_accuracy": 0.0960740551352501,
|
|
"num_tokens": 947575.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"entropy": 7.709804058074951,
|
|
"epoch": 0.045920169551395267,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0002595,
|
|
"loss": 7.5643,
|
|
"mean_token_accuracy": 0.09047991409897804,
|
|
"num_tokens": 957848.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 7.655015087127685,
|
|
"epoch": 0.046361709643235605,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000262,
|
|
"loss": 7.3857,
|
|
"mean_token_accuracy": 0.09998803585767746,
|
|
"num_tokens": 966521.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 7.688518905639649,
|
|
"epoch": 0.04680324973507594,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00026450000000000003,
|
|
"loss": 7.4461,
|
|
"mean_token_accuracy": 0.09324755370616913,
|
|
"num_tokens": 975827.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 7.606715154647827,
|
|
"epoch": 0.04724478982691628,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00026700000000000004,
|
|
"loss": 7.4075,
|
|
"mean_token_accuracy": 0.09566703587770461,
|
|
"num_tokens": 985292.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"entropy": 7.616068124771118,
|
|
"epoch": 0.047686329918756626,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00026950000000000005,
|
|
"loss": 7.3841,
|
|
"mean_token_accuracy": 0.09411159604787826,
|
|
"num_tokens": 994791.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 7.51567816734314,
|
|
"epoch": 0.048127870010596964,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00027200000000000005,
|
|
"loss": 7.3594,
|
|
"mean_token_accuracy": 0.1026044063270092,
|
|
"num_tokens": 1003700.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"entropy": 7.510391616821289,
|
|
"epoch": 0.0485694101024373,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0002745,
|
|
"loss": 7.381,
|
|
"mean_token_accuracy": 0.09829011410474778,
|
|
"num_tokens": 1012682.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 7.683912038803101,
|
|
"epoch": 0.04901095019427764,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000277,
|
|
"loss": 7.4075,
|
|
"mean_token_accuracy": 0.09179475829005242,
|
|
"num_tokens": 1021018.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"entropy": 7.570155191421509,
|
|
"epoch": 0.04945249028611798,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0002795,
|
|
"loss": 7.2759,
|
|
"mean_token_accuracy": 0.09699172824621201,
|
|
"num_tokens": 1029744.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 7.495694351196289,
|
|
"epoch": 0.049894030377958316,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00028199999999999997,
|
|
"loss": 7.3605,
|
|
"mean_token_accuracy": 0.09879247918725013,
|
|
"num_tokens": 1038805.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"entropy": 7.5144976615905765,
|
|
"epoch": 0.050335570469798654,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0002845,
|
|
"loss": 7.4131,
|
|
"mean_token_accuracy": 0.0988279327750206,
|
|
"num_tokens": 1047656.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 7.647522783279419,
|
|
"epoch": 0.050777110561639,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000287,
|
|
"loss": 7.4048,
|
|
"mean_token_accuracy": 0.09629088416695594,
|
|
"num_tokens": 1056598.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"entropy": 7.6095935821533205,
|
|
"epoch": 0.05121865065347934,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0002895,
|
|
"loss": 7.3994,
|
|
"mean_token_accuracy": 0.09847217947244644,
|
|
"num_tokens": 1065226.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 7.529495334625244,
|
|
"epoch": 0.051660190745319676,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000292,
|
|
"loss": 7.3208,
|
|
"mean_token_accuracy": 0.10098938867449761,
|
|
"num_tokens": 1074661.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"entropy": 7.503559398651123,
|
|
"epoch": 0.052101730837160014,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0002945,
|
|
"loss": 7.4251,
|
|
"mean_token_accuracy": 0.09441772177815437,
|
|
"num_tokens": 1083921.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 7.540312194824219,
|
|
"epoch": 0.05254327092900035,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000297,
|
|
"loss": 7.3162,
|
|
"mean_token_accuracy": 0.104298634827137,
|
|
"num_tokens": 1093399.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"entropy": 7.5133528232574465,
|
|
"epoch": 0.05298481102084069,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0002995,
|
|
"loss": 7.391,
|
|
"mean_token_accuracy": 0.09825902208685874,
|
|
"num_tokens": 1104065.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 7.433008003234863,
|
|
"epoch": 0.05342635111268103,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000302,
|
|
"loss": 7.2778,
|
|
"mean_token_accuracy": 0.1005440428853035,
|
|
"num_tokens": 1112995.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"entropy": 7.47243971824646,
|
|
"epoch": 0.05386789120452137,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003045,
|
|
"loss": 7.3103,
|
|
"mean_token_accuracy": 0.10175202563405036,
|
|
"num_tokens": 1121637.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 7.455365610122681,
|
|
"epoch": 0.05430943129636171,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000307,
|
|
"loss": 7.2549,
|
|
"mean_token_accuracy": 0.09826337993144989,
|
|
"num_tokens": 1131166.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"entropy": 7.4712036609649655,
|
|
"epoch": 0.05475097138820205,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0003095,
|
|
"loss": 7.2562,
|
|
"mean_token_accuracy": 0.10475531965494156,
|
|
"num_tokens": 1140888.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 7.551609897613526,
|
|
"epoch": 0.05519251148004239,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000312,
|
|
"loss": 7.4148,
|
|
"mean_token_accuracy": 0.0961816966533661,
|
|
"num_tokens": 1150278.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"entropy": 7.433546924591065,
|
|
"epoch": 0.055634051571882726,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0003145,
|
|
"loss": 7.3742,
|
|
"mean_token_accuracy": 0.0970606379210949,
|
|
"num_tokens": 1159348.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 7.624134588241577,
|
|
"epoch": 0.056075591663723064,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000317,
|
|
"loss": 7.3756,
|
|
"mean_token_accuracy": 0.0949991799890995,
|
|
"num_tokens": 1168883.0,
|
|
"step": 635
|
|
},
|
|
{
|
|
"entropy": 7.48681526184082,
|
|
"epoch": 0.0565171317555634,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0003195,
|
|
"loss": 7.269,
|
|
"mean_token_accuracy": 0.1064944364130497,
|
|
"num_tokens": 1178572.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 7.44178466796875,
|
|
"epoch": 0.05695867184740375,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000322,
|
|
"loss": 7.3576,
|
|
"mean_token_accuracy": 0.0987204596400261,
|
|
"num_tokens": 1188909.0,
|
|
"step": 645
|
|
},
|
|
{
|
|
"entropy": 7.466546869277954,
|
|
"epoch": 0.057400211939244085,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00032450000000000003,
|
|
"loss": 7.2649,
|
|
"mean_token_accuracy": 0.09890259429812431,
|
|
"num_tokens": 1197705.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 7.450878572463989,
|
|
"epoch": 0.05784175203108442,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00032700000000000003,
|
|
"loss": 7.1061,
|
|
"mean_token_accuracy": 0.10522415414452553,
|
|
"num_tokens": 1206351.0,
|
|
"step": 655
|
|
},
|
|
{
|
|
"entropy": 7.340301847457885,
|
|
"epoch": 0.05828329212292476,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00032950000000000004,
|
|
"loss": 7.2237,
|
|
"mean_token_accuracy": 0.09693196043372154,
|
|
"num_tokens": 1214984.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 7.4402018070220945,
|
|
"epoch": 0.0587248322147651,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00033200000000000005,
|
|
"loss": 7.2999,
|
|
"mean_token_accuracy": 0.09738482013344765,
|
|
"num_tokens": 1224485.0,
|
|
"step": 665
|
|
},
|
|
{
|
|
"entropy": 7.435847473144531,
|
|
"epoch": 0.05916637230660544,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00033450000000000005,
|
|
"loss": 7.3266,
|
|
"mean_token_accuracy": 0.09173622950911522,
|
|
"num_tokens": 1233560.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 7.428315305709839,
|
|
"epoch": 0.05960791239844578,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000337,
|
|
"loss": 7.2436,
|
|
"mean_token_accuracy": 0.09967414885759354,
|
|
"num_tokens": 1242628.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"entropy": 7.388672494888306,
|
|
"epoch": 0.06004945249028612,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0003395,
|
|
"loss": 7.1697,
|
|
"mean_token_accuracy": 0.10538085550069809,
|
|
"num_tokens": 1251004.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 7.459445238113403,
|
|
"epoch": 0.06049099258212646,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000342,
|
|
"loss": 7.3463,
|
|
"mean_token_accuracy": 0.09609238728880883,
|
|
"num_tokens": 1260344.0,
|
|
"step": 685
|
|
},
|
|
{
|
|
"entropy": 7.343485164642334,
|
|
"epoch": 0.0609325326739668,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00034449999999999997,
|
|
"loss": 7.2517,
|
|
"mean_token_accuracy": 0.09760257676243782,
|
|
"num_tokens": 1269988.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 7.340139007568359,
|
|
"epoch": 0.061374072765807135,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000347,
|
|
"loss": 7.2126,
|
|
"mean_token_accuracy": 0.10715288370847702,
|
|
"num_tokens": 1280912.0,
|
|
"step": 695
|
|
},
|
|
{
|
|
"entropy": 7.350299119949341,
|
|
"epoch": 0.06181561285764747,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0003495,
|
|
"loss": 7.2246,
|
|
"mean_token_accuracy": 0.10604915320873261,
|
|
"num_tokens": 1289684.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 7.427703905105591,
|
|
"epoch": 0.06225715294948781,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000352,
|
|
"loss": 7.2534,
|
|
"mean_token_accuracy": 0.09802542477846146,
|
|
"num_tokens": 1298853.0,
|
|
"step": 705
|
|
},
|
|
{
|
|
"entropy": 7.3198949813842775,
|
|
"epoch": 0.06269869304132815,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0003545,
|
|
"loss": 7.214,
|
|
"mean_token_accuracy": 0.10874532908201218,
|
|
"num_tokens": 1309112.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 7.372763156890869,
|
|
"epoch": 0.0631402331331685,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000357,
|
|
"loss": 7.1981,
|
|
"mean_token_accuracy": 0.10583075731992722,
|
|
"num_tokens": 1319064.0,
|
|
"step": 715
|
|
},
|
|
{
|
|
"entropy": 7.269387340545654,
|
|
"epoch": 0.06358177322500883,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003595,
|
|
"loss": 7.1748,
|
|
"mean_token_accuracy": 0.1100200168788433,
|
|
"num_tokens": 1327889.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 7.324726533889771,
|
|
"epoch": 0.06402331331684917,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000362,
|
|
"loss": 7.1938,
|
|
"mean_token_accuracy": 0.10258080512285232,
|
|
"num_tokens": 1337241.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"entropy": 7.319574499130249,
|
|
"epoch": 0.06446485340868952,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003645,
|
|
"loss": 7.2533,
|
|
"mean_token_accuracy": 0.10085726305842399,
|
|
"num_tokens": 1346527.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 7.314885807037354,
|
|
"epoch": 0.06490639350052985,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000367,
|
|
"loss": 7.2315,
|
|
"mean_token_accuracy": 0.10445504561066628,
|
|
"num_tokens": 1355677.0,
|
|
"step": 735
|
|
},
|
|
{
|
|
"entropy": 7.396700429916382,
|
|
"epoch": 0.06534793359237019,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0003695,
|
|
"loss": 7.2163,
|
|
"mean_token_accuracy": 0.10588330775499344,
|
|
"num_tokens": 1364874.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 7.285468482971192,
|
|
"epoch": 0.06578947368421052,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000372,
|
|
"loss": 7.1378,
|
|
"mean_token_accuracy": 0.1090671844780445,
|
|
"num_tokens": 1373717.0,
|
|
"step": 745
|
|
},
|
|
{
|
|
"entropy": 7.375531625747681,
|
|
"epoch": 0.06623101377605087,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0003745,
|
|
"loss": 7.0955,
|
|
"mean_token_accuracy": 0.10741576477885247,
|
|
"num_tokens": 1382767.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 7.1357104778289795,
|
|
"epoch": 0.0666725538678912,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000377,
|
|
"loss": 7.1389,
|
|
"mean_token_accuracy": 0.10613262876868249,
|
|
"num_tokens": 1391190.0,
|
|
"step": 755
|
|
},
|
|
{
|
|
"entropy": 7.234480524063111,
|
|
"epoch": 0.06711409395973154,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0003795,
|
|
"loss": 7.1509,
|
|
"mean_token_accuracy": 0.10508784130215645,
|
|
"num_tokens": 1400722.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"entropy": 7.402392435073852,
|
|
"epoch": 0.06755563405157189,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000382,
|
|
"loss": 7.1779,
|
|
"mean_token_accuracy": 0.10437385067343712,
|
|
"num_tokens": 1409328.0,
|
|
"step": 765
|
|
},
|
|
{
|
|
"entropy": 7.06873927116394,
|
|
"epoch": 0.06799717414341222,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0003845,
|
|
"loss": 7.0531,
|
|
"mean_token_accuracy": 0.11192933171987533,
|
|
"num_tokens": 1418504.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"entropy": 7.440014839172363,
|
|
"epoch": 0.06843871423525257,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00038700000000000003,
|
|
"loss": 7.1989,
|
|
"mean_token_accuracy": 0.10317453742027283,
|
|
"num_tokens": 1427690.0,
|
|
"step": 775
|
|
},
|
|
{
|
|
"entropy": 7.181108903884888,
|
|
"epoch": 0.0688802543270929,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00038950000000000003,
|
|
"loss": 7.1806,
|
|
"mean_token_accuracy": 0.10798285007476807,
|
|
"num_tokens": 1436798.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"entropy": 7.2046185493469235,
|
|
"epoch": 0.06932179441893324,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.00039200000000000004,
|
|
"loss": 7.1646,
|
|
"mean_token_accuracy": 0.10358999595046044,
|
|
"num_tokens": 1446357.0,
|
|
"step": 785
|
|
},
|
|
{
|
|
"entropy": 7.2555629253387455,
|
|
"epoch": 0.06976333451077357,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.00039450000000000005,
|
|
"loss": 7.0882,
|
|
"mean_token_accuracy": 0.11000654250383377,
|
|
"num_tokens": 1455998.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"entropy": 7.207996559143067,
|
|
"epoch": 0.07020487460261392,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00039700000000000005,
|
|
"loss": 7.145,
|
|
"mean_token_accuracy": 0.09857687279582024,
|
|
"num_tokens": 1465237.0,
|
|
"step": 795
|
|
},
|
|
{
|
|
"entropy": 7.24621729850769,
|
|
"epoch": 0.07064641469445426,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0003995,
|
|
"loss": 7.0958,
|
|
"mean_token_accuracy": 0.11087250858545303,
|
|
"num_tokens": 1474363.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 7.272359037399292,
|
|
"epoch": 0.0710879547862946,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000402,
|
|
"loss": 7.1713,
|
|
"mean_token_accuracy": 0.10843008160591125,
|
|
"num_tokens": 1483379.0,
|
|
"step": 805
|
|
},
|
|
{
|
|
"entropy": 7.28739447593689,
|
|
"epoch": 0.07152949487813494,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004045,
|
|
"loss": 7.1265,
|
|
"mean_token_accuracy": 0.10922098532319069,
|
|
"num_tokens": 1492507.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"entropy": 7.144436979293824,
|
|
"epoch": 0.07197103496997527,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00040699999999999997,
|
|
"loss": 7.0154,
|
|
"mean_token_accuracy": 0.11775125116109848,
|
|
"num_tokens": 1500888.0,
|
|
"step": 815
|
|
},
|
|
{
|
|
"entropy": 7.11500997543335,
|
|
"epoch": 0.07241257506181561,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004095,
|
|
"loss": 7.0709,
|
|
"mean_token_accuracy": 0.10802061259746551,
|
|
"num_tokens": 1510310.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"entropy": 7.1448290824890135,
|
|
"epoch": 0.07285411515365595,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000412,
|
|
"loss": 7.0494,
|
|
"mean_token_accuracy": 0.11422519460320472,
|
|
"num_tokens": 1519427.0,
|
|
"step": 825
|
|
},
|
|
{
|
|
"entropy": 7.2035074710845945,
|
|
"epoch": 0.07329565524549629,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004145,
|
|
"loss": 7.0679,
|
|
"mean_token_accuracy": 0.1063395880162716,
|
|
"num_tokens": 1529456.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"entropy": 7.131991720199585,
|
|
"epoch": 0.07373719533733664,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000417,
|
|
"loss": 7.0241,
|
|
"mean_token_accuracy": 0.11403456106781959,
|
|
"num_tokens": 1537695.0,
|
|
"step": 835
|
|
},
|
|
{
|
|
"entropy": 7.203299617767334,
|
|
"epoch": 0.07417873542917697,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004195,
|
|
"loss": 7.1456,
|
|
"mean_token_accuracy": 0.10954299196600914,
|
|
"num_tokens": 1547511.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"entropy": 7.255322885513306,
|
|
"epoch": 0.07462027552101731,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000422,
|
|
"loss": 7.1315,
|
|
"mean_token_accuracy": 0.1110302060842514,
|
|
"num_tokens": 1557035.0,
|
|
"step": 845
|
|
},
|
|
{
|
|
"entropy": 7.1888104438781735,
|
|
"epoch": 0.07506181561285764,
|
|
"grad_norm": 0.90234375,
|
|
"learning_rate": 0.0004245,
|
|
"loss": 7.0906,
|
|
"mean_token_accuracy": 0.11411306262016296,
|
|
"num_tokens": 1566773.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 7.094766998291016,
|
|
"epoch": 0.07550335570469799,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000427,
|
|
"loss": 7.0787,
|
|
"mean_token_accuracy": 0.10884842053055763,
|
|
"num_tokens": 1576873.0,
|
|
"step": 855
|
|
},
|
|
{
|
|
"entropy": 7.178222894668579,
|
|
"epoch": 0.07594489579653832,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004295,
|
|
"loss": 7.111,
|
|
"mean_token_accuracy": 0.10762306824326515,
|
|
"num_tokens": 1586170.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"entropy": 7.286298131942749,
|
|
"epoch": 0.07638643588837866,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000432,
|
|
"loss": 7.154,
|
|
"mean_token_accuracy": 0.10613771453499794,
|
|
"num_tokens": 1596054.0,
|
|
"step": 865
|
|
},
|
|
{
|
|
"entropy": 7.1001307487487795,
|
|
"epoch": 0.07682797598021901,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004345,
|
|
"loss": 7.0262,
|
|
"mean_token_accuracy": 0.11607334911823272,
|
|
"num_tokens": 1604544.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"entropy": 7.172781848907471,
|
|
"epoch": 0.07726951607205934,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000437,
|
|
"loss": 7.0446,
|
|
"mean_token_accuracy": 0.11472792029380799,
|
|
"num_tokens": 1614580.0,
|
|
"step": 875
|
|
},
|
|
{
|
|
"entropy": 7.132223224639892,
|
|
"epoch": 0.07771105616389969,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004395,
|
|
"loss": 7.1232,
|
|
"mean_token_accuracy": 0.1109985999763012,
|
|
"num_tokens": 1624701.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"entropy": 7.128903436660766,
|
|
"epoch": 0.07815259625574002,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000442,
|
|
"loss": 7.0573,
|
|
"mean_token_accuracy": 0.10825628340244293,
|
|
"num_tokens": 1634085.0,
|
|
"step": 885
|
|
},
|
|
{
|
|
"entropy": 7.123282432556152,
|
|
"epoch": 0.07859413634758036,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004445,
|
|
"loss": 7.0322,
|
|
"mean_token_accuracy": 0.11617021560668946,
|
|
"num_tokens": 1643190.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"entropy": 7.060208940505982,
|
|
"epoch": 0.0790356764394207,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000447,
|
|
"loss": 7.07,
|
|
"mean_token_accuracy": 0.11254222765564918,
|
|
"num_tokens": 1652705.0,
|
|
"step": 895
|
|
},
|
|
{
|
|
"entropy": 7.132848882675171,
|
|
"epoch": 0.07947721653126104,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.00044950000000000003,
|
|
"loss": 7.0536,
|
|
"mean_token_accuracy": 0.10692465007305145,
|
|
"num_tokens": 1662210.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"entropy": 7.192712593078613,
|
|
"epoch": 0.07991875662310138,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.00045200000000000004,
|
|
"loss": 7.1014,
|
|
"mean_token_accuracy": 0.10652303621172905,
|
|
"num_tokens": 1671893.0,
|
|
"step": 905
|
|
},
|
|
{
|
|
"entropy": 7.059550428390503,
|
|
"epoch": 0.08036029671494171,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.00045450000000000004,
|
|
"loss": 7.0402,
|
|
"mean_token_accuracy": 0.11181816533207893,
|
|
"num_tokens": 1681217.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"entropy": 7.149940156936646,
|
|
"epoch": 0.08080183680678206,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.00045700000000000005,
|
|
"loss": 7.053,
|
|
"mean_token_accuracy": 0.11131602600216865,
|
|
"num_tokens": 1690447.0,
|
|
"step": 915
|
|
},
|
|
{
|
|
"entropy": 7.081046295166016,
|
|
"epoch": 0.08124337689862239,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00045950000000000006,
|
|
"loss": 7.1332,
|
|
"mean_token_accuracy": 0.10568991601467133,
|
|
"num_tokens": 1700355.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"entropy": 7.16390905380249,
|
|
"epoch": 0.08168491699046274,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000462,
|
|
"loss": 7.028,
|
|
"mean_token_accuracy": 0.10254786685109138,
|
|
"num_tokens": 1709449.0,
|
|
"step": 925
|
|
},
|
|
{
|
|
"entropy": 7.091014242172241,
|
|
"epoch": 0.08212645708230307,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004645,
|
|
"loss": 7.058,
|
|
"mean_token_accuracy": 0.10658924430608749,
|
|
"num_tokens": 1718838.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"entropy": 7.023260927200317,
|
|
"epoch": 0.08256799717414341,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000467,
|
|
"loss": 7.0217,
|
|
"mean_token_accuracy": 0.11323517858982086,
|
|
"num_tokens": 1728594.0,
|
|
"step": 935
|
|
},
|
|
{
|
|
"entropy": 7.1843287467956545,
|
|
"epoch": 0.08300953726598376,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004695,
|
|
"loss": 7.0731,
|
|
"mean_token_accuracy": 0.11138227805495263,
|
|
"num_tokens": 1738814.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"entropy": 7.071042823791504,
|
|
"epoch": 0.08345107735782409,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000472,
|
|
"loss": 7.0089,
|
|
"mean_token_accuracy": 0.11532488241791725,
|
|
"num_tokens": 1747644.0,
|
|
"step": 945
|
|
},
|
|
{
|
|
"entropy": 7.104792213439941,
|
|
"epoch": 0.08389261744966443,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004745,
|
|
"loss": 7.0338,
|
|
"mean_token_accuracy": 0.11352440416812896,
|
|
"num_tokens": 1757489.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"entropy": 6.995518827438355,
|
|
"epoch": 0.08433415754150476,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000477,
|
|
"loss": 6.9488,
|
|
"mean_token_accuracy": 0.11878458335995674,
|
|
"num_tokens": 1767546.0,
|
|
"step": 955
|
|
},
|
|
{
|
|
"entropy": 7.094525289535523,
|
|
"epoch": 0.08477569763334511,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004795,
|
|
"loss": 7.0088,
|
|
"mean_token_accuracy": 0.10509251430630684,
|
|
"num_tokens": 1776035.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"entropy": 7.100050449371338,
|
|
"epoch": 0.08521723772518544,
|
|
"grad_norm": 0.8984375,
|
|
"learning_rate": 0.000482,
|
|
"loss": 7.0869,
|
|
"mean_token_accuracy": 0.10708501487970352,
|
|
"num_tokens": 1786161.0,
|
|
"step": 965
|
|
},
|
|
{
|
|
"entropy": 7.161181020736694,
|
|
"epoch": 0.08565877781702579,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004845,
|
|
"loss": 7.1355,
|
|
"mean_token_accuracy": 0.10680384710431098,
|
|
"num_tokens": 1796093.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"entropy": 7.064108896255493,
|
|
"epoch": 0.08610031790886613,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000487,
|
|
"loss": 6.9829,
|
|
"mean_token_accuracy": 0.1097193941473961,
|
|
"num_tokens": 1805574.0,
|
|
"step": 975
|
|
},
|
|
{
|
|
"entropy": 7.07778491973877,
|
|
"epoch": 0.08654185800070646,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004895,
|
|
"loss": 7.0171,
|
|
"mean_token_accuracy": 0.11008013710379601,
|
|
"num_tokens": 1815175.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"entropy": 7.017868852615356,
|
|
"epoch": 0.08698339809254681,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000492,
|
|
"loss": 6.9322,
|
|
"mean_token_accuracy": 0.11862852200865745,
|
|
"num_tokens": 1824683.0,
|
|
"step": 985
|
|
},
|
|
{
|
|
"entropy": 7.004701948165893,
|
|
"epoch": 0.08742493818438714,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004945,
|
|
"loss": 6.9091,
|
|
"mean_token_accuracy": 0.1145630083978176,
|
|
"num_tokens": 1833174.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"entropy": 7.008507776260376,
|
|
"epoch": 0.08786647827622748,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000497,
|
|
"loss": 7.0124,
|
|
"mean_token_accuracy": 0.1165225401520729,
|
|
"num_tokens": 1842409.0,
|
|
"step": 995
|
|
},
|
|
{
|
|
"entropy": 6.900066137313843,
|
|
"epoch": 0.08830801836806781,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004995,
|
|
"loss": 6.9172,
|
|
"mean_token_accuracy": 0.1189465768635273,
|
|
"num_tokens": 1851441.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"entropy": 7.234589004516602,
|
|
"epoch": 0.08874955845990816,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000499999998589561,
|
|
"loss": 7.0545,
|
|
"mean_token_accuracy": 0.1098681665956974,
|
|
"num_tokens": 1861188.0,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"entropy": 6.916832828521729,
|
|
"epoch": 0.0891910985517485,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004999999928596523,
|
|
"loss": 6.9934,
|
|
"mean_token_accuracy": 0.1134356640279293,
|
|
"num_tokens": 1870284.0,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"entropy": 6.8979510307312015,
|
|
"epoch": 0.08963263864358884,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999999827221219,
|
|
"loss": 6.9508,
|
|
"mean_token_accuracy": 0.11564922854304313,
|
|
"num_tokens": 1879744.0,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"entropy": 7.141992807388306,
|
|
"epoch": 0.09007417873542918,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999999681769696,
|
|
"loss": 6.9612,
|
|
"mean_token_accuracy": 0.1163177601993084,
|
|
"num_tokens": 1889241.0,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"entropy": 6.971645736694336,
|
|
"epoch": 0.09051571882726951,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000499999949224196,
|
|
"loss": 6.969,
|
|
"mean_token_accuracy": 0.11504201143980027,
|
|
"num_tokens": 1898247.0,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"entropy": 7.006195020675659,
|
|
"epoch": 0.09095725891910986,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999999258638013,
|
|
"loss": 6.9244,
|
|
"mean_token_accuracy": 0.11498644798994065,
|
|
"num_tokens": 1907559.0,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"entropy": 6.903467321395874,
|
|
"epoch": 0.09139879901095019,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999998980957861,
|
|
"loss": 6.924,
|
|
"mean_token_accuracy": 0.11912157312035561,
|
|
"num_tokens": 1917072.0,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"entropy": 6.983310127258301,
|
|
"epoch": 0.09184033910279053,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004999998659201508,
|
|
"loss": 6.8753,
|
|
"mean_token_accuracy": 0.11874028518795968,
|
|
"num_tokens": 1926597.0,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"entropy": 7.006656980514526,
|
|
"epoch": 0.09228187919463088,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999998293368961,
|
|
"loss": 6.918,
|
|
"mean_token_accuracy": 0.11443031057715417,
|
|
"num_tokens": 1935978.0,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"entropy": 6.924140882492066,
|
|
"epoch": 0.09272341928647121,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999997883460227,
|
|
"loss": 6.864,
|
|
"mean_token_accuracy": 0.1171707384288311,
|
|
"num_tokens": 1944424.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"entropy": 6.984135913848877,
|
|
"epoch": 0.09316495937831155,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999997429475314,
|
|
"loss": 6.8384,
|
|
"mean_token_accuracy": 0.12138021439313888,
|
|
"num_tokens": 1953844.0,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"entropy": 6.947112941741944,
|
|
"epoch": 0.09360649947015189,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999996931414232,
|
|
"loss": 6.9207,
|
|
"mean_token_accuracy": 0.12120825350284577,
|
|
"num_tokens": 1963974.0,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"entropy": 6.977735805511474,
|
|
"epoch": 0.09404803956199223,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999996389276988,
|
|
"loss": 6.8969,
|
|
"mean_token_accuracy": 0.12291403263807296,
|
|
"num_tokens": 1973466.0,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"entropy": 6.803595685958863,
|
|
"epoch": 0.09448957965383256,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999995803063596,
|
|
"loss": 6.918,
|
|
"mean_token_accuracy": 0.12123456448316575,
|
|
"num_tokens": 1983478.0,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"entropy": 7.003172111511231,
|
|
"epoch": 0.0949311197456729,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999995172774065,
|
|
"loss": 6.9879,
|
|
"mean_token_accuracy": 0.11715293675661087,
|
|
"num_tokens": 1992775.0,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"entropy": 7.030760860443115,
|
|
"epoch": 0.09537265983751325,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999994498408408,
|
|
"loss": 6.9419,
|
|
"mean_token_accuracy": 0.11398354098200798,
|
|
"num_tokens": 2002526.0,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"entropy": 6.936420059204101,
|
|
"epoch": 0.09581419992935358,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999993779966639,
|
|
"loss": 6.9592,
|
|
"mean_token_accuracy": 0.11412434950470925,
|
|
"num_tokens": 2012476.0,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"entropy": 6.838730955123902,
|
|
"epoch": 0.09625574002119393,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004999993017448771,
|
|
"loss": 6.7924,
|
|
"mean_token_accuracy": 0.13063410446047782,
|
|
"num_tokens": 2021252.0,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"entropy": 7.061969757080078,
|
|
"epoch": 0.09669728011303426,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999992210854821,
|
|
"loss": 6.9411,
|
|
"mean_token_accuracy": 0.11837697625160218,
|
|
"num_tokens": 2031438.0,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"entropy": 6.9324125289917,
|
|
"epoch": 0.0971388202048746,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999991360184801,
|
|
"loss": 6.9789,
|
|
"mean_token_accuracy": 0.11443927884101868,
|
|
"num_tokens": 2041319.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"entropy": 6.95938081741333,
|
|
"epoch": 0.09758036029671494,
|
|
"grad_norm": 0.859375,
|
|
"learning_rate": 0.0004999990465438731,
|
|
"loss": 6.9746,
|
|
"mean_token_accuracy": 0.11487890034914017,
|
|
"num_tokens": 2052060.0,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"entropy": 6.891486358642578,
|
|
"epoch": 0.09802190038855528,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004999989526616628,
|
|
"loss": 6.8643,
|
|
"mean_token_accuracy": 0.12554761841893197,
|
|
"num_tokens": 2061331.0,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"entropy": 6.952145195007324,
|
|
"epoch": 0.09846344048039563,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999988543718509,
|
|
"loss": 6.8733,
|
|
"mean_token_accuracy": 0.11660940647125244,
|
|
"num_tokens": 2070006.0,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"entropy": 7.005694484710693,
|
|
"epoch": 0.09890498057223596,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004999987516744394,
|
|
"loss": 6.8403,
|
|
"mean_token_accuracy": 0.12942354679107665,
|
|
"num_tokens": 2079089.0,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"entropy": 6.814391231536865,
|
|
"epoch": 0.0993465206640763,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999986445694303,
|
|
"loss": 6.8164,
|
|
"mean_token_accuracy": 0.12300374433398246,
|
|
"num_tokens": 2087237.0,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"entropy": 6.877581930160522,
|
|
"epoch": 0.09978806075591663,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999985330568258,
|
|
"loss": 6.7838,
|
|
"mean_token_accuracy": 0.12274843603372573,
|
|
"num_tokens": 2096598.0,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"entropy": 6.867826128005982,
|
|
"epoch": 0.10022960084775698,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999984171366278,
|
|
"loss": 6.8802,
|
|
"mean_token_accuracy": 0.11098882853984833,
|
|
"num_tokens": 2106106.0,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"entropy": 6.928639554977417,
|
|
"epoch": 0.10067114093959731,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.000499998296808839,
|
|
"loss": 6.856,
|
|
"mean_token_accuracy": 0.11591004803776742,
|
|
"num_tokens": 2115478.0,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"entropy": 6.942514848709107,
|
|
"epoch": 0.10111268103143765,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004999981720734615,
|
|
"loss": 6.9007,
|
|
"mean_token_accuracy": 0.11097749546170235,
|
|
"num_tokens": 2124439.0,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"entropy": 6.928429985046387,
|
|
"epoch": 0.101554221123278,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999980429304977,
|
|
"loss": 6.8983,
|
|
"mean_token_accuracy": 0.11434343308210373,
|
|
"num_tokens": 2133718.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"entropy": 6.835229969024658,
|
|
"epoch": 0.10199576121511833,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999979093799502,
|
|
"loss": 6.801,
|
|
"mean_token_accuracy": 0.12133207321166992,
|
|
"num_tokens": 2142978.0,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"entropy": 6.8165655612945555,
|
|
"epoch": 0.10243730130695868,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999977714218217,
|
|
"loss": 6.863,
|
|
"mean_token_accuracy": 0.117961073666811,
|
|
"num_tokens": 2152250.0,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"entropy": 6.972959232330322,
|
|
"epoch": 0.102878841398799,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004999976290561147,
|
|
"loss": 6.8608,
|
|
"mean_token_accuracy": 0.11624824330210685,
|
|
"num_tokens": 2161620.0,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"entropy": 6.921932792663574,
|
|
"epoch": 0.10332038149063935,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004999974822828322,
|
|
"loss": 6.8726,
|
|
"mean_token_accuracy": 0.1173239678144455,
|
|
"num_tokens": 2170856.0,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"entropy": 6.885772609710694,
|
|
"epoch": 0.10376192158247968,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.000499997331101977,
|
|
"loss": 6.8626,
|
|
"mean_token_accuracy": 0.11568826138973236,
|
|
"num_tokens": 2180926.0,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"entropy": 6.911135244369507,
|
|
"epoch": 0.10420346167432003,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000499997175513552,
|
|
"loss": 6.8419,
|
|
"mean_token_accuracy": 0.11412648186087608,
|
|
"num_tokens": 2190248.0,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"entropy": 6.86896710395813,
|
|
"epoch": 0.10464500176616037,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999970155175603,
|
|
"loss": 6.855,
|
|
"mean_token_accuracy": 0.12222710996866226,
|
|
"num_tokens": 2199833.0,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"entropy": 6.856761360168457,
|
|
"epoch": 0.1050865418580007,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.000499996851114005,
|
|
"loss": 6.8115,
|
|
"mean_token_accuracy": 0.12742498219013215,
|
|
"num_tokens": 2208240.0,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"entropy": 6.837878751754761,
|
|
"epoch": 0.10552808194984105,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004999966823028894,
|
|
"loss": 6.8776,
|
|
"mean_token_accuracy": 0.11117666661739349,
|
|
"num_tokens": 2218758.0,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"entropy": 6.94004077911377,
|
|
"epoch": 0.10596962204168138,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004999965090842168,
|
|
"loss": 6.8665,
|
|
"mean_token_accuracy": 0.12320348769426345,
|
|
"num_tokens": 2228443.0,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"entropy": 6.74250750541687,
|
|
"epoch": 0.10641116213352173,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999963314579905,
|
|
"loss": 6.7084,
|
|
"mean_token_accuracy": 0.1319122113287449,
|
|
"num_tokens": 2236787.0,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"entropy": 6.956686353683471,
|
|
"epoch": 0.10685270222536206,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999961494242139,
|
|
"loss": 6.8901,
|
|
"mean_token_accuracy": 0.11468368023633957,
|
|
"num_tokens": 2247089.0,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"entropy": 6.789958381652832,
|
|
"epoch": 0.1072942423172024,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999959629828908,
|
|
"loss": 6.816,
|
|
"mean_token_accuracy": 0.11569953635334969,
|
|
"num_tokens": 2256045.0,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"entropy": 6.929527616500854,
|
|
"epoch": 0.10773578240904275,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999957721340248,
|
|
"loss": 6.8769,
|
|
"mean_token_accuracy": 0.12202595993876457,
|
|
"num_tokens": 2265835.0,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"entropy": 6.807823610305786,
|
|
"epoch": 0.10817732250088308,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004999955768776195,
|
|
"loss": 6.8076,
|
|
"mean_token_accuracy": 0.11702087000012398,
|
|
"num_tokens": 2275318.0,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"entropy": 6.869472360610962,
|
|
"epoch": 0.10861886259272342,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999953772136788,
|
|
"loss": 6.7978,
|
|
"mean_token_accuracy": 0.12102394551038742,
|
|
"num_tokens": 2284821.0,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"entropy": 6.885239315032959,
|
|
"epoch": 0.10906040268456375,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999951731422068,
|
|
"loss": 6.7645,
|
|
"mean_token_accuracy": 0.12054353281855583,
|
|
"num_tokens": 2294013.0,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"entropy": 6.847450494766235,
|
|
"epoch": 0.1095019427764041,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999949646632072,
|
|
"loss": 6.7626,
|
|
"mean_token_accuracy": 0.12177001982927323,
|
|
"num_tokens": 2302727.0,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"entropy": 6.8077342987060545,
|
|
"epoch": 0.10994348286824443,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999947517766842,
|
|
"loss": 6.8031,
|
|
"mean_token_accuracy": 0.12163913846015931,
|
|
"num_tokens": 2312032.0,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"entropy": 6.986685180664063,
|
|
"epoch": 0.11038502296008477,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000499994534482642,
|
|
"loss": 6.8748,
|
|
"mean_token_accuracy": 0.1190544456243515,
|
|
"num_tokens": 2321839.0,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"entropy": 6.811014032363891,
|
|
"epoch": 0.11082656305192512,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004999943127810847,
|
|
"loss": 6.8536,
|
|
"mean_token_accuracy": 0.1122577242553234,
|
|
"num_tokens": 2331255.0,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"entropy": 6.794656276702881,
|
|
"epoch": 0.11126810314376545,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999940866720169,
|
|
"loss": 6.6705,
|
|
"mean_token_accuracy": 0.12881582453846932,
|
|
"num_tokens": 2340038.0,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"entropy": 6.71243953704834,
|
|
"epoch": 0.1117096432356058,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999938561554429,
|
|
"loss": 6.7797,
|
|
"mean_token_accuracy": 0.12242485880851746,
|
|
"num_tokens": 2348901.0,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"entropy": 6.858448314666748,
|
|
"epoch": 0.11215118332744613,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004999936212313672,
|
|
"loss": 6.8659,
|
|
"mean_token_accuracy": 0.11461173072457313,
|
|
"num_tokens": 2358842.0,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"entropy": 6.8239977836608885,
|
|
"epoch": 0.11259272341928647,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999933818997943,
|
|
"loss": 6.7596,
|
|
"mean_token_accuracy": 0.12424605414271354,
|
|
"num_tokens": 2368650.0,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"entropy": 6.831825399398804,
|
|
"epoch": 0.1130342635111268,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999931381607292,
|
|
"loss": 6.8252,
|
|
"mean_token_accuracy": 0.12058763056993485,
|
|
"num_tokens": 2377916.0,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"entropy": 6.818245553970337,
|
|
"epoch": 0.11347580360296715,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999928900141764,
|
|
"loss": 6.7698,
|
|
"mean_token_accuracy": 0.12198482304811478,
|
|
"num_tokens": 2387507.0,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"entropy": 6.819052505493164,
|
|
"epoch": 0.1139173436948075,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000499992637460141,
|
|
"loss": 6.8052,
|
|
"mean_token_accuracy": 0.12523134648799897,
|
|
"num_tokens": 2396148.0,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"entropy": 6.816552209854126,
|
|
"epoch": 0.11435888378664782,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999923804986275,
|
|
"loss": 6.693,
|
|
"mean_token_accuracy": 0.11803872361779214,
|
|
"num_tokens": 2404891.0,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"entropy": 6.840794086456299,
|
|
"epoch": 0.11480042387848817,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999921191296415,
|
|
"loss": 6.7153,
|
|
"mean_token_accuracy": 0.12199744880199433,
|
|
"num_tokens": 2414406.0,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"entropy": 6.7452630519866945,
|
|
"epoch": 0.1152419639703285,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999918533531877,
|
|
"loss": 6.8046,
|
|
"mean_token_accuracy": 0.1228412576019764,
|
|
"num_tokens": 2424363.0,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"entropy": 6.852883148193359,
|
|
"epoch": 0.11568350406216885,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999915831692714,
|
|
"loss": 6.7419,
|
|
"mean_token_accuracy": 0.1251549780368805,
|
|
"num_tokens": 2433753.0,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"entropy": 6.7218766689300535,
|
|
"epoch": 0.11612504415400918,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999913085778981,
|
|
"loss": 6.7685,
|
|
"mean_token_accuracy": 0.1185051940381527,
|
|
"num_tokens": 2443275.0,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"entropy": 6.873374080657959,
|
|
"epoch": 0.11656658424584952,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999910295790729,
|
|
"loss": 6.7937,
|
|
"mean_token_accuracy": 0.11835979968309403,
|
|
"num_tokens": 2452510.0,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"entropy": 6.8684648990631105,
|
|
"epoch": 0.11700812433768987,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999907461728014,
|
|
"loss": 6.8746,
|
|
"mean_token_accuracy": 0.1169828750193119,
|
|
"num_tokens": 2462742.0,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"entropy": 6.740426445007325,
|
|
"epoch": 0.1174496644295302,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999904583590893,
|
|
"loss": 6.7434,
|
|
"mean_token_accuracy": 0.12029099017381668,
|
|
"num_tokens": 2471409.0,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"entropy": 6.839800691604614,
|
|
"epoch": 0.11789120452137054,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004999901661379418,
|
|
"loss": 6.6931,
|
|
"mean_token_accuracy": 0.12773663252592088,
|
|
"num_tokens": 2481011.0,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"entropy": 6.690527105331421,
|
|
"epoch": 0.11833274461321087,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004999898695093652,
|
|
"loss": 6.7866,
|
|
"mean_token_accuracy": 0.12104339599609375,
|
|
"num_tokens": 2490664.0,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"entropy": 6.818962049484253,
|
|
"epoch": 0.11877428470505122,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999895684733648,
|
|
"loss": 6.7279,
|
|
"mean_token_accuracy": 0.12824407517910003,
|
|
"num_tokens": 2499799.0,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"entropy": 6.8539299964904785,
|
|
"epoch": 0.11921582479689156,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004999892630299467,
|
|
"loss": 6.7045,
|
|
"mean_token_accuracy": 0.1257259279489517,
|
|
"num_tokens": 2508780.0,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"entropy": 6.706318616867065,
|
|
"epoch": 0.1196573648887319,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999889531791171,
|
|
"loss": 6.7138,
|
|
"mean_token_accuracy": 0.12127138078212737,
|
|
"num_tokens": 2517741.0,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"entropy": 6.766215896606445,
|
|
"epoch": 0.12009890498057224,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999886389208817,
|
|
"loss": 6.7972,
|
|
"mean_token_accuracy": 0.11826895922422409,
|
|
"num_tokens": 2528742.0,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"entropy": 6.717579746246338,
|
|
"epoch": 0.12054044507241257,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0004999883202552468,
|
|
"loss": 6.7345,
|
|
"mean_token_accuracy": 0.12455343306064606,
|
|
"num_tokens": 2538609.0,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"entropy": 6.882754898071289,
|
|
"epoch": 0.12098198516425292,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999879971822189,
|
|
"loss": 6.7157,
|
|
"mean_token_accuracy": 0.11966117843985558,
|
|
"num_tokens": 2547772.0,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"entropy": 6.69037971496582,
|
|
"epoch": 0.12142352525609325,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999876697018038,
|
|
"loss": 6.6897,
|
|
"mean_token_accuracy": 0.12769502475857736,
|
|
"num_tokens": 2556114.0,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"entropy": 6.8372406482696535,
|
|
"epoch": 0.1218650653479336,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999873378140085,
|
|
"loss": 6.7182,
|
|
"mean_token_accuracy": 0.12253274098038673,
|
|
"num_tokens": 2566814.0,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"entropy": 6.714360284805298,
|
|
"epoch": 0.12230660543977394,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999870015188389,
|
|
"loss": 6.6914,
|
|
"mean_token_accuracy": 0.12260655164718628,
|
|
"num_tokens": 2576030.0,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"entropy": 6.743939208984375,
|
|
"epoch": 0.12274814553161427,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999866608163021,
|
|
"loss": 6.7176,
|
|
"mean_token_accuracy": 0.1260794699192047,
|
|
"num_tokens": 2585756.0,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"entropy": 6.800289154052734,
|
|
"epoch": 0.12318968562345461,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999863157064045,
|
|
"loss": 6.7797,
|
|
"mean_token_accuracy": 0.1238692507147789,
|
|
"num_tokens": 2595676.0,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"entropy": 6.8109955310821535,
|
|
"epoch": 0.12363122571529495,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999859661891529,
|
|
"loss": 6.7624,
|
|
"mean_token_accuracy": 0.1246532566845417,
|
|
"num_tokens": 2606197.0,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"entropy": 6.76338677406311,
|
|
"epoch": 0.12407276580713529,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999856122645543,
|
|
"loss": 6.6854,
|
|
"mean_token_accuracy": 0.12518818601965903,
|
|
"num_tokens": 2615311.0,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"entropy": 6.725164937973022,
|
|
"epoch": 0.12451430589897562,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999852539326154,
|
|
"loss": 6.5931,
|
|
"mean_token_accuracy": 0.12619537115097046,
|
|
"num_tokens": 2624074.0,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"entropy": 6.681712102890015,
|
|
"epoch": 0.12495584599081597,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999848911933434,
|
|
"loss": 6.7411,
|
|
"mean_token_accuracy": 0.13295672461390495,
|
|
"num_tokens": 2633877.0,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"entropy": 6.7498420715332035,
|
|
"epoch": 0.1253973860826563,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0004999845240467453,
|
|
"loss": 6.6702,
|
|
"mean_token_accuracy": 0.12471742331981658,
|
|
"num_tokens": 2643330.0,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"entropy": 6.74535722732544,
|
|
"epoch": 0.12583892617449666,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999841524928282,
|
|
"loss": 6.6543,
|
|
"mean_token_accuracy": 0.12766205966472627,
|
|
"num_tokens": 2652070.0,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"entropy": 6.722480249404907,
|
|
"epoch": 0.126280466266337,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999837765315997,
|
|
"loss": 6.697,
|
|
"mean_token_accuracy": 0.1277957484126091,
|
|
"num_tokens": 2660546.0,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"entropy": 6.598982477188111,
|
|
"epoch": 0.12672200635817732,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0004999833961630669,
|
|
"loss": 6.5999,
|
|
"mean_token_accuracy": 0.1297621488571167,
|
|
"num_tokens": 2669938.0,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"entropy": 6.852873420715332,
|
|
"epoch": 0.12716354645001765,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0004999830113872374,
|
|
"loss": 6.7248,
|
|
"mean_token_accuracy": 0.12213384285569191,
|
|
"num_tokens": 2679814.0,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"entropy": 6.585267829895019,
|
|
"epoch": 0.127605086541858,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999826222041186,
|
|
"loss": 6.6355,
|
|
"mean_token_accuracy": 0.13383011817932128,
|
|
"num_tokens": 2688733.0,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"entropy": 6.706393527984619,
|
|
"epoch": 0.12804662663369834,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999822286137182,
|
|
"loss": 6.6188,
|
|
"mean_token_accuracy": 0.12774292454123498,
|
|
"num_tokens": 2697744.0,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"entropy": 6.633390617370606,
|
|
"epoch": 0.12848816672553867,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999818306160439,
|
|
"loss": 6.5827,
|
|
"mean_token_accuracy": 0.13278514444828032,
|
|
"num_tokens": 2707037.0,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"entropy": 6.808015394210815,
|
|
"epoch": 0.12892970681737903,
|
|
"grad_norm": 0.90625,
|
|
"learning_rate": 0.0004999814282111034,
|
|
"loss": 6.7453,
|
|
"mean_token_accuracy": 0.12486135885119438,
|
|
"num_tokens": 2717345.0,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"entropy": 6.7304778575897215,
|
|
"epoch": 0.12937124690921936,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999810213989047,
|
|
"loss": 6.7017,
|
|
"mean_token_accuracy": 0.12036777138710023,
|
|
"num_tokens": 2726892.0,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"entropy": 6.667453098297119,
|
|
"epoch": 0.1298127870010597,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999806101794558,
|
|
"loss": 6.6615,
|
|
"mean_token_accuracy": 0.12705308422446251,
|
|
"num_tokens": 2736479.0,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"entropy": 6.809973049163818,
|
|
"epoch": 0.13025432709290002,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999801945527648,
|
|
"loss": 6.7078,
|
|
"mean_token_accuracy": 0.12117967531085014,
|
|
"num_tokens": 2745998.0,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"entropy": 6.725075721740723,
|
|
"epoch": 0.13069586718474038,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999797745188395,
|
|
"loss": 6.6906,
|
|
"mean_token_accuracy": 0.12821464985609055,
|
|
"num_tokens": 2754346.0,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"entropy": 6.686868619918823,
|
|
"epoch": 0.13113740727658071,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999793500776886,
|
|
"loss": 6.6285,
|
|
"mean_token_accuracy": 0.1294437274336815,
|
|
"num_tokens": 2763391.0,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"entropy": 6.674964761734008,
|
|
"epoch": 0.13157894736842105,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999789212293201,
|
|
"loss": 6.6827,
|
|
"mean_token_accuracy": 0.12898893728852273,
|
|
"num_tokens": 2772764.0,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"entropy": 6.775622892379761,
|
|
"epoch": 0.1320204874602614,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004999784879737423,
|
|
"loss": 6.78,
|
|
"mean_token_accuracy": 0.12160285860300064,
|
|
"num_tokens": 2782312.0,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"entropy": 6.745266675949097,
|
|
"epoch": 0.13246202755210174,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999780503109642,
|
|
"loss": 6.6798,
|
|
"mean_token_accuracy": 0.1227384127676487,
|
|
"num_tokens": 2791159.0,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"entropy": 6.650141859054566,
|
|
"epoch": 0.13290356764394207,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999776082409939,
|
|
"loss": 6.5068,
|
|
"mean_token_accuracy": 0.13456878885626794,
|
|
"num_tokens": 2799319.0,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"entropy": 6.6320771217346195,
|
|
"epoch": 0.1333451077357824,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999771617638401,
|
|
"loss": 6.6316,
|
|
"mean_token_accuracy": 0.12712259590625763,
|
|
"num_tokens": 2807401.0,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"entropy": 6.758873414993286,
|
|
"epoch": 0.13378664782762276,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999767108795118,
|
|
"loss": 6.6961,
|
|
"mean_token_accuracy": 0.12330949455499648,
|
|
"num_tokens": 2817734.0,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"entropy": 6.644004774093628,
|
|
"epoch": 0.1342281879194631,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999762555880176,
|
|
"loss": 6.6783,
|
|
"mean_token_accuracy": 0.12152940481901169,
|
|
"num_tokens": 2828235.0,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"entropy": 6.743390846252441,
|
|
"epoch": 0.13466972801130342,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999757958893666,
|
|
"loss": 6.7124,
|
|
"mean_token_accuracy": 0.1237283930182457,
|
|
"num_tokens": 2837453.0,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"entropy": 6.60786280632019,
|
|
"epoch": 0.13511126810314378,
|
|
"grad_norm": 0.9453125,
|
|
"learning_rate": 0.0004999753317835677,
|
|
"loss": 6.6795,
|
|
"mean_token_accuracy": 0.12087962031364441,
|
|
"num_tokens": 2847055.0,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"entropy": 6.7429241180419925,
|
|
"epoch": 0.1355528081949841,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999748632706299,
|
|
"loss": 6.6568,
|
|
"mean_token_accuracy": 0.13167096227407454,
|
|
"num_tokens": 2857101.0,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"entropy": 6.673015403747558,
|
|
"epoch": 0.13599434828682444,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999743903505626,
|
|
"loss": 6.553,
|
|
"mean_token_accuracy": 0.1336723633110523,
|
|
"num_tokens": 2866685.0,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"entropy": 6.676457214355469,
|
|
"epoch": 0.13643588837866477,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999739130233749,
|
|
"loss": 6.6731,
|
|
"mean_token_accuracy": 0.12713466510176658,
|
|
"num_tokens": 2876022.0,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"entropy": 6.783061361312866,
|
|
"epoch": 0.13687742847050513,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999734312890761,
|
|
"loss": 6.6062,
|
|
"mean_token_accuracy": 0.12626957073807715,
|
|
"num_tokens": 2885560.0,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"entropy": 6.59228024482727,
|
|
"epoch": 0.13731896856234546,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004999729451476757,
|
|
"loss": 6.6439,
|
|
"mean_token_accuracy": 0.12623701319098474,
|
|
"num_tokens": 2894686.0,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"entropy": 6.730476140975952,
|
|
"epoch": 0.1377605086541858,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999724545991835,
|
|
"loss": 6.6588,
|
|
"mean_token_accuracy": 0.13341889455914496,
|
|
"num_tokens": 2904390.0,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"entropy": 6.662082195281982,
|
|
"epoch": 0.13820204874602615,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999719596436086,
|
|
"loss": 6.6982,
|
|
"mean_token_accuracy": 0.12678939029574393,
|
|
"num_tokens": 2913311.0,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"entropy": 6.736054372787476,
|
|
"epoch": 0.13864358883786648,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004999714602809611,
|
|
"loss": 6.578,
|
|
"mean_token_accuracy": 0.13448369055986403,
|
|
"num_tokens": 2923196.0,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"entropy": 6.637621927261352,
|
|
"epoch": 0.1390851289297068,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999709565112506,
|
|
"loss": 6.6428,
|
|
"mean_token_accuracy": 0.12785085365176202,
|
|
"num_tokens": 2932813.0,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"entropy": 6.658017587661743,
|
|
"epoch": 0.13952666902154715,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.000499970448334487,
|
|
"loss": 6.5634,
|
|
"mean_token_accuracy": 0.119098000228405,
|
|
"num_tokens": 2942694.0,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"entropy": 6.5801304340362545,
|
|
"epoch": 0.1399682091133875,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999699357506803,
|
|
"loss": 6.5185,
|
|
"mean_token_accuracy": 0.1330326519906521,
|
|
"num_tokens": 2950932.0,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"entropy": 6.701116371154785,
|
|
"epoch": 0.14040974920522784,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999694187598406,
|
|
"loss": 6.6085,
|
|
"mean_token_accuracy": 0.1314128704369068,
|
|
"num_tokens": 2960102.0,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"entropy": 6.627189350128174,
|
|
"epoch": 0.14085128929706817,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004999688973619777,
|
|
"loss": 6.6088,
|
|
"mean_token_accuracy": 0.12604895159602164,
|
|
"num_tokens": 2969968.0,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"entropy": 6.588514184951782,
|
|
"epoch": 0.14129282938890853,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004999683715571022,
|
|
"loss": 6.5542,
|
|
"mean_token_accuracy": 0.1347218669950962,
|
|
"num_tokens": 2978880.0,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"entropy": 6.644829893112183,
|
|
"epoch": 0.14173436948074886,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004999678413452242,
|
|
"loss": 6.5863,
|
|
"mean_token_accuracy": 0.12890450209379195,
|
|
"num_tokens": 2988369.0,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"entropy": 6.667511320114135,
|
|
"epoch": 0.1421759095725892,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004999673067263542,
|
|
"loss": 6.6373,
|
|
"mean_token_accuracy": 0.12620072290301323,
|
|
"num_tokens": 2997070.0,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"entropy": 6.682367372512817,
|
|
"epoch": 0.14261744966442952,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999667677005026,
|
|
"loss": 6.5749,
|
|
"mean_token_accuracy": 0.13310741856694222,
|
|
"num_tokens": 3006547.0,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"entropy": 6.584122562408448,
|
|
"epoch": 0.14305898975626988,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004999662242676799,
|
|
"loss": 6.5986,
|
|
"mean_token_accuracy": 0.1310425490140915,
|
|
"num_tokens": 3015821.0,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"entropy": 6.665965223312378,
|
|
"epoch": 0.1435005298481102,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999656764278968,
|
|
"loss": 6.5655,
|
|
"mean_token_accuracy": 0.1309148021042347,
|
|
"num_tokens": 3024750.0,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"entropy": 6.623423147201538,
|
|
"epoch": 0.14394206993995054,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999651241811642,
|
|
"loss": 6.5389,
|
|
"mean_token_accuracy": 0.1308048278093338,
|
|
"num_tokens": 3033345.0,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"entropy": 6.59770941734314,
|
|
"epoch": 0.1443836100317909,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999645675274925,
|
|
"loss": 6.5209,
|
|
"mean_token_accuracy": 0.13443350791931152,
|
|
"num_tokens": 3042060.0,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"entropy": 6.660798358917236,
|
|
"epoch": 0.14482515012363123,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999640064668931,
|
|
"loss": 6.6684,
|
|
"mean_token_accuracy": 0.12563745751976968,
|
|
"num_tokens": 3052490.0,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"entropy": 6.726674222946167,
|
|
"epoch": 0.14526669021547156,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999634409993766,
|
|
"loss": 6.6441,
|
|
"mean_token_accuracy": 0.12772516757249833,
|
|
"num_tokens": 3061934.0,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"entropy": 6.556687307357788,
|
|
"epoch": 0.1457082303073119,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999628711249544,
|
|
"loss": 6.5591,
|
|
"mean_token_accuracy": 0.13611432090401648,
|
|
"num_tokens": 3070890.0,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"entropy": 6.703302097320557,
|
|
"epoch": 0.14614977039915225,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 0.0004999622968436373,
|
|
"loss": 6.5614,
|
|
"mean_token_accuracy": 0.12631918862462044,
|
|
"num_tokens": 3079933.0,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"entropy": 6.5565461158752445,
|
|
"epoch": 0.14659131049099258,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999617181554369,
|
|
"loss": 6.6078,
|
|
"mean_token_accuracy": 0.13254086449742317,
|
|
"num_tokens": 3089910.0,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"entropy": 6.718549633026123,
|
|
"epoch": 0.1470328505828329,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004999611350603643,
|
|
"loss": 6.5916,
|
|
"mean_token_accuracy": 0.13437702059745787,
|
|
"num_tokens": 3098676.0,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"entropy": 6.595283174514771,
|
|
"epoch": 0.14747439067467327,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000499960547558431,
|
|
"loss": 6.7159,
|
|
"mean_token_accuracy": 0.1256335400044918,
|
|
"num_tokens": 3109055.0,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"entropy": 6.624332904815674,
|
|
"epoch": 0.1479159307665136,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999599556496486,
|
|
"loss": 6.5472,
|
|
"mean_token_accuracy": 0.13295547068119049,
|
|
"num_tokens": 3117517.0,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"entropy": 6.573185825347901,
|
|
"epoch": 0.14835747085835393,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999593593340286,
|
|
"loss": 6.5544,
|
|
"mean_token_accuracy": 0.13249536529183387,
|
|
"num_tokens": 3126606.0,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"entropy": 6.704092359542846,
|
|
"epoch": 0.14879901095019427,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004999587586115826,
|
|
"loss": 6.5733,
|
|
"mean_token_accuracy": 0.12993996366858482,
|
|
"num_tokens": 3135444.0,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"entropy": 6.57456955909729,
|
|
"epoch": 0.14924055104203462,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0004999581534823226,
|
|
"loss": 6.4927,
|
|
"mean_token_accuracy": 0.14061653688549997,
|
|
"num_tokens": 3144967.0,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"entropy": 6.607442092895508,
|
|
"epoch": 0.14968209113387496,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999575439462601,
|
|
"loss": 6.522,
|
|
"mean_token_accuracy": 0.12690635845065118,
|
|
"num_tokens": 3153898.0,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"entropy": 6.536995601654053,
|
|
"epoch": 0.1501236312257153,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999569300034075,
|
|
"loss": 6.5921,
|
|
"mean_token_accuracy": 0.12649724259972572,
|
|
"num_tokens": 3162341.0,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"entropy": 6.60729718208313,
|
|
"epoch": 0.15056517131755565,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999563116537764,
|
|
"loss": 6.3826,
|
|
"mean_token_accuracy": 0.14100785404443741,
|
|
"num_tokens": 3171420.0,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"entropy": 6.54091591835022,
|
|
"epoch": 0.15100671140939598,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999556888973792,
|
|
"loss": 6.4366,
|
|
"mean_token_accuracy": 0.13033056780695915,
|
|
"num_tokens": 3180221.0,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"entropy": 6.6625391960144045,
|
|
"epoch": 0.1514482515012363,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999550617342279,
|
|
"loss": 6.587,
|
|
"mean_token_accuracy": 0.12798949852585792,
|
|
"num_tokens": 3189156.0,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"entropy": 6.539492607116699,
|
|
"epoch": 0.15188979159307664,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499954430164335,
|
|
"loss": 6.5149,
|
|
"mean_token_accuracy": 0.13213447630405425,
|
|
"num_tokens": 3199870.0,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"entropy": 6.559759998321534,
|
|
"epoch": 0.152331331684917,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999537941877127,
|
|
"loss": 6.4883,
|
|
"mean_token_accuracy": 0.13440720960497857,
|
|
"num_tokens": 3208815.0,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"entropy": 6.604632616043091,
|
|
"epoch": 0.15277287177675733,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0004999531538043735,
|
|
"loss": 6.5717,
|
|
"mean_token_accuracy": 0.13279605731368066,
|
|
"num_tokens": 3218692.0,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"entropy": 6.584544372558594,
|
|
"epoch": 0.15321441186859766,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999525090143298,
|
|
"loss": 6.5367,
|
|
"mean_token_accuracy": 0.14099612906575204,
|
|
"num_tokens": 3227604.0,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"entropy": 6.629879665374756,
|
|
"epoch": 0.15365595196043802,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999518598175946,
|
|
"loss": 6.5446,
|
|
"mean_token_accuracy": 0.14111834466457368,
|
|
"num_tokens": 3237912.0,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"entropy": 6.453984022140503,
|
|
"epoch": 0.15409749205227835,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999512062141805,
|
|
"loss": 6.4239,
|
|
"mean_token_accuracy": 0.13701695203781128,
|
|
"num_tokens": 3246003.0,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"entropy": 6.6436468124389645,
|
|
"epoch": 0.15453903214411868,
|
|
"grad_norm": 0.85546875,
|
|
"learning_rate": 0.0004999505482040999,
|
|
"loss": 6.5558,
|
|
"mean_token_accuracy": 0.13121686428785323,
|
|
"num_tokens": 3256363.0,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"entropy": 6.583334445953369,
|
|
"epoch": 0.154980572235959,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999498857873662,
|
|
"loss": 6.5518,
|
|
"mean_token_accuracy": 0.13320463374257088,
|
|
"num_tokens": 3265822.0,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"entropy": 6.478305387496948,
|
|
"epoch": 0.15542211232779937,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999492189639921,
|
|
"loss": 6.4224,
|
|
"mean_token_accuracy": 0.13528963476419448,
|
|
"num_tokens": 3274614.0,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"entropy": 6.61256685256958,
|
|
"epoch": 0.1558636524196397,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999485477339907,
|
|
"loss": 6.5201,
|
|
"mean_token_accuracy": 0.13317029252648355,
|
|
"num_tokens": 3283800.0,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"entropy": 6.526872968673706,
|
|
"epoch": 0.15630519251148003,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999478720973753,
|
|
"loss": 6.4319,
|
|
"mean_token_accuracy": 0.14009243845939637,
|
|
"num_tokens": 3293221.0,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"entropy": 6.5570995807647705,
|
|
"epoch": 0.1567467326033204,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.000499947192054159,
|
|
"loss": 6.6029,
|
|
"mean_token_accuracy": 0.1324251540005207,
|
|
"num_tokens": 3302852.0,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"entropy": 6.650819587707519,
|
|
"epoch": 0.15718827269516072,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000499946507604355,
|
|
"loss": 6.4641,
|
|
"mean_token_accuracy": 0.14009604677557946,
|
|
"num_tokens": 3311752.0,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"entropy": 6.473453664779663,
|
|
"epoch": 0.15762981278700106,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499945818747977,
|
|
"loss": 6.5492,
|
|
"mean_token_accuracy": 0.13722263872623444,
|
|
"num_tokens": 3321339.0,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"entropy": 6.671073293685913,
|
|
"epoch": 0.1580713528788414,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999451254850383,
|
|
"loss": 6.5514,
|
|
"mean_token_accuracy": 0.1324629843235016,
|
|
"num_tokens": 3330269.0,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"entropy": 6.550148677825928,
|
|
"epoch": 0.15851289297068175,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999444278155525,
|
|
"loss": 6.4576,
|
|
"mean_token_accuracy": 0.1376182422041893,
|
|
"num_tokens": 3340770.0,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"entropy": 6.544644594192505,
|
|
"epoch": 0.15895443306252208,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999437257395333,
|
|
"loss": 6.5753,
|
|
"mean_token_accuracy": 0.13476464301347732,
|
|
"num_tokens": 3349976.0,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"entropy": 6.597910165786743,
|
|
"epoch": 0.1593959731543624,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999430192569944,
|
|
"loss": 6.6158,
|
|
"mean_token_accuracy": 0.12608520165085793,
|
|
"num_tokens": 3359764.0,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"entropy": 6.539244031906128,
|
|
"epoch": 0.15983751324620277,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999423083679498,
|
|
"loss": 6.4868,
|
|
"mean_token_accuracy": 0.13115186169743537,
|
|
"num_tokens": 3369939.0,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"entropy": 6.639003801345825,
|
|
"epoch": 0.1602790533380431,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999415930724133,
|
|
"loss": 6.6346,
|
|
"mean_token_accuracy": 0.12994891554117202,
|
|
"num_tokens": 3381326.0,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"entropy": 6.659589242935181,
|
|
"epoch": 0.16072059342988343,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004999408733703988,
|
|
"loss": 6.5949,
|
|
"mean_token_accuracy": 0.12143847793340683,
|
|
"num_tokens": 3391016.0,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"entropy": 6.533053731918335,
|
|
"epoch": 0.16116213352172376,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004999401492619207,
|
|
"loss": 6.3366,
|
|
"mean_token_accuracy": 0.13945143148303032,
|
|
"num_tokens": 3400294.0,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"entropy": 6.587634897232055,
|
|
"epoch": 0.16160367361356412,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999394207469928,
|
|
"loss": 6.5447,
|
|
"mean_token_accuracy": 0.12779992073774338,
|
|
"num_tokens": 3409685.0,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"entropy": 6.636557579040527,
|
|
"epoch": 0.16204521370540445,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999386878256297,
|
|
"loss": 6.4611,
|
|
"mean_token_accuracy": 0.1320968374609947,
|
|
"num_tokens": 3418946.0,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"entropy": 6.423011112213135,
|
|
"epoch": 0.16248675379724478,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999379504978457,
|
|
"loss": 6.468,
|
|
"mean_token_accuracy": 0.13338307663798332,
|
|
"num_tokens": 3428245.0,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"entropy": 6.578200912475586,
|
|
"epoch": 0.16292829388908514,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000499937208763655,
|
|
"loss": 6.4741,
|
|
"mean_token_accuracy": 0.1402788795530796,
|
|
"num_tokens": 3437580.0,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"entropy": 6.617180633544922,
|
|
"epoch": 0.16336983398092547,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999364626230724,
|
|
"loss": 6.4472,
|
|
"mean_token_accuracy": 0.14007550328969956,
|
|
"num_tokens": 3446393.0,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"entropy": 6.451139450073242,
|
|
"epoch": 0.1638113740727658,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999357120761124,
|
|
"loss": 6.551,
|
|
"mean_token_accuracy": 0.13571736514568328,
|
|
"num_tokens": 3455956.0,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"entropy": 6.6568724632263185,
|
|
"epoch": 0.16425291416460613,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999349571227898,
|
|
"loss": 6.5495,
|
|
"mean_token_accuracy": 0.13430711701512338,
|
|
"num_tokens": 3465722.0,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"entropy": 6.55973629951477,
|
|
"epoch": 0.1646944542564465,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004999341977631193,
|
|
"loss": 6.52,
|
|
"mean_token_accuracy": 0.12885117009282113,
|
|
"num_tokens": 3475467.0,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"entropy": 6.513189268112183,
|
|
"epoch": 0.16513599434828682,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999334339971157,
|
|
"loss": 6.4162,
|
|
"mean_token_accuracy": 0.13661258816719055,
|
|
"num_tokens": 3484931.0,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"entropy": 6.572188520431519,
|
|
"epoch": 0.16557753444012716,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004999326658247942,
|
|
"loss": 6.5161,
|
|
"mean_token_accuracy": 0.13407543525099755,
|
|
"num_tokens": 3494001.0,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"entropy": 6.522925567626953,
|
|
"epoch": 0.16601907453196751,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999318932461696,
|
|
"loss": 6.4558,
|
|
"mean_token_accuracy": 0.1386028841137886,
|
|
"num_tokens": 3503021.0,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"entropy": 6.46082649230957,
|
|
"epoch": 0.16646061462380785,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004999311162612571,
|
|
"loss": 6.4843,
|
|
"mean_token_accuracy": 0.13625017702579498,
|
|
"num_tokens": 3513005.0,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"entropy": 6.661849021911621,
|
|
"epoch": 0.16690215471564818,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.000499930334870072,
|
|
"loss": 6.5629,
|
|
"mean_token_accuracy": 0.13853515014052392,
|
|
"num_tokens": 3523219.0,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"entropy": 6.584684419631958,
|
|
"epoch": 0.1673436948074885,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004999295490726296,
|
|
"loss": 6.51,
|
|
"mean_token_accuracy": 0.13359814062714576,
|
|
"num_tokens": 3532917.0,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"entropy": 6.513212633132935,
|
|
"epoch": 0.16778523489932887,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999287588689453,
|
|
"loss": 6.5245,
|
|
"mean_token_accuracy": 0.1389150969684124,
|
|
"num_tokens": 3542845.0,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"entropy": 6.599180459976196,
|
|
"epoch": 0.1682267749911692,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999279642590344,
|
|
"loss": 6.5438,
|
|
"mean_token_accuracy": 0.13201173320412635,
|
|
"num_tokens": 3552126.0,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"entropy": 6.5960267066955565,
|
|
"epoch": 0.16866831508300953,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999271652429127,
|
|
"loss": 6.5561,
|
|
"mean_token_accuracy": 0.12684691920876504,
|
|
"num_tokens": 3561254.0,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"entropy": 6.528163957595825,
|
|
"epoch": 0.1691098551748499,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999263618205958,
|
|
"loss": 6.3933,
|
|
"mean_token_accuracy": 0.1403527893126011,
|
|
"num_tokens": 3569781.0,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"entropy": 6.555511140823365,
|
|
"epoch": 0.16955139526669022,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999255539920993,
|
|
"loss": 6.4866,
|
|
"mean_token_accuracy": 0.13409090787172318,
|
|
"num_tokens": 3579664.0,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"entropy": 6.567546367645264,
|
|
"epoch": 0.16999293535853055,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999247417574391,
|
|
"loss": 6.5376,
|
|
"mean_token_accuracy": 0.13570395410060881,
|
|
"num_tokens": 3588671.0,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"entropy": 6.54329285621643,
|
|
"epoch": 0.17043447545037088,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004999239251166312,
|
|
"loss": 6.4281,
|
|
"mean_token_accuracy": 0.13799721151590347,
|
|
"num_tokens": 3597656.0,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"entropy": 6.480014228820801,
|
|
"epoch": 0.17087601554221124,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004999231040696914,
|
|
"loss": 6.4491,
|
|
"mean_token_accuracy": 0.13938046917319297,
|
|
"num_tokens": 3608017.0,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"entropy": 6.529258060455322,
|
|
"epoch": 0.17131755563405157,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999222786166361,
|
|
"loss": 6.5236,
|
|
"mean_token_accuracy": 0.1355483777821064,
|
|
"num_tokens": 3618189.0,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"entropy": 6.60916223526001,
|
|
"epoch": 0.1717590957258919,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999214487574812,
|
|
"loss": 6.4772,
|
|
"mean_token_accuracy": 0.13526797890663148,
|
|
"num_tokens": 3627211.0,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"entropy": 6.498088455200195,
|
|
"epoch": 0.17220063581773226,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999206144922431,
|
|
"loss": 6.4181,
|
|
"mean_token_accuracy": 0.13236208409070968,
|
|
"num_tokens": 3636781.0,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"entropy": 6.4924522876739506,
|
|
"epoch": 0.1726421759095726,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000499919775820938,
|
|
"loss": 6.4985,
|
|
"mean_token_accuracy": 0.13709900975227357,
|
|
"num_tokens": 3644891.0,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"entropy": 6.591422748565674,
|
|
"epoch": 0.17308371600141292,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999189327435825,
|
|
"loss": 6.5295,
|
|
"mean_token_accuracy": 0.13574066162109374,
|
|
"num_tokens": 3655477.0,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"entropy": 6.5104138374328615,
|
|
"epoch": 0.17352525609325326,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999180852601929,
|
|
"loss": 6.5386,
|
|
"mean_token_accuracy": 0.13653166219592094,
|
|
"num_tokens": 3664542.0,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"entropy": 6.577597141265869,
|
|
"epoch": 0.17396679618509361,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000499917233370786,
|
|
"loss": 6.4609,
|
|
"mean_token_accuracy": 0.1293606199324131,
|
|
"num_tokens": 3673806.0,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"entropy": 6.484398126602173,
|
|
"epoch": 0.17440833627693395,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999163770753784,
|
|
"loss": 6.4524,
|
|
"mean_token_accuracy": 0.13555625528097154,
|
|
"num_tokens": 3683238.0,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"entropy": 6.558987331390381,
|
|
"epoch": 0.17484987636877428,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004999155163739869,
|
|
"loss": 6.4372,
|
|
"mean_token_accuracy": 0.13345005139708518,
|
|
"num_tokens": 3692161.0,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"entropy": 6.510820627212524,
|
|
"epoch": 0.17529141646061464,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999146512666284,
|
|
"loss": 6.4535,
|
|
"mean_token_accuracy": 0.13496344164013863,
|
|
"num_tokens": 3701431.0,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"entropy": 6.530817365646362,
|
|
"epoch": 0.17573295655245497,
|
|
"grad_norm": 0.890625,
|
|
"learning_rate": 0.0004999137817533197,
|
|
"loss": 6.4293,
|
|
"mean_token_accuracy": 0.1400221474468708,
|
|
"num_tokens": 3710963.0,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"entropy": 6.48544750213623,
|
|
"epoch": 0.1761744966442953,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999129078340779,
|
|
"loss": 6.4221,
|
|
"mean_token_accuracy": 0.14005866199731826,
|
|
"num_tokens": 3720177.0,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"entropy": 6.465721511840821,
|
|
"epoch": 0.17661603673613563,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999120295089202,
|
|
"loss": 6.3795,
|
|
"mean_token_accuracy": 0.14514245688915253,
|
|
"num_tokens": 3728123.0,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"entropy": 6.571569442749023,
|
|
"epoch": 0.177057576827976,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004999111467778639,
|
|
"loss": 6.4978,
|
|
"mean_token_accuracy": 0.1349009484052658,
|
|
"num_tokens": 3736869.0,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"entropy": 6.4819518566131595,
|
|
"epoch": 0.17749911691981632,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000499910259640926,
|
|
"loss": 6.3989,
|
|
"mean_token_accuracy": 0.1346891440451145,
|
|
"num_tokens": 3745830.0,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"entropy": 6.486936283111572,
|
|
"epoch": 0.17794065701165665,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.000499909368098124,
|
|
"loss": 6.4328,
|
|
"mean_token_accuracy": 0.13926308006048202,
|
|
"num_tokens": 3755019.0,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"entropy": 6.517996597290039,
|
|
"epoch": 0.178382197103497,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004999084721494754,
|
|
"loss": 6.4076,
|
|
"mean_token_accuracy": 0.13306454047560692,
|
|
"num_tokens": 3764814.0,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"entropy": 6.465504217147827,
|
|
"epoch": 0.17882373719533734,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999075717949978,
|
|
"loss": 6.384,
|
|
"mean_token_accuracy": 0.14004691764712335,
|
|
"num_tokens": 3774258.0,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"entropy": 6.422070741653442,
|
|
"epoch": 0.17926527728717767,
|
|
"grad_norm": 0.9375,
|
|
"learning_rate": 0.0004999066670347089,
|
|
"loss": 6.4155,
|
|
"mean_token_accuracy": 0.14247353076934816,
|
|
"num_tokens": 3783961.0,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"entropy": 6.416653776168824,
|
|
"epoch": 0.179706817379018,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004999057578686261,
|
|
"loss": 6.3804,
|
|
"mean_token_accuracy": 0.1390853337943554,
|
|
"num_tokens": 3792417.0,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"entropy": 6.496200704574585,
|
|
"epoch": 0.18014835747085836,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999048442967675,
|
|
"loss": 6.3592,
|
|
"mean_token_accuracy": 0.13822411596775055,
|
|
"num_tokens": 3801479.0,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"entropy": 6.456766891479492,
|
|
"epoch": 0.1805898975626987,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999039263191508,
|
|
"loss": 6.4305,
|
|
"mean_token_accuracy": 0.133939179033041,
|
|
"num_tokens": 3810799.0,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"entropy": 6.5264753818511965,
|
|
"epoch": 0.18103143765453902,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999030039357943,
|
|
"loss": 6.48,
|
|
"mean_token_accuracy": 0.1298608623445034,
|
|
"num_tokens": 3820966.0,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"entropy": 6.493787717819214,
|
|
"epoch": 0.18147297774637938,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999020771467158,
|
|
"loss": 6.4572,
|
|
"mean_token_accuracy": 0.1327923409640789,
|
|
"num_tokens": 3829247.0,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"entropy": 6.596197843551636,
|
|
"epoch": 0.1819145178382197,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999011459519335,
|
|
"loss": 6.3948,
|
|
"mean_token_accuracy": 0.13827238082885743,
|
|
"num_tokens": 3838114.0,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"entropy": 6.390679597854614,
|
|
"epoch": 0.18235605793006004,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999002103514655,
|
|
"loss": 6.4735,
|
|
"mean_token_accuracy": 0.13869670927524566,
|
|
"num_tokens": 3848075.0,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"entropy": 6.585737419128418,
|
|
"epoch": 0.18279759802190038,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998992703453304,
|
|
"loss": 6.5106,
|
|
"mean_token_accuracy": 0.13122646436095237,
|
|
"num_tokens": 3857934.0,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"entropy": 6.450649833679199,
|
|
"epoch": 0.18323913811374073,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004998983259335466,
|
|
"loss": 6.3519,
|
|
"mean_token_accuracy": 0.14236594662070273,
|
|
"num_tokens": 3866707.0,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"entropy": 6.58008508682251,
|
|
"epoch": 0.18368067820558107,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998973771161324,
|
|
"loss": 6.4244,
|
|
"mean_token_accuracy": 0.1374949462711811,
|
|
"num_tokens": 3875233.0,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"entropy": 6.389654541015625,
|
|
"epoch": 0.1841222182974214,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004998964238931065,
|
|
"loss": 6.4131,
|
|
"mean_token_accuracy": 0.1403422772884369,
|
|
"num_tokens": 3885173.0,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"entropy": 6.564711236953736,
|
|
"epoch": 0.18456375838926176,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998954662644876,
|
|
"loss": 6.3803,
|
|
"mean_token_accuracy": 0.13195990920066833,
|
|
"num_tokens": 3894198.0,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"entropy": 6.44653811454773,
|
|
"epoch": 0.1850052984811021,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998945042302943,
|
|
"loss": 6.382,
|
|
"mean_token_accuracy": 0.1373509407043457,
|
|
"num_tokens": 3904076.0,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"entropy": 6.4586262702941895,
|
|
"epoch": 0.18544683857294242,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998935377905457,
|
|
"loss": 6.4943,
|
|
"mean_token_accuracy": 0.13231708630919456,
|
|
"num_tokens": 3913204.0,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"entropy": 6.587808132171631,
|
|
"epoch": 0.18588837866478275,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004998925669452605,
|
|
"loss": 6.4565,
|
|
"mean_token_accuracy": 0.1339179016649723,
|
|
"num_tokens": 3922148.0,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"entropy": 6.401728963851928,
|
|
"epoch": 0.1863299187566231,
|
|
"grad_norm": 0.921875,
|
|
"learning_rate": 0.0004998915916944579,
|
|
"loss": 6.4234,
|
|
"mean_token_accuracy": 0.1407448723912239,
|
|
"num_tokens": 3931333.0,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"entropy": 6.545709466934204,
|
|
"epoch": 0.18677145884846344,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998906120381568,
|
|
"loss": 6.3789,
|
|
"mean_token_accuracy": 0.1448886923491955,
|
|
"num_tokens": 3941061.0,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"entropy": 6.505582046508789,
|
|
"epoch": 0.18721299894030377,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998896279763766,
|
|
"loss": 6.4761,
|
|
"mean_token_accuracy": 0.13257319629192352,
|
|
"num_tokens": 3950075.0,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"entropy": 6.455861520767212,
|
|
"epoch": 0.18765453903214413,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998886395091365,
|
|
"loss": 6.3345,
|
|
"mean_token_accuracy": 0.1409289576113224,
|
|
"num_tokens": 3958885.0,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"entropy": 6.417616128921509,
|
|
"epoch": 0.18809607912398446,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998876466364559,
|
|
"loss": 6.437,
|
|
"mean_token_accuracy": 0.13843559697270394,
|
|
"num_tokens": 3968218.0,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"entropy": 6.400978994369507,
|
|
"epoch": 0.1885376192158248,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998866493583541,
|
|
"loss": 6.364,
|
|
"mean_token_accuracy": 0.14499804973602295,
|
|
"num_tokens": 3977435.0,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"entropy": 6.388528203964233,
|
|
"epoch": 0.18897915930766512,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004998856476748509,
|
|
"loss": 6.349,
|
|
"mean_token_accuracy": 0.14326094537973405,
|
|
"num_tokens": 3986608.0,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"entropy": 6.440869951248169,
|
|
"epoch": 0.18942069939950548,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998846415859656,
|
|
"loss": 6.3602,
|
|
"mean_token_accuracy": 0.14130929261445999,
|
|
"num_tokens": 3996087.0,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"entropy": 6.476541662216187,
|
|
"epoch": 0.1898622394913458,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004998836310917182,
|
|
"loss": 6.4058,
|
|
"mean_token_accuracy": 0.13654726892709732,
|
|
"num_tokens": 4006257.0,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"entropy": 6.478487300872803,
|
|
"epoch": 0.19030377958318614,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004998826161921282,
|
|
"loss": 6.4072,
|
|
"mean_token_accuracy": 0.14593140706419944,
|
|
"num_tokens": 4015904.0,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"entropy": 6.427832221984863,
|
|
"epoch": 0.1907453196750265,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004998815968872157,
|
|
"loss": 6.4181,
|
|
"mean_token_accuracy": 0.13653010204434396,
|
|
"num_tokens": 4025417.0,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"entropy": 6.392728614807129,
|
|
"epoch": 0.19118685976686683,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004998805731770007,
|
|
"loss": 6.2628,
|
|
"mean_token_accuracy": 0.15659967064857483,
|
|
"num_tokens": 4035181.0,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"entropy": 6.4461814880371096,
|
|
"epoch": 0.19162839985870717,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.000499879545061503,
|
|
"loss": 6.4504,
|
|
"mean_token_accuracy": 0.13371687456965448,
|
|
"num_tokens": 4045112.0,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"entropy": 6.5883321285247805,
|
|
"epoch": 0.1920699399505475,
|
|
"grad_norm": 0.9609375,
|
|
"learning_rate": 0.0004998785125407432,
|
|
"loss": 6.5425,
|
|
"mean_token_accuracy": 0.12125966772437095,
|
|
"num_tokens": 4054566.0,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"entropy": 6.552111434936523,
|
|
"epoch": 0.19251148004238786,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.000499877475614741,
|
|
"loss": 6.3758,
|
|
"mean_token_accuracy": 0.1340583384037018,
|
|
"num_tokens": 4063960.0,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"entropy": 6.3515486240386965,
|
|
"epoch": 0.1929530201342282,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004998764342835169,
|
|
"loss": 6.3157,
|
|
"mean_token_accuracy": 0.14617449343204497,
|
|
"num_tokens": 4072620.0,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"entropy": 6.406283712387085,
|
|
"epoch": 0.19339456022606852,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004998753885470915,
|
|
"loss": 6.2789,
|
|
"mean_token_accuracy": 0.14062159806489943,
|
|
"num_tokens": 4081590.0,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"entropy": 6.425132322311401,
|
|
"epoch": 0.19383610031790888,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998743384054851,
|
|
"loss": 6.4202,
|
|
"mean_token_accuracy": 0.14044143706560136,
|
|
"num_tokens": 4090758.0,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"entropy": 6.3962568759918215,
|
|
"epoch": 0.1942776404097492,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004998732838587183,
|
|
"loss": 6.2458,
|
|
"mean_token_accuracy": 0.149021477997303,
|
|
"num_tokens": 4099281.0,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"entropy": 6.40768404006958,
|
|
"epoch": 0.19471918050158954,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998722249068118,
|
|
"loss": 6.3451,
|
|
"mean_token_accuracy": 0.1404854990541935,
|
|
"num_tokens": 4108953.0,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"entropy": 6.396564531326294,
|
|
"epoch": 0.19516072059342987,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004998711615497863,
|
|
"loss": 6.4535,
|
|
"mean_token_accuracy": 0.13740591406822206,
|
|
"num_tokens": 4118799.0,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"entropy": 6.579476261138916,
|
|
"epoch": 0.19560226068527023,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004998700937876626,
|
|
"loss": 6.4122,
|
|
"mean_token_accuracy": 0.13571444600820542,
|
|
"num_tokens": 4127862.0,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"entropy": 6.4028857231140135,
|
|
"epoch": 0.19604380077711056,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998690216204615,
|
|
"loss": 6.5068,
|
|
"mean_token_accuracy": 0.12704429849982263,
|
|
"num_tokens": 4139029.0,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"entropy": 6.5047478675842285,
|
|
"epoch": 0.1964853408689509,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998679450482043,
|
|
"loss": 6.3028,
|
|
"mean_token_accuracy": 0.14782762974500657,
|
|
"num_tokens": 4148257.0,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"entropy": 6.341253662109375,
|
|
"epoch": 0.19692688096079125,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.000499866864070912,
|
|
"loss": 6.2627,
|
|
"mean_token_accuracy": 0.14017492160201073,
|
|
"num_tokens": 4157626.0,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"entropy": 6.429310417175293,
|
|
"epoch": 0.19736842105263158,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004998657786886056,
|
|
"loss": 6.4713,
|
|
"mean_token_accuracy": 0.13993202298879623,
|
|
"num_tokens": 4166804.0,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"entropy": 6.554042720794678,
|
|
"epoch": 0.1978099611444719,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004998646889013066,
|
|
"loss": 6.3607,
|
|
"mean_token_accuracy": 0.14240839183330536,
|
|
"num_tokens": 4175701.0,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"entropy": 6.4098762512207035,
|
|
"epoch": 0.19825150123631224,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004998635947090362,
|
|
"loss": 6.4425,
|
|
"mean_token_accuracy": 0.14030273035168647,
|
|
"num_tokens": 4184711.0,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"entropy": 6.475190782546997,
|
|
"epoch": 0.1986930413281526,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004998624961118158,
|
|
"loss": 6.4017,
|
|
"mean_token_accuracy": 0.14099944159388542,
|
|
"num_tokens": 4193931.0,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"entropy": 6.44319109916687,
|
|
"epoch": 0.19913458141999293,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499861393109667,
|
|
"loss": 6.2899,
|
|
"mean_token_accuracy": 0.14828752726316452,
|
|
"num_tokens": 4203000.0,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"entropy": 6.274944400787353,
|
|
"epoch": 0.19957612151183327,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004998602857026114,
|
|
"loss": 6.2991,
|
|
"mean_token_accuracy": 0.1458234503865242,
|
|
"num_tokens": 4211977.0,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"entropy": 6.472422647476196,
|
|
"epoch": 0.20001766160367362,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998591738906708,
|
|
"loss": 6.3814,
|
|
"mean_token_accuracy": 0.15182094275951385,
|
|
"num_tokens": 4220375.0,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"entropy": 6.494894886016846,
|
|
"epoch": 0.20045920169551396,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998580576738668,
|
|
"loss": 6.3793,
|
|
"mean_token_accuracy": 0.1367909237742424,
|
|
"num_tokens": 4230506.0,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"entropy": 6.404332447052002,
|
|
"epoch": 0.2009007417873543,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004998569370522213,
|
|
"loss": 6.3524,
|
|
"mean_token_accuracy": 0.13977290093898773,
|
|
"num_tokens": 4240235.0,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"entropy": 6.448247480392456,
|
|
"epoch": 0.20134228187919462,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004998558120257563,
|
|
"loss": 6.4189,
|
|
"mean_token_accuracy": 0.14339498728513717,
|
|
"num_tokens": 4249775.0,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"entropy": 6.419510173797607,
|
|
"epoch": 0.20178382197103498,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998546825944938,
|
|
"loss": 6.3504,
|
|
"mean_token_accuracy": 0.141066338121891,
|
|
"num_tokens": 4258523.0,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"entropy": 6.419132709503174,
|
|
"epoch": 0.2022253620628753,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000499853548758456,
|
|
"loss": 6.358,
|
|
"mean_token_accuracy": 0.13891511633992196,
|
|
"num_tokens": 4267683.0,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"entropy": 6.495290946960449,
|
|
"epoch": 0.20266690215471564,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.000499852410517665,
|
|
"loss": 6.417,
|
|
"mean_token_accuracy": 0.13713881745934486,
|
|
"num_tokens": 4276903.0,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"entropy": 6.426017999649048,
|
|
"epoch": 0.203108442246556,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004998512678721431,
|
|
"loss": 6.3661,
|
|
"mean_token_accuracy": 0.135909353941679,
|
|
"num_tokens": 4287257.0,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"entropy": 6.447468948364258,
|
|
"epoch": 0.20354998233839633,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000499850120821913,
|
|
"loss": 6.3875,
|
|
"mean_token_accuracy": 0.13645304143428802,
|
|
"num_tokens": 4297345.0,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"entropy": 6.528428220748902,
|
|
"epoch": 0.20399152243023666,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004998489693669967,
|
|
"loss": 6.3613,
|
|
"mean_token_accuracy": 0.13837311565876007,
|
|
"num_tokens": 4306533.0,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"entropy": 6.392482471466065,
|
|
"epoch": 0.204433062522077,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000499847813507417,
|
|
"loss": 6.4023,
|
|
"mean_token_accuracy": 0.14094773977994918,
|
|
"num_tokens": 4316338.0,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"entropy": 6.399602174758911,
|
|
"epoch": 0.20487460261391735,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004998466532431966,
|
|
"loss": 6.3549,
|
|
"mean_token_accuracy": 0.13948202207684518,
|
|
"num_tokens": 4326585.0,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"entropy": 6.544098567962647,
|
|
"epoch": 0.20531614270575768,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0004998454885743581,
|
|
"loss": 6.4795,
|
|
"mean_token_accuracy": 0.1351695440709591,
|
|
"num_tokens": 4336490.0,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"entropy": 6.363640975952149,
|
|
"epoch": 0.205757682797598,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998443195009242,
|
|
"loss": 6.3589,
|
|
"mean_token_accuracy": 0.1348782531917095,
|
|
"num_tokens": 4346264.0,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"entropy": 6.461185693740845,
|
|
"epoch": 0.20619922288943837,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004998431460229182,
|
|
"loss": 6.3318,
|
|
"mean_token_accuracy": 0.1448797807097435,
|
|
"num_tokens": 4355102.0,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"entropy": 6.482104396820068,
|
|
"epoch": 0.2066407629812787,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004998419681403627,
|
|
"loss": 6.5133,
|
|
"mean_token_accuracy": 0.13086750581860543,
|
|
"num_tokens": 4365569.0,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"entropy": 6.507464361190796,
|
|
"epoch": 0.20708230307311903,
|
|
"grad_norm": 0.91796875,
|
|
"learning_rate": 0.0004998407858532809,
|
|
"loss": 6.36,
|
|
"mean_token_accuracy": 0.14296017587184906,
|
|
"num_tokens": 4375437.0,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"entropy": 6.37006025314331,
|
|
"epoch": 0.20752384316495937,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000499839599161696,
|
|
"loss": 6.3389,
|
|
"mean_token_accuracy": 0.1450774312019348,
|
|
"num_tokens": 4384420.0,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"entropy": 6.460700035095215,
|
|
"epoch": 0.20796538325679972,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998384080656314,
|
|
"loss": 6.3106,
|
|
"mean_token_accuracy": 0.14050144031643869,
|
|
"num_tokens": 4393730.0,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"entropy": 6.332716035842895,
|
|
"epoch": 0.20840692334864006,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.00049983721256511,
|
|
"loss": 6.2798,
|
|
"mean_token_accuracy": 0.14351205080747603,
|
|
"num_tokens": 4402731.0,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"entropy": 6.3846518993377686,
|
|
"epoch": 0.2088484634404804,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998360126601556,
|
|
"loss": 6.3369,
|
|
"mean_token_accuracy": 0.14289727210998535,
|
|
"num_tokens": 4411606.0,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"entropy": 6.3337644100189205,
|
|
"epoch": 0.20929000353232075,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998348083507916,
|
|
"loss": 6.4062,
|
|
"mean_token_accuracy": 0.14081210866570473,
|
|
"num_tokens": 4421685.0,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"entropy": 6.558507633209229,
|
|
"epoch": 0.20973154362416108,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998335996370416,
|
|
"loss": 6.3782,
|
|
"mean_token_accuracy": 0.14399294778704644,
|
|
"num_tokens": 4431765.0,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"entropy": 6.321137380599976,
|
|
"epoch": 0.2101730837160014,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998323865189291,
|
|
"loss": 6.3523,
|
|
"mean_token_accuracy": 0.13661579713225364,
|
|
"num_tokens": 4441191.0,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"entropy": 6.481552457809448,
|
|
"epoch": 0.21061462380784174,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998311689964781,
|
|
"loss": 6.4322,
|
|
"mean_token_accuracy": 0.13680859059095382,
|
|
"num_tokens": 4450156.0,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"entropy": 6.4807047843933105,
|
|
"epoch": 0.2110561638996821,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998299470697125,
|
|
"loss": 6.4163,
|
|
"mean_token_accuracy": 0.14260546639561653,
|
|
"num_tokens": 4459466.0,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"entropy": 6.461113500595093,
|
|
"epoch": 0.21149770399152243,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004998287207386559,
|
|
"loss": 6.4156,
|
|
"mean_token_accuracy": 0.14247968047857285,
|
|
"num_tokens": 4468539.0,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"entropy": 6.525749444961548,
|
|
"epoch": 0.21193924408336276,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998274900033326,
|
|
"loss": 6.315,
|
|
"mean_token_accuracy": 0.14835015684366226,
|
|
"num_tokens": 4477579.0,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"entropy": 6.236664247512818,
|
|
"epoch": 0.21238078417520312,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004998262548637667,
|
|
"loss": 6.2842,
|
|
"mean_token_accuracy": 0.14991160482168198,
|
|
"num_tokens": 4486800.0,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"entropy": 6.444223546981812,
|
|
"epoch": 0.21282232426704345,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998250153199822,
|
|
"loss": 6.2465,
|
|
"mean_token_accuracy": 0.14562757611274718,
|
|
"num_tokens": 4495985.0,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"entropy": 6.417825555801391,
|
|
"epoch": 0.21326386435888378,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004998237713720036,
|
|
"loss": 6.4031,
|
|
"mean_token_accuracy": 0.14113787487149237,
|
|
"num_tokens": 4504944.0,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"entropy": 6.344033908843994,
|
|
"epoch": 0.2137054044507241,
|
|
"grad_norm": 0.9296875,
|
|
"learning_rate": 0.0004998225230198552,
|
|
"loss": 6.2875,
|
|
"mean_token_accuracy": 0.14928205609321593,
|
|
"num_tokens": 4515402.0,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"entropy": 6.412223052978516,
|
|
"epoch": 0.21414694454256447,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004998212702635614,
|
|
"loss": 6.355,
|
|
"mean_token_accuracy": 0.14224686175584794,
|
|
"num_tokens": 4525009.0,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"entropy": 6.537710332870484,
|
|
"epoch": 0.2145884846344048,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004998200131031469,
|
|
"loss": 6.4066,
|
|
"mean_token_accuracy": 0.142412006855011,
|
|
"num_tokens": 4534460.0,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"entropy": 6.355252885818482,
|
|
"epoch": 0.21503002472624513,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004998187515386361,
|
|
"loss": 6.2239,
|
|
"mean_token_accuracy": 0.15292632952332497,
|
|
"num_tokens": 4543748.0,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"entropy": 6.32159743309021,
|
|
"epoch": 0.2154715648180855,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998174855700538,
|
|
"loss": 6.3515,
|
|
"mean_token_accuracy": 0.14536840543150903,
|
|
"num_tokens": 4552722.0,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"entropy": 6.455835342407227,
|
|
"epoch": 0.21591310490992582,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004998162151974248,
|
|
"loss": 6.2244,
|
|
"mean_token_accuracy": 0.1433302193880081,
|
|
"num_tokens": 4561607.0,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"entropy": 6.376346635818481,
|
|
"epoch": 0.21635464500176615,
|
|
"grad_norm": 0.89453125,
|
|
"learning_rate": 0.000499814940420774,
|
|
"loss": 6.4926,
|
|
"mean_token_accuracy": 0.13043315410614015,
|
|
"num_tokens": 4572524.0,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"entropy": 6.473890399932861,
|
|
"epoch": 0.21679618509360649,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004998136612401266,
|
|
"loss": 6.306,
|
|
"mean_token_accuracy": 0.13808697760105132,
|
|
"num_tokens": 4581601.0,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"entropy": 6.362298917770386,
|
|
"epoch": 0.21723772518544684,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998123776555071,
|
|
"loss": 6.355,
|
|
"mean_token_accuracy": 0.13434374257922171,
|
|
"num_tokens": 4591795.0,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"entropy": 6.4316198348999025,
|
|
"epoch": 0.21767926527728718,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998110896669412,
|
|
"loss": 6.3127,
|
|
"mean_token_accuracy": 0.14232389852404595,
|
|
"num_tokens": 4600745.0,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"entropy": 6.403714847564697,
|
|
"epoch": 0.2181208053691275,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004998097972744539,
|
|
"loss": 6.3668,
|
|
"mean_token_accuracy": 0.13958390951156616,
|
|
"num_tokens": 4610490.0,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"entropy": 6.421542739868164,
|
|
"epoch": 0.21856234546096787,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998085004780705,
|
|
"loss": 6.3027,
|
|
"mean_token_accuracy": 0.14644000679254532,
|
|
"num_tokens": 4619511.0,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"entropy": 6.411759996414185,
|
|
"epoch": 0.2190038855528082,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998071992778164,
|
|
"loss": 6.3536,
|
|
"mean_token_accuracy": 0.13926490917801856,
|
|
"num_tokens": 4628186.0,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"entropy": 6.416973400115967,
|
|
"epoch": 0.21944542564464853,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499805893673717,
|
|
"loss": 6.3574,
|
|
"mean_token_accuracy": 0.14341954439878463,
|
|
"num_tokens": 4637431.0,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"entropy": 6.347478723526001,
|
|
"epoch": 0.21988696573648886,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004998045836657982,
|
|
"loss": 6.2093,
|
|
"mean_token_accuracy": 0.14213306605815887,
|
|
"num_tokens": 4646627.0,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"entropy": 6.368200349807739,
|
|
"epoch": 0.22032850582832922,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004998032692540853,
|
|
"loss": 6.3012,
|
|
"mean_token_accuracy": 0.14045739471912383,
|
|
"num_tokens": 4656095.0,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"entropy": 6.476345205307007,
|
|
"epoch": 0.22077004592016955,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998019504386044,
|
|
"loss": 6.3876,
|
|
"mean_token_accuracy": 0.13640450164675713,
|
|
"num_tokens": 4665807.0,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"entropy": 6.444914436340332,
|
|
"epoch": 0.22121158601200988,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004998006272193809,
|
|
"loss": 6.3493,
|
|
"mean_token_accuracy": 0.13855071663856505,
|
|
"num_tokens": 4674459.0,
|
|
"step": 2505
|
|
},
|
|
{
|
|
"entropy": 6.361084604263306,
|
|
"epoch": 0.22165312610385024,
|
|
"grad_norm": 0.92578125,
|
|
"learning_rate": 0.0004997992995964412,
|
|
"loss": 6.4325,
|
|
"mean_token_accuracy": 0.13779560700058938,
|
|
"num_tokens": 4684063.0,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"entropy": 6.45024299621582,
|
|
"epoch": 0.22209466619569057,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997979675698109,
|
|
"loss": 6.3029,
|
|
"mean_token_accuracy": 0.14212062656879426,
|
|
"num_tokens": 4692807.0,
|
|
"step": 2515
|
|
},
|
|
{
|
|
"entropy": 6.438331031799317,
|
|
"epoch": 0.2225362062875309,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004997966311395164,
|
|
"loss": 6.2422,
|
|
"mean_token_accuracy": 0.14746622294187545,
|
|
"num_tokens": 4701100.0,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"entropy": 6.284497547149658,
|
|
"epoch": 0.22297774637937123,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004997952903055836,
|
|
"loss": 6.3071,
|
|
"mean_token_accuracy": 0.13971827551722527,
|
|
"num_tokens": 4710697.0,
|
|
"step": 2525
|
|
},
|
|
{
|
|
"entropy": 6.4242840766906735,
|
|
"epoch": 0.2234192864712116,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000499793945068039,
|
|
"loss": 6.2875,
|
|
"mean_token_accuracy": 0.14202770590782166,
|
|
"num_tokens": 4718745.0,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"entropy": 6.358328342437744,
|
|
"epoch": 0.22386082656305192,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004997925954269088,
|
|
"loss": 6.2493,
|
|
"mean_token_accuracy": 0.15010830983519555,
|
|
"num_tokens": 4728056.0,
|
|
"step": 2535
|
|
},
|
|
{
|
|
"entropy": 6.390234851837159,
|
|
"epoch": 0.22430236665489225,
|
|
"grad_norm": 0.9765625,
|
|
"learning_rate": 0.0004997912413822196,
|
|
"loss": 6.3892,
|
|
"mean_token_accuracy": 0.14226726815104485,
|
|
"num_tokens": 4737605.0,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"entropy": 6.333422613143921,
|
|
"epoch": 0.2247439067467326,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997898829339979,
|
|
"loss": 6.216,
|
|
"mean_token_accuracy": 0.15019772350788116,
|
|
"num_tokens": 4746168.0,
|
|
"step": 2545
|
|
},
|
|
{
|
|
"entropy": 6.414393615722656,
|
|
"epoch": 0.22518544683857294,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.00049978852008227,
|
|
"loss": 6.2827,
|
|
"mean_token_accuracy": 0.14390757903456688,
|
|
"num_tokens": 4755072.0,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"entropy": 6.292112064361572,
|
|
"epoch": 0.22562698693041328,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.000499787152827063,
|
|
"loss": 6.2887,
|
|
"mean_token_accuracy": 0.14927180036902427,
|
|
"num_tokens": 4764580.0,
|
|
"step": 2555
|
|
},
|
|
{
|
|
"entropy": 6.362084197998047,
|
|
"epoch": 0.2260685270222536,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004997857811684035,
|
|
"loss": 6.3109,
|
|
"mean_token_accuracy": 0.14619807451963424,
|
|
"num_tokens": 4774135.0,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"entropy": 6.443705081939697,
|
|
"epoch": 0.22651006711409397,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004997844051063183,
|
|
"loss": 6.3931,
|
|
"mean_token_accuracy": 0.1407366193830967,
|
|
"num_tokens": 4784733.0,
|
|
"step": 2565
|
|
},
|
|
{
|
|
"entropy": 6.393408250808716,
|
|
"epoch": 0.2269516072059343,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.0004997830246408346,
|
|
"loss": 6.3304,
|
|
"mean_token_accuracy": 0.14791636019945145,
|
|
"num_tokens": 4795327.0,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"entropy": 6.342170095443725,
|
|
"epoch": 0.22739314729777463,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004997816397719791,
|
|
"loss": 6.3028,
|
|
"mean_token_accuracy": 0.14114121049642564,
|
|
"num_tokens": 4804314.0,
|
|
"step": 2575
|
|
},
|
|
{
|
|
"entropy": 6.458865118026734,
|
|
"epoch": 0.227834687389615,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997802504997792,
|
|
"loss": 6.3913,
|
|
"mean_token_accuracy": 0.13652418628335,
|
|
"num_tokens": 4813637.0,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"entropy": 6.382553291320801,
|
|
"epoch": 0.22827622748145532,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004997788568242621,
|
|
"loss": 6.2591,
|
|
"mean_token_accuracy": 0.14338775500655174,
|
|
"num_tokens": 4823094.0,
|
|
"step": 2585
|
|
},
|
|
{
|
|
"entropy": 6.327674865722656,
|
|
"epoch": 0.22871776757329565,
|
|
"grad_norm": 0.8984375,
|
|
"learning_rate": 0.000499777458745455,
|
|
"loss": 6.1969,
|
|
"mean_token_accuracy": 0.1477431207895279,
|
|
"num_tokens": 4833199.0,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"entropy": 6.392719554901123,
|
|
"epoch": 0.22915930766513598,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004997760562633853,
|
|
"loss": 6.2909,
|
|
"mean_token_accuracy": 0.14219107255339622,
|
|
"num_tokens": 4842970.0,
|
|
"step": 2595
|
|
},
|
|
{
|
|
"entropy": 6.394578695297241,
|
|
"epoch": 0.22960084775697634,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004997746493780804,
|
|
"loss": 6.3788,
|
|
"mean_token_accuracy": 0.13738251477479935,
|
|
"num_tokens": 4852043.0,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"entropy": 6.376160097122193,
|
|
"epoch": 0.23004238784881667,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499773238089568,
|
|
"loss": 6.2693,
|
|
"mean_token_accuracy": 0.13766007199883462,
|
|
"num_tokens": 4862232.0,
|
|
"step": 2605
|
|
},
|
|
{
|
|
"entropy": 6.345012950897217,
|
|
"epoch": 0.230483927940657,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004997718223978758,
|
|
"loss": 6.2081,
|
|
"mean_token_accuracy": 0.1466532751917839,
|
|
"num_tokens": 4871186.0,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"entropy": 6.343753385543823,
|
|
"epoch": 0.23092546803249736,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997704023030315,
|
|
"loss": 6.3059,
|
|
"mean_token_accuracy": 0.150559451431036,
|
|
"num_tokens": 4879974.0,
|
|
"step": 2615
|
|
},
|
|
{
|
|
"entropy": 6.393126726150513,
|
|
"epoch": 0.2313670081243377,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997689778050627,
|
|
"loss": 6.3617,
|
|
"mean_token_accuracy": 0.14292784333229064,
|
|
"num_tokens": 4890300.0,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"entropy": 6.369229030609131,
|
|
"epoch": 0.23180854821617802,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004997675489039975,
|
|
"loss": 6.3301,
|
|
"mean_token_accuracy": 0.1426799289882183,
|
|
"num_tokens": 4900428.0,
|
|
"step": 2625
|
|
},
|
|
{
|
|
"entropy": 6.3974145412445065,
|
|
"epoch": 0.23225008830801835,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004997661155998638,
|
|
"loss": 6.3245,
|
|
"mean_token_accuracy": 0.1442883849143982,
|
|
"num_tokens": 4910092.0,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"entropy": 6.384046411514282,
|
|
"epoch": 0.2326916283998587,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004997646778926898,
|
|
"loss": 6.3247,
|
|
"mean_token_accuracy": 0.13738622814416884,
|
|
"num_tokens": 4919593.0,
|
|
"step": 2635
|
|
},
|
|
{
|
|
"entropy": 6.373585891723633,
|
|
"epoch": 0.23313316849169904,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004997632357825035,
|
|
"loss": 6.3001,
|
|
"mean_token_accuracy": 0.14202155098319053,
|
|
"num_tokens": 4929098.0,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"entropy": 6.43012547492981,
|
|
"epoch": 0.23357470858353938,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004997617892693333,
|
|
"loss": 6.3657,
|
|
"mean_token_accuracy": 0.1421157017350197,
|
|
"num_tokens": 4938265.0,
|
|
"step": 2645
|
|
},
|
|
{
|
|
"entropy": 6.394748878479004,
|
|
"epoch": 0.23401624867537973,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004997603383532075,
|
|
"loss": 6.3016,
|
|
"mean_token_accuracy": 0.14679210409522056,
|
|
"num_tokens": 4946694.0,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"entropy": 6.368328046798706,
|
|
"epoch": 0.23445778876722007,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004997588830341545,
|
|
"loss": 6.3132,
|
|
"mean_token_accuracy": 0.1434150867164135,
|
|
"num_tokens": 4955296.0,
|
|
"step": 2655
|
|
},
|
|
{
|
|
"entropy": 6.32787938117981,
|
|
"epoch": 0.2348993288590604,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004997574233122028,
|
|
"loss": 6.2759,
|
|
"mean_token_accuracy": 0.14597226828336715,
|
|
"num_tokens": 4964409.0,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"entropy": 6.3884584426879885,
|
|
"epoch": 0.23534086895090076,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004997559591873809,
|
|
"loss": 6.284,
|
|
"mean_token_accuracy": 0.1481903851032257,
|
|
"num_tokens": 4973449.0,
|
|
"step": 2665
|
|
},
|
|
{
|
|
"entropy": 6.345009279251099,
|
|
"epoch": 0.2357824090427411,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004997544906597178,
|
|
"loss": 6.2779,
|
|
"mean_token_accuracy": 0.1470661997795105,
|
|
"num_tokens": 4983057.0,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"entropy": 6.2613893985748295,
|
|
"epoch": 0.23622394913458142,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004997530177292418,
|
|
"loss": 6.3532,
|
|
"mean_token_accuracy": 0.13861697241663934,
|
|
"num_tokens": 4991950.0,
|
|
"step": 2675
|
|
},
|
|
{
|
|
"entropy": 6.461032247543335,
|
|
"epoch": 0.23666548922642175,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.0004997515403959823,
|
|
"loss": 6.2857,
|
|
"mean_token_accuracy": 0.14603266417980193,
|
|
"num_tokens": 5001042.0,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"entropy": 6.356680679321289,
|
|
"epoch": 0.2371070293182621,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004997500586599677,
|
|
"loss": 6.2198,
|
|
"mean_token_accuracy": 0.15022996366024016,
|
|
"num_tokens": 5009827.0,
|
|
"step": 2685
|
|
},
|
|
{
|
|
"entropy": 6.292784547805786,
|
|
"epoch": 0.23754856941010244,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004997485725212274,
|
|
"loss": 6.2662,
|
|
"mean_token_accuracy": 0.1465997129678726,
|
|
"num_tokens": 5018708.0,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"entropy": 6.334398937225342,
|
|
"epoch": 0.23799010950194277,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004997470819797903,
|
|
"loss": 6.1678,
|
|
"mean_token_accuracy": 0.149826068431139,
|
|
"num_tokens": 5027522.0,
|
|
"step": 2695
|
|
},
|
|
{
|
|
"entropy": 6.312096786499024,
|
|
"epoch": 0.23843164959378313,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997455870356857,
|
|
"loss": 6.2858,
|
|
"mean_token_accuracy": 0.14754335582256317,
|
|
"num_tokens": 5035755.0,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"entropy": 6.38718318939209,
|
|
"epoch": 0.23887318968562346,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004997440876889429,
|
|
"loss": 6.2373,
|
|
"mean_token_accuracy": 0.14902258217334746,
|
|
"num_tokens": 5045289.0,
|
|
"step": 2705
|
|
},
|
|
{
|
|
"entropy": 6.2602025985717775,
|
|
"epoch": 0.2393147297774638,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004997425839395913,
|
|
"loss": 6.2623,
|
|
"mean_token_accuracy": 0.14851141721010208,
|
|
"num_tokens": 5053774.0,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"entropy": 6.4446056365966795,
|
|
"epoch": 0.23975626986930412,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004997410757876602,
|
|
"loss": 6.3368,
|
|
"mean_token_accuracy": 0.13768139705061913,
|
|
"num_tokens": 5062911.0,
|
|
"step": 2715
|
|
},
|
|
{
|
|
"entropy": 6.34313178062439,
|
|
"epoch": 0.24019780996114448,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004997395632331793,
|
|
"loss": 6.1974,
|
|
"mean_token_accuracy": 0.15056394785642624,
|
|
"num_tokens": 5072107.0,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"entropy": 6.232982730865478,
|
|
"epoch": 0.2406393500529848,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004997380462761781,
|
|
"loss": 6.1744,
|
|
"mean_token_accuracy": 0.15013156086206436,
|
|
"num_tokens": 5080588.0,
|
|
"step": 2725
|
|
},
|
|
{
|
|
"entropy": 6.360136985778809,
|
|
"epoch": 0.24108089014482514,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004997365249166864,
|
|
"loss": 6.3571,
|
|
"mean_token_accuracy": 0.1455472856760025,
|
|
"num_tokens": 5090262.0,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"entropy": 6.398047304153442,
|
|
"epoch": 0.2415224302366655,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997349991547342,
|
|
"loss": 6.2776,
|
|
"mean_token_accuracy": 0.15021264627575875,
|
|
"num_tokens": 5099285.0,
|
|
"step": 2735
|
|
},
|
|
{
|
|
"entropy": 6.356108903884888,
|
|
"epoch": 0.24196397032850583,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004997334689903509,
|
|
"loss": 6.3226,
|
|
"mean_token_accuracy": 0.14855852872133254,
|
|
"num_tokens": 5109115.0,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"entropy": 6.388672256469727,
|
|
"epoch": 0.24240551042034617,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004997319344235668,
|
|
"loss": 6.3429,
|
|
"mean_token_accuracy": 0.14180680066347123,
|
|
"num_tokens": 5117977.0,
|
|
"step": 2745
|
|
},
|
|
{
|
|
"entropy": 6.393019914627075,
|
|
"epoch": 0.2428470505121865,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499730395454412,
|
|
"loss": 6.3025,
|
|
"mean_token_accuracy": 0.1451731264591217,
|
|
"num_tokens": 5127457.0,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"entropy": 6.300089502334595,
|
|
"epoch": 0.24328859060402686,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004997288520829166,
|
|
"loss": 6.3466,
|
|
"mean_token_accuracy": 0.14065672382712363,
|
|
"num_tokens": 5137310.0,
|
|
"step": 2755
|
|
},
|
|
{
|
|
"entropy": 6.394161605834961,
|
|
"epoch": 0.2437301306958672,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004997273043091107,
|
|
"loss": 6.2725,
|
|
"mean_token_accuracy": 0.14155926927924156,
|
|
"num_tokens": 5146963.0,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"entropy": 6.31079797744751,
|
|
"epoch": 0.24417167078770752,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004997257521330248,
|
|
"loss": 6.2601,
|
|
"mean_token_accuracy": 0.14022860154509545,
|
|
"num_tokens": 5155521.0,
|
|
"step": 2765
|
|
},
|
|
{
|
|
"entropy": 6.40408935546875,
|
|
"epoch": 0.24461321087954788,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004997241955546892,
|
|
"loss": 6.237,
|
|
"mean_token_accuracy": 0.14580907300114632,
|
|
"num_tokens": 5165182.0,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"entropy": 6.289978647232056,
|
|
"epoch": 0.2450547509713882,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004997226345741343,
|
|
"loss": 6.2649,
|
|
"mean_token_accuracy": 0.13975519686937332,
|
|
"num_tokens": 5175511.0,
|
|
"step": 2775
|
|
},
|
|
{
|
|
"entropy": 6.354159593582153,
|
|
"epoch": 0.24549629106322854,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000499721069191391,
|
|
"loss": 6.2126,
|
|
"mean_token_accuracy": 0.14699607565999032,
|
|
"num_tokens": 5184772.0,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"entropy": 6.288126516342163,
|
|
"epoch": 0.24593783115506887,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004997194994064896,
|
|
"loss": 6.3014,
|
|
"mean_token_accuracy": 0.14003223031759263,
|
|
"num_tokens": 5195136.0,
|
|
"step": 2785
|
|
},
|
|
{
|
|
"entropy": 6.391946315765381,
|
|
"epoch": 0.24637937124690923,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000499717925219461,
|
|
"loss": 6.2279,
|
|
"mean_token_accuracy": 0.15093553364276885,
|
|
"num_tokens": 5203163.0,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"entropy": 6.346267318725586,
|
|
"epoch": 0.24682091133874956,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004997163466303362,
|
|
"loss": 6.3224,
|
|
"mean_token_accuracy": 0.14383373707532882,
|
|
"num_tokens": 5213233.0,
|
|
"step": 2795
|
|
},
|
|
{
|
|
"entropy": 6.329722881317139,
|
|
"epoch": 0.2472624514305899,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.000499714763639146,
|
|
"loss": 6.2346,
|
|
"mean_token_accuracy": 0.14386766105890275,
|
|
"num_tokens": 5222940.0,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"entropy": 6.36411566734314,
|
|
"epoch": 0.24770399152243025,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004997131762459211,
|
|
"loss": 6.2596,
|
|
"mean_token_accuracy": 0.14432956129312516,
|
|
"num_tokens": 5232263.0,
|
|
"step": 2805
|
|
},
|
|
{
|
|
"entropy": 6.369926023483276,
|
|
"epoch": 0.24814553161427058,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004997115844506932,
|
|
"loss": 6.2334,
|
|
"mean_token_accuracy": 0.14295720756053926,
|
|
"num_tokens": 5241536.0,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"entropy": 6.348195028305054,
|
|
"epoch": 0.2485870717061109,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004997099882534929,
|
|
"loss": 6.2732,
|
|
"mean_token_accuracy": 0.14211497604846954,
|
|
"num_tokens": 5250702.0,
|
|
"step": 2815
|
|
},
|
|
{
|
|
"entropy": 6.359058141708374,
|
|
"epoch": 0.24902861179795124,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004997083876543519,
|
|
"loss": 6.2763,
|
|
"mean_token_accuracy": 0.14498503208160402,
|
|
"num_tokens": 5259811.0,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"entropy": 6.397128582000732,
|
|
"epoch": 0.2494701518897916,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004997067826533014,
|
|
"loss": 6.3615,
|
|
"mean_token_accuracy": 0.13723283037543296,
|
|
"num_tokens": 5270518.0,
|
|
"step": 2825
|
|
},
|
|
{
|
|
"entropy": 6.375804328918457,
|
|
"epoch": 0.24991169198163193,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004997051732503726,
|
|
"loss": 6.2458,
|
|
"mean_token_accuracy": 0.14747673273086548,
|
|
"num_tokens": 5279538.0,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"entropy": 6.313772678375244,
|
|
"epoch": 0.25035323207347226,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004997035594455975,
|
|
"loss": 6.2702,
|
|
"mean_token_accuracy": 0.13872402533888817,
|
|
"num_tokens": 5289633.0,
|
|
"step": 2835
|
|
},
|
|
{
|
|
"entropy": 6.370833015441894,
|
|
"epoch": 0.2507947721653126,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004997019412390074,
|
|
"loss": 6.3603,
|
|
"mean_token_accuracy": 0.1444901555776596,
|
|
"num_tokens": 5299148.0,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"entropy": 6.36065092086792,
|
|
"epoch": 0.2512363122571529,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499700318630634,
|
|
"loss": 6.2527,
|
|
"mean_token_accuracy": 0.1457889422774315,
|
|
"num_tokens": 5309090.0,
|
|
"step": 2845
|
|
},
|
|
{
|
|
"entropy": 6.384716939926148,
|
|
"epoch": 0.2516778523489933,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004996986916205092,
|
|
"loss": 6.3297,
|
|
"mean_token_accuracy": 0.14129810705780982,
|
|
"num_tokens": 5318798.0,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"entropy": 6.313976621627807,
|
|
"epoch": 0.25211939244083365,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004996970602086648,
|
|
"loss": 6.1848,
|
|
"mean_token_accuracy": 0.15023760497570038,
|
|
"num_tokens": 5327915.0,
|
|
"step": 2855
|
|
},
|
|
{
|
|
"entropy": 6.251888847351074,
|
|
"epoch": 0.252560932532674,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004996954243951327,
|
|
"loss": 6.2192,
|
|
"mean_token_accuracy": 0.15385923832654952,
|
|
"num_tokens": 5336970.0,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"entropy": 6.299870014190674,
|
|
"epoch": 0.2530024726245143,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004996937841799451,
|
|
"loss": 6.1821,
|
|
"mean_token_accuracy": 0.15226729065179825,
|
|
"num_tokens": 5345167.0,
|
|
"step": 2865
|
|
},
|
|
{
|
|
"entropy": 6.1627833366394045,
|
|
"epoch": 0.25344401271635464,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004996921395631342,
|
|
"loss": 6.1711,
|
|
"mean_token_accuracy": 0.14804726019501685,
|
|
"num_tokens": 5353399.0,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"entropy": 6.380198335647583,
|
|
"epoch": 0.25388555280819497,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000499690490544732,
|
|
"loss": 6.285,
|
|
"mean_token_accuracy": 0.146748573333025,
|
|
"num_tokens": 5363224.0,
|
|
"step": 2875
|
|
},
|
|
{
|
|
"entropy": 6.372404766082764,
|
|
"epoch": 0.2543270929000353,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004996888371247707,
|
|
"loss": 6.2862,
|
|
"mean_token_accuracy": 0.14108646661043167,
|
|
"num_tokens": 5372274.0,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"entropy": 6.302175998687744,
|
|
"epoch": 0.2547686329918757,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499687179303283,
|
|
"loss": 6.2746,
|
|
"mean_token_accuracy": 0.15104661732912064,
|
|
"num_tokens": 5380240.0,
|
|
"step": 2885
|
|
},
|
|
{
|
|
"entropy": 6.269204902648926,
|
|
"epoch": 0.255210173083716,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004996855170803012,
|
|
"loss": 6.138,
|
|
"mean_token_accuracy": 0.15041064321994782,
|
|
"num_tokens": 5389390.0,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"entropy": 6.343308639526367,
|
|
"epoch": 0.25565171317555635,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004996838504558581,
|
|
"loss": 6.2986,
|
|
"mean_token_accuracy": 0.14492825120687486,
|
|
"num_tokens": 5399425.0,
|
|
"step": 2895
|
|
},
|
|
{
|
|
"entropy": 6.379653215408325,
|
|
"epoch": 0.2560932532673967,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000499682179429986,
|
|
"loss": 6.3089,
|
|
"mean_token_accuracy": 0.14139388352632523,
|
|
"num_tokens": 5408717.0,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"entropy": 6.292103576660156,
|
|
"epoch": 0.256534793359237,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004996805040027178,
|
|
"loss": 6.2399,
|
|
"mean_token_accuracy": 0.1403766691684723,
|
|
"num_tokens": 5418475.0,
|
|
"step": 2905
|
|
},
|
|
{
|
|
"entropy": 6.395513296127319,
|
|
"epoch": 0.25697633345107734,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004996788241740863,
|
|
"loss": 6.2884,
|
|
"mean_token_accuracy": 0.143946073949337,
|
|
"num_tokens": 5428403.0,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"entropy": 6.366812467575073,
|
|
"epoch": 0.2574178735429177,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004996771399441243,
|
|
"loss": 6.3188,
|
|
"mean_token_accuracy": 0.14125285297632217,
|
|
"num_tokens": 5437347.0,
|
|
"step": 2915
|
|
},
|
|
{
|
|
"entropy": 6.395107555389404,
|
|
"epoch": 0.25785941363475806,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004996754513128652,
|
|
"loss": 6.2216,
|
|
"mean_token_accuracy": 0.1553879424929619,
|
|
"num_tokens": 5446804.0,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"entropy": 6.245992279052734,
|
|
"epoch": 0.2583009537265984,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004996737582803416,
|
|
"loss": 6.1701,
|
|
"mean_token_accuracy": 0.14774591475725174,
|
|
"num_tokens": 5455888.0,
|
|
"step": 2925
|
|
},
|
|
{
|
|
"entropy": 6.349690961837768,
|
|
"epoch": 0.2587424938184387,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004996720608465868,
|
|
"loss": 6.1785,
|
|
"mean_token_accuracy": 0.14454589933156967,
|
|
"num_tokens": 5463977.0,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"entropy": 6.251695680618286,
|
|
"epoch": 0.25918403391027905,
|
|
"grad_norm": 0.953125,
|
|
"learning_rate": 0.0004996703590116342,
|
|
"loss": 6.2901,
|
|
"mean_token_accuracy": 0.1413638859987259,
|
|
"num_tokens": 5473780.0,
|
|
"step": 2935
|
|
},
|
|
{
|
|
"entropy": 6.343139219284057,
|
|
"epoch": 0.2596255740021194,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004996686527755171,
|
|
"loss": 6.1747,
|
|
"mean_token_accuracy": 0.15054369121789932,
|
|
"num_tokens": 5482151.0,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"entropy": 6.287330961227417,
|
|
"epoch": 0.2600671140939597,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004996669421382687,
|
|
"loss": 6.181,
|
|
"mean_token_accuracy": 0.15408090725541115,
|
|
"num_tokens": 5491103.0,
|
|
"step": 2945
|
|
},
|
|
{
|
|
"entropy": 6.23843822479248,
|
|
"epoch": 0.26050865418580005,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004996652270999228,
|
|
"loss": 6.2051,
|
|
"mean_token_accuracy": 0.1455566719174385,
|
|
"num_tokens": 5500367.0,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"entropy": 6.401996898651123,
|
|
"epoch": 0.26095019427764043,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004996635076605128,
|
|
"loss": 6.2392,
|
|
"mean_token_accuracy": 0.1509515941143036,
|
|
"num_tokens": 5509631.0,
|
|
"step": 2955
|
|
},
|
|
{
|
|
"entropy": 6.3384003162384035,
|
|
"epoch": 0.26139173436948077,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004996617838200725,
|
|
"loss": 6.2572,
|
|
"mean_token_accuracy": 0.14331620335578918,
|
|
"num_tokens": 5518635.0,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"entropy": 6.241027069091797,
|
|
"epoch": 0.2618332744613211,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004996600555786357,
|
|
"loss": 6.2142,
|
|
"mean_token_accuracy": 0.1464727446436882,
|
|
"num_tokens": 5527696.0,
|
|
"step": 2965
|
|
},
|
|
{
|
|
"entropy": 6.348132085800171,
|
|
"epoch": 0.26227481455316143,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004996583229362362,
|
|
"loss": 6.1834,
|
|
"mean_token_accuracy": 0.14780823439359664,
|
|
"num_tokens": 5536632.0,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"entropy": 6.378821849822998,
|
|
"epoch": 0.26271635464500176,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004996565858929078,
|
|
"loss": 6.2627,
|
|
"mean_token_accuracy": 0.14528179541230202,
|
|
"num_tokens": 5545825.0,
|
|
"step": 2975
|
|
},
|
|
{
|
|
"entropy": 6.259585618972778,
|
|
"epoch": 0.2631578947368421,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004996548444486847,
|
|
"loss": 6.1389,
|
|
"mean_token_accuracy": 0.15060140788555146,
|
|
"num_tokens": 5555158.0,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"entropy": 6.116889953613281,
|
|
"epoch": 0.2635994348286824,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004996530986036008,
|
|
"loss": 6.0795,
|
|
"mean_token_accuracy": 0.15272270664572715,
|
|
"num_tokens": 5564218.0,
|
|
"step": 2985
|
|
},
|
|
{
|
|
"entropy": 6.255494451522827,
|
|
"epoch": 0.2640409749205228,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004996513483576907,
|
|
"loss": 6.2219,
|
|
"mean_token_accuracy": 0.14951637461781503,
|
|
"num_tokens": 5572760.0,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"entropy": 6.423755645751953,
|
|
"epoch": 0.26448251501236314,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996495937109884,
|
|
"loss": 6.2825,
|
|
"mean_token_accuracy": 0.14191085398197173,
|
|
"num_tokens": 5581660.0,
|
|
"step": 2995
|
|
},
|
|
{
|
|
"entropy": 6.250067615509034,
|
|
"epoch": 0.26492405510420347,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004996478346635283,
|
|
"loss": 6.1968,
|
|
"mean_token_accuracy": 0.1436440147459507,
|
|
"num_tokens": 5590664.0,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.26492405510420347,
|
|
"eval_entropy": 6.077911184598204,
|
|
"eval_loss": 6.2711029052734375,
|
|
"eval_mean_token_accuracy": 0.15016848111384373,
|
|
"eval_num_tokens": 5590664.0,
|
|
"eval_runtime": 26.2453,
|
|
"eval_samples_per_second": 1345.574,
|
|
"eval_steps_per_second": 168.221,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"entropy": 6.211904573440552,
|
|
"epoch": 0.2653655951960438,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004996460712153448,
|
|
"loss": 6.0603,
|
|
"mean_token_accuracy": 0.1621384307742119,
|
|
"num_tokens": 5598727.0,
|
|
"step": 3005
|
|
},
|
|
{
|
|
"entropy": 6.220605993270874,
|
|
"epoch": 0.26580713528788413,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004996443033664726,
|
|
"loss": 6.2746,
|
|
"mean_token_accuracy": 0.1402523137629032,
|
|
"num_tokens": 5608549.0,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"entropy": 6.372675228118896,
|
|
"epoch": 0.26624867537972446,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004996425311169463,
|
|
"loss": 6.1935,
|
|
"mean_token_accuracy": 0.15231838524341584,
|
|
"num_tokens": 5617628.0,
|
|
"step": 3015
|
|
},
|
|
{
|
|
"entropy": 6.362924861907959,
|
|
"epoch": 0.2666902154715648,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004996407544668005,
|
|
"loss": 6.2649,
|
|
"mean_token_accuracy": 0.14236303716897963,
|
|
"num_tokens": 5627182.0,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"entropy": 6.325074291229248,
|
|
"epoch": 0.2671317555634052,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004996389734160701,
|
|
"loss": 6.293,
|
|
"mean_token_accuracy": 0.14348414838314055,
|
|
"num_tokens": 5636413.0,
|
|
"step": 3025
|
|
},
|
|
{
|
|
"entropy": 6.389656829833984,
|
|
"epoch": 0.2675732956552455,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00049963718796479,
|
|
"loss": 6.2355,
|
|
"mean_token_accuracy": 0.14448917284607887,
|
|
"num_tokens": 5646025.0,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"entropy": 6.272923135757447,
|
|
"epoch": 0.26801483574708584,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004996353981129952,
|
|
"loss": 6.2316,
|
|
"mean_token_accuracy": 0.15121424347162246,
|
|
"num_tokens": 5656577.0,
|
|
"step": 3035
|
|
},
|
|
{
|
|
"entropy": 6.341066169738769,
|
|
"epoch": 0.2684563758389262,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996336038607206,
|
|
"loss": 6.2194,
|
|
"mean_token_accuracy": 0.1483485922217369,
|
|
"num_tokens": 5665163.0,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"entropy": 6.198571634292603,
|
|
"epoch": 0.2688979159307665,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996318052080015,
|
|
"loss": 6.2208,
|
|
"mean_token_accuracy": 0.14988780170679092,
|
|
"num_tokens": 5675196.0,
|
|
"step": 3045
|
|
},
|
|
{
|
|
"entropy": 6.395411729812622,
|
|
"epoch": 0.26933945602260684,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004996300021548731,
|
|
"loss": 6.163,
|
|
"mean_token_accuracy": 0.14734217673540115,
|
|
"num_tokens": 5684361.0,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"entropy": 6.142444658279419,
|
|
"epoch": 0.26978099611444717,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004996281947013707,
|
|
"loss": 6.1584,
|
|
"mean_token_accuracy": 0.1533804029226303,
|
|
"num_tokens": 5694187.0,
|
|
"step": 3055
|
|
},
|
|
{
|
|
"entropy": 6.307489728927612,
|
|
"epoch": 0.27022253620628756,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004996263828475298,
|
|
"loss": 6.2235,
|
|
"mean_token_accuracy": 0.1459271177649498,
|
|
"num_tokens": 5702858.0,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"entropy": 6.287201023101806,
|
|
"epoch": 0.2706640762981279,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004996245665933857,
|
|
"loss": 6.124,
|
|
"mean_token_accuracy": 0.1559446483850479,
|
|
"num_tokens": 5712727.0,
|
|
"step": 3065
|
|
},
|
|
{
|
|
"entropy": 6.197343635559082,
|
|
"epoch": 0.2711056163899682,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004996227459389741,
|
|
"loss": 6.2573,
|
|
"mean_token_accuracy": 0.14863042607903482,
|
|
"num_tokens": 5722904.0,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"entropy": 6.400126695632935,
|
|
"epoch": 0.27154715648180855,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004996209208843307,
|
|
"loss": 6.3578,
|
|
"mean_token_accuracy": 0.14644555673003196,
|
|
"num_tokens": 5732228.0,
|
|
"step": 3075
|
|
},
|
|
{
|
|
"entropy": 6.322457504272461,
|
|
"epoch": 0.2719886965736489,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004996190914294912,
|
|
"loss": 6.2574,
|
|
"mean_token_accuracy": 0.1481010966002941,
|
|
"num_tokens": 5743548.0,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"entropy": 6.271878862380982,
|
|
"epoch": 0.2724302366654892,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004996172575744914,
|
|
"loss": 6.2867,
|
|
"mean_token_accuracy": 0.13876855000853539,
|
|
"num_tokens": 5754112.0,
|
|
"step": 3085
|
|
},
|
|
{
|
|
"entropy": 6.337986612319947,
|
|
"epoch": 0.27287177675732954,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004996154193193673,
|
|
"loss": 6.0993,
|
|
"mean_token_accuracy": 0.15423450618982315,
|
|
"num_tokens": 5763377.0,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"entropy": 6.137074375152588,
|
|
"epoch": 0.27331331684916993,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004996135766641549,
|
|
"loss": 6.0806,
|
|
"mean_token_accuracy": 0.15594624429941178,
|
|
"num_tokens": 5772000.0,
|
|
"step": 3095
|
|
},
|
|
{
|
|
"entropy": 6.32679500579834,
|
|
"epoch": 0.27375485694101026,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004996117296088903,
|
|
"loss": 6.2131,
|
|
"mean_token_accuracy": 0.14269956201314926,
|
|
"num_tokens": 5782192.0,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"entropy": 6.341950845718384,
|
|
"epoch": 0.2741963970328506,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004996098781536095,
|
|
"loss": 6.2614,
|
|
"mean_token_accuracy": 0.1522984981536865,
|
|
"num_tokens": 5791163.0,
|
|
"step": 3105
|
|
},
|
|
{
|
|
"entropy": 6.3308337211608885,
|
|
"epoch": 0.2746379371246909,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004996080222983492,
|
|
"loss": 6.259,
|
|
"mean_token_accuracy": 0.14087159857153891,
|
|
"num_tokens": 5801815.0,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"entropy": 6.410363006591797,
|
|
"epoch": 0.27507947721653125,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004996061620431453,
|
|
"loss": 6.2805,
|
|
"mean_token_accuracy": 0.1418459102511406,
|
|
"num_tokens": 5811545.0,
|
|
"step": 3115
|
|
},
|
|
{
|
|
"entropy": 6.3260101795196535,
|
|
"epoch": 0.2755210173083716,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004996042973880344,
|
|
"loss": 6.2846,
|
|
"mean_token_accuracy": 0.14676327556371688,
|
|
"num_tokens": 5820285.0,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"entropy": 6.260831451416015,
|
|
"epoch": 0.2759625574002119,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004996024283330532,
|
|
"loss": 6.1242,
|
|
"mean_token_accuracy": 0.14824864715337754,
|
|
"num_tokens": 5829284.0,
|
|
"step": 3125
|
|
},
|
|
{
|
|
"entropy": 6.268897342681885,
|
|
"epoch": 0.2764040974920523,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000499600554878238,
|
|
"loss": 6.2951,
|
|
"mean_token_accuracy": 0.1431872047483921,
|
|
"num_tokens": 5838973.0,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"entropy": 6.278724241256714,
|
|
"epoch": 0.27684563758389263,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004995986770236258,
|
|
"loss": 6.1837,
|
|
"mean_token_accuracy": 0.15050409361720085,
|
|
"num_tokens": 5847434.0,
|
|
"step": 3135
|
|
},
|
|
{
|
|
"entropy": 6.304096031188965,
|
|
"epoch": 0.27728717767573297,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004995967947692533,
|
|
"loss": 6.1808,
|
|
"mean_token_accuracy": 0.14390211701393127,
|
|
"num_tokens": 5856464.0,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"entropy": 6.311007452011109,
|
|
"epoch": 0.2777287177675733,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995949081151571,
|
|
"loss": 6.2402,
|
|
"mean_token_accuracy": 0.14435049369931222,
|
|
"num_tokens": 5865892.0,
|
|
"step": 3145
|
|
},
|
|
{
|
|
"entropy": 6.353293609619141,
|
|
"epoch": 0.2781702578594136,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004995930170613746,
|
|
"loss": 6.2075,
|
|
"mean_token_accuracy": 0.1576365649700165,
|
|
"num_tokens": 5874606.0,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"entropy": 6.275713014602661,
|
|
"epoch": 0.27861179795125396,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004995911216079425,
|
|
"loss": 6.2116,
|
|
"mean_token_accuracy": 0.15098029375076294,
|
|
"num_tokens": 5883853.0,
|
|
"step": 3155
|
|
},
|
|
{
|
|
"entropy": 6.34206337928772,
|
|
"epoch": 0.2790533380430943,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004995892217548981,
|
|
"loss": 6.2474,
|
|
"mean_token_accuracy": 0.1448906570672989,
|
|
"num_tokens": 5892841.0,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"entropy": 6.324929285049438,
|
|
"epoch": 0.2794948781349347,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004995873175022786,
|
|
"loss": 6.2295,
|
|
"mean_token_accuracy": 0.15006719902157784,
|
|
"num_tokens": 5901966.0,
|
|
"step": 3165
|
|
},
|
|
{
|
|
"entropy": 6.260264158248901,
|
|
"epoch": 0.279936418226775,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004995854088501213,
|
|
"loss": 6.1281,
|
|
"mean_token_accuracy": 0.14603292495012282,
|
|
"num_tokens": 5910003.0,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"entropy": 6.189469957351685,
|
|
"epoch": 0.28037795831861534,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004995834957984634,
|
|
"loss": 6.2606,
|
|
"mean_token_accuracy": 0.14485765993595123,
|
|
"num_tokens": 5920307.0,
|
|
"step": 3175
|
|
},
|
|
{
|
|
"entropy": 6.290289306640625,
|
|
"epoch": 0.28081949841045567,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004995815783473428,
|
|
"loss": 6.1153,
|
|
"mean_token_accuracy": 0.15536403357982637,
|
|
"num_tokens": 5929875.0,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"entropy": 6.247414398193359,
|
|
"epoch": 0.281261038502296,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004995796564967967,
|
|
"loss": 6.1021,
|
|
"mean_token_accuracy": 0.14264860302209853,
|
|
"num_tokens": 5938570.0,
|
|
"step": 3185
|
|
},
|
|
{
|
|
"entropy": 6.209772968292237,
|
|
"epoch": 0.28170257859413633,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004995777302468628,
|
|
"loss": 6.2353,
|
|
"mean_token_accuracy": 0.14704401940107345,
|
|
"num_tokens": 5947693.0,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"entropy": 6.332046413421631,
|
|
"epoch": 0.28214411868597666,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995757995975789,
|
|
"loss": 6.3286,
|
|
"mean_token_accuracy": 0.14671236276626587,
|
|
"num_tokens": 5957377.0,
|
|
"step": 3195
|
|
},
|
|
{
|
|
"entropy": 6.426817464828491,
|
|
"epoch": 0.28258565877781705,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995738645489828,
|
|
"loss": 6.2285,
|
|
"mean_token_accuracy": 0.14824536591768264,
|
|
"num_tokens": 5966443.0,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"entropy": 6.258016681671142,
|
|
"epoch": 0.2830271988696574,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004995719251011124,
|
|
"loss": 6.1471,
|
|
"mean_token_accuracy": 0.14848615527153014,
|
|
"num_tokens": 5975027.0,
|
|
"step": 3205
|
|
},
|
|
{
|
|
"entropy": 6.288780975341797,
|
|
"epoch": 0.2834687389614977,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995699812540058,
|
|
"loss": 6.2244,
|
|
"mean_token_accuracy": 0.15162927508354188,
|
|
"num_tokens": 5983722.0,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"entropy": 6.282262182235717,
|
|
"epoch": 0.28391027905333804,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.000499568033007701,
|
|
"loss": 6.2324,
|
|
"mean_token_accuracy": 0.1441471680998802,
|
|
"num_tokens": 5993358.0,
|
|
"step": 3215
|
|
},
|
|
{
|
|
"entropy": 6.377847146987915,
|
|
"epoch": 0.2843518191451784,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004995660803622361,
|
|
"loss": 6.161,
|
|
"mean_token_accuracy": 0.1480330415070057,
|
|
"num_tokens": 6002743.0,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"entropy": 6.323432493209839,
|
|
"epoch": 0.2847933592370187,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004995641233176494,
|
|
"loss": 6.3205,
|
|
"mean_token_accuracy": 0.14193187803030013,
|
|
"num_tokens": 6013727.0,
|
|
"step": 3225
|
|
},
|
|
{
|
|
"entropy": 6.404472589492798,
|
|
"epoch": 0.28523489932885904,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004995621618739792,
|
|
"loss": 6.2652,
|
|
"mean_token_accuracy": 0.14536072462797164,
|
|
"num_tokens": 6023397.0,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"entropy": 6.271181678771972,
|
|
"epoch": 0.2856764394206994,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000499560196031264,
|
|
"loss": 6.1638,
|
|
"mean_token_accuracy": 0.15034203678369523,
|
|
"num_tokens": 6032453.0,
|
|
"step": 3235
|
|
},
|
|
{
|
|
"entropy": 6.19854884147644,
|
|
"epoch": 0.28611797951253976,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004995582257895423,
|
|
"loss": 6.1258,
|
|
"mean_token_accuracy": 0.14395371079444885,
|
|
"num_tokens": 6042441.0,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"entropy": 6.274320507049561,
|
|
"epoch": 0.2865595196043801,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004995562511488528,
|
|
"loss": 6.1197,
|
|
"mean_token_accuracy": 0.15483347177505494,
|
|
"num_tokens": 6050881.0,
|
|
"step": 3245
|
|
},
|
|
{
|
|
"entropy": 6.220722341537476,
|
|
"epoch": 0.2870010596962204,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004995542721092337,
|
|
"loss": 6.1872,
|
|
"mean_token_accuracy": 0.15040701180696486,
|
|
"num_tokens": 6060804.0,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"entropy": 6.368199014663697,
|
|
"epoch": 0.28744259978806075,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004995522886707244,
|
|
"loss": 6.2847,
|
|
"mean_token_accuracy": 0.14250023737549783,
|
|
"num_tokens": 6070774.0,
|
|
"step": 3255
|
|
},
|
|
{
|
|
"entropy": 6.330855846405029,
|
|
"epoch": 0.2878841398799011,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004995503008333634,
|
|
"loss": 6.2491,
|
|
"mean_token_accuracy": 0.14379776269197464,
|
|
"num_tokens": 6080158.0,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"entropy": 6.32779221534729,
|
|
"epoch": 0.2883256799717414,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004995483085971897,
|
|
"loss": 6.16,
|
|
"mean_token_accuracy": 0.15112278908491134,
|
|
"num_tokens": 6089183.0,
|
|
"step": 3265
|
|
},
|
|
{
|
|
"entropy": 6.200159549713135,
|
|
"epoch": 0.2887672200635818,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004995463119622424,
|
|
"loss": 6.2524,
|
|
"mean_token_accuracy": 0.14461245387792587,
|
|
"num_tokens": 6098536.0,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"entropy": 6.196738433837891,
|
|
"epoch": 0.28920876015542213,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004995443109285604,
|
|
"loss": 6.0112,
|
|
"mean_token_accuracy": 0.1629092276096344,
|
|
"num_tokens": 6107745.0,
|
|
"step": 3275
|
|
},
|
|
{
|
|
"entropy": 6.289895725250244,
|
|
"epoch": 0.28965030024726246,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004995423054961832,
|
|
"loss": 6.2158,
|
|
"mean_token_accuracy": 0.15729496926069259,
|
|
"num_tokens": 6117512.0,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"entropy": 6.227946424484253,
|
|
"epoch": 0.2900918403391028,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00049954029566515,
|
|
"loss": 6.2178,
|
|
"mean_token_accuracy": 0.15285916179418563,
|
|
"num_tokens": 6126030.0,
|
|
"step": 3285
|
|
},
|
|
{
|
|
"entropy": 6.346371364593506,
|
|
"epoch": 0.2905333804309431,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004995382814355,
|
|
"loss": 6.2625,
|
|
"mean_token_accuracy": 0.14053603783249854,
|
|
"num_tokens": 6134888.0,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"entropy": 6.351688432693481,
|
|
"epoch": 0.29097492052278345,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004995362628072728,
|
|
"loss": 6.2117,
|
|
"mean_token_accuracy": 0.1513692669570446,
|
|
"num_tokens": 6144274.0,
|
|
"step": 3295
|
|
},
|
|
{
|
|
"entropy": 6.218261671066284,
|
|
"epoch": 0.2914164606146238,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004995342397805078,
|
|
"loss": 6.2217,
|
|
"mean_token_accuracy": 0.1503726065158844,
|
|
"num_tokens": 6153406.0,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"entropy": 6.262264680862427,
|
|
"epoch": 0.29185800070646417,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004995322123552448,
|
|
"loss": 6.1233,
|
|
"mean_token_accuracy": 0.1533094823360443,
|
|
"num_tokens": 6162743.0,
|
|
"step": 3305
|
|
},
|
|
{
|
|
"entropy": 6.2753763675689695,
|
|
"epoch": 0.2922995407983045,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004995301805315235,
|
|
"loss": 6.1201,
|
|
"mean_token_accuracy": 0.15415377020835877,
|
|
"num_tokens": 6171997.0,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"entropy": 6.181772375106812,
|
|
"epoch": 0.29274108089014483,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004995281443093837,
|
|
"loss": 6.1422,
|
|
"mean_token_accuracy": 0.15062253326177596,
|
|
"num_tokens": 6181275.0,
|
|
"step": 3315
|
|
},
|
|
{
|
|
"entropy": 6.289641571044922,
|
|
"epoch": 0.29318262098198516,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004995261036888652,
|
|
"loss": 6.2135,
|
|
"mean_token_accuracy": 0.14647497087717057,
|
|
"num_tokens": 6191640.0,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"entropy": 6.357338809967041,
|
|
"epoch": 0.2936241610738255,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004995240586700081,
|
|
"loss": 6.2137,
|
|
"mean_token_accuracy": 0.14479927867650985,
|
|
"num_tokens": 6201508.0,
|
|
"step": 3325
|
|
},
|
|
{
|
|
"entropy": 6.220925617218017,
|
|
"epoch": 0.2940657011656658,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004995220092528522,
|
|
"loss": 6.1079,
|
|
"mean_token_accuracy": 0.15776659697294235,
|
|
"num_tokens": 6209902.0,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"entropy": 6.257762432098389,
|
|
"epoch": 0.29450724125750616,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000499519955437438,
|
|
"loss": 6.2514,
|
|
"mean_token_accuracy": 0.14023935049772263,
|
|
"num_tokens": 6219760.0,
|
|
"step": 3335
|
|
},
|
|
{
|
|
"entropy": 6.248654699325561,
|
|
"epoch": 0.29494878134934654,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004995178972238054,
|
|
"loss": 6.2308,
|
|
"mean_token_accuracy": 0.14278148710727692,
|
|
"num_tokens": 6228721.0,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"entropy": 6.228511571884155,
|
|
"epoch": 0.2953903214411869,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000499515834611995,
|
|
"loss": 6.0645,
|
|
"mean_token_accuracy": 0.16055997535586358,
|
|
"num_tokens": 6237070.0,
|
|
"step": 3345
|
|
},
|
|
{
|
|
"entropy": 6.2931300640106205,
|
|
"epoch": 0.2958318615330272,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004995137676020472,
|
|
"loss": 6.2051,
|
|
"mean_token_accuracy": 0.1466339647769928,
|
|
"num_tokens": 6245659.0,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"entropy": 6.231089639663696,
|
|
"epoch": 0.29627340162486754,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004995116961940023,
|
|
"loss": 6.1736,
|
|
"mean_token_accuracy": 0.15321153849363328,
|
|
"num_tokens": 6255175.0,
|
|
"step": 3355
|
|
},
|
|
{
|
|
"entropy": 6.241381883621216,
|
|
"epoch": 0.29671494171670787,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004995096203879009,
|
|
"loss": 6.1761,
|
|
"mean_token_accuracy": 0.1443149983882904,
|
|
"num_tokens": 6264962.0,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"entropy": 6.29106593132019,
|
|
"epoch": 0.2971564818085482,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004995075401837837,
|
|
"loss": 6.0989,
|
|
"mean_token_accuracy": 0.15684471875429154,
|
|
"num_tokens": 6273411.0,
|
|
"step": 3365
|
|
},
|
|
{
|
|
"entropy": 6.135526657104492,
|
|
"epoch": 0.29759802190038853,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004995054555816915,
|
|
"loss": 6.0795,
|
|
"mean_token_accuracy": 0.15481019616127015,
|
|
"num_tokens": 6282618.0,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"entropy": 6.155969190597534,
|
|
"epoch": 0.2980395619922289,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004995033665816651,
|
|
"loss": 6.0726,
|
|
"mean_token_accuracy": 0.15434197783470155,
|
|
"num_tokens": 6292008.0,
|
|
"step": 3375
|
|
},
|
|
{
|
|
"entropy": 6.147941923141479,
|
|
"epoch": 0.29848110208406925,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004995012731837454,
|
|
"loss": 6.0441,
|
|
"mean_token_accuracy": 0.15588683634996414,
|
|
"num_tokens": 6301079.0,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"entropy": 6.323877429962158,
|
|
"epoch": 0.2989226421759096,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004994991753879736,
|
|
"loss": 6.2273,
|
|
"mean_token_accuracy": 0.14481185078620912,
|
|
"num_tokens": 6310543.0,
|
|
"step": 3385
|
|
},
|
|
{
|
|
"entropy": 6.276103544235229,
|
|
"epoch": 0.2993641822677499,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004994970731943904,
|
|
"loss": 6.1692,
|
|
"mean_token_accuracy": 0.1476268857717514,
|
|
"num_tokens": 6320037.0,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"entropy": 6.177714109420776,
|
|
"epoch": 0.29980572235959024,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004994949666030374,
|
|
"loss": 6.1795,
|
|
"mean_token_accuracy": 0.15479264855384828,
|
|
"num_tokens": 6328948.0,
|
|
"step": 3395
|
|
},
|
|
{
|
|
"entropy": 6.212888717651367,
|
|
"epoch": 0.3002472624514306,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004994928556139557,
|
|
"loss": 6.0425,
|
|
"mean_token_accuracy": 0.1581359773874283,
|
|
"num_tokens": 6337449.0,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"entropy": 6.162185001373291,
|
|
"epoch": 0.3006888025432709,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004994907402271865,
|
|
"loss": 6.1028,
|
|
"mean_token_accuracy": 0.15499115511775016,
|
|
"num_tokens": 6346130.0,
|
|
"step": 3405
|
|
},
|
|
{
|
|
"entropy": 6.15036244392395,
|
|
"epoch": 0.3011303426351113,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004994886204427715,
|
|
"loss": 6.1344,
|
|
"mean_token_accuracy": 0.15851569175720215,
|
|
"num_tokens": 6354973.0,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"entropy": 6.328536748886108,
|
|
"epoch": 0.3015718827269516,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004994864962607519,
|
|
"loss": 6.2148,
|
|
"mean_token_accuracy": 0.14824963808059693,
|
|
"num_tokens": 6364107.0,
|
|
"step": 3415
|
|
},
|
|
{
|
|
"entropy": 6.26984658241272,
|
|
"epoch": 0.30201342281879195,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004994843676811697,
|
|
"loss": 6.1438,
|
|
"mean_token_accuracy": 0.14578014612197876,
|
|
"num_tokens": 6372859.0,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"entropy": 6.184666395187378,
|
|
"epoch": 0.3024549629106323,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004994822347040664,
|
|
"loss": 6.0282,
|
|
"mean_token_accuracy": 0.1636571153998375,
|
|
"num_tokens": 6381818.0,
|
|
"step": 3425
|
|
},
|
|
{
|
|
"entropy": 6.257387351989746,
|
|
"epoch": 0.3028965030024726,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004994800973294837,
|
|
"loss": 6.1797,
|
|
"mean_token_accuracy": 0.1482721135020256,
|
|
"num_tokens": 6391460.0,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"entropy": 6.25637993812561,
|
|
"epoch": 0.30333804309431295,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004994779555574636,
|
|
"loss": 6.2086,
|
|
"mean_token_accuracy": 0.15168215036392213,
|
|
"num_tokens": 6401461.0,
|
|
"step": 3435
|
|
},
|
|
{
|
|
"entropy": 6.262447023391724,
|
|
"epoch": 0.3037795831861533,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000499475809388048,
|
|
"loss": 6.0842,
|
|
"mean_token_accuracy": 0.14643362984061242,
|
|
"num_tokens": 6410435.0,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"entropy": 6.279139566421509,
|
|
"epoch": 0.30422112327799367,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000499473658821279,
|
|
"loss": 6.2844,
|
|
"mean_token_accuracy": 0.14141586795449257,
|
|
"num_tokens": 6420710.0,
|
|
"step": 3445
|
|
},
|
|
{
|
|
"entropy": 6.305064296722412,
|
|
"epoch": 0.304662663369834,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004994715038571986,
|
|
"loss": 6.1276,
|
|
"mean_token_accuracy": 0.149975299090147,
|
|
"num_tokens": 6429882.0,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"entropy": 6.167029666900635,
|
|
"epoch": 0.30510420346167433,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004994693444958493,
|
|
"loss": 6.1142,
|
|
"mean_token_accuracy": 0.15782218649983407,
|
|
"num_tokens": 6439183.0,
|
|
"step": 3455
|
|
},
|
|
{
|
|
"entropy": 6.307245588302612,
|
|
"epoch": 0.30554574355351466,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.000499467180737273,
|
|
"loss": 6.288,
|
|
"mean_token_accuracy": 0.14309904649853705,
|
|
"num_tokens": 6448460.0,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"entropy": 6.270366525650024,
|
|
"epoch": 0.305987283645355,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004994650125815124,
|
|
"loss": 6.1656,
|
|
"mean_token_accuracy": 0.14977657794952393,
|
|
"num_tokens": 6457687.0,
|
|
"step": 3465
|
|
},
|
|
{
|
|
"entropy": 6.241862058639526,
|
|
"epoch": 0.3064288237371953,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004994628400286097,
|
|
"loss": 6.1063,
|
|
"mean_token_accuracy": 0.15189075246453285,
|
|
"num_tokens": 6467057.0,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"entropy": 6.209890508651734,
|
|
"epoch": 0.30687036382903565,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004994606630786078,
|
|
"loss": 6.1745,
|
|
"mean_token_accuracy": 0.15533267706632614,
|
|
"num_tokens": 6477484.0,
|
|
"step": 3475
|
|
},
|
|
{
|
|
"entropy": 6.248384809494018,
|
|
"epoch": 0.30731190392087604,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004994584817315492,
|
|
"loss": 6.233,
|
|
"mean_token_accuracy": 0.14967372938990592,
|
|
"num_tokens": 6488381.0,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"entropy": 6.30739917755127,
|
|
"epoch": 0.30775344401271637,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004994562959874765,
|
|
"loss": 6.1277,
|
|
"mean_token_accuracy": 0.1523176297545433,
|
|
"num_tokens": 6498279.0,
|
|
"step": 3485
|
|
},
|
|
{
|
|
"entropy": 6.166545104980469,
|
|
"epoch": 0.3081949841045567,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004994541058464326,
|
|
"loss": 6.1476,
|
|
"mean_token_accuracy": 0.14837286472320557,
|
|
"num_tokens": 6508008.0,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"entropy": 6.321815013885498,
|
|
"epoch": 0.30863652419639703,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004994519113084605,
|
|
"loss": 6.1947,
|
|
"mean_token_accuracy": 0.1496584579348564,
|
|
"num_tokens": 6517263.0,
|
|
"step": 3495
|
|
},
|
|
{
|
|
"entropy": 6.262754631042481,
|
|
"epoch": 0.30907806428823736,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004994497123736029,
|
|
"loss": 6.2631,
|
|
"mean_token_accuracy": 0.1426304429769516,
|
|
"num_tokens": 6527682.0,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"entropy": 6.267001819610596,
|
|
"epoch": 0.3095196043800777,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004994475090419034,
|
|
"loss": 6.1298,
|
|
"mean_token_accuracy": 0.15242742300033568,
|
|
"num_tokens": 6537143.0,
|
|
"step": 3505
|
|
},
|
|
{
|
|
"entropy": 6.250511837005615,
|
|
"epoch": 0.309961144471918,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004994453013134047,
|
|
"loss": 6.1352,
|
|
"mean_token_accuracy": 0.14925057888031007,
|
|
"num_tokens": 6546561.0,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"entropy": 6.185484886169434,
|
|
"epoch": 0.3104026845637584,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004994430891881502,
|
|
"loss": 6.124,
|
|
"mean_token_accuracy": 0.15090959072113036,
|
|
"num_tokens": 6555806.0,
|
|
"step": 3515
|
|
},
|
|
{
|
|
"entropy": 6.202518653869629,
|
|
"epoch": 0.31084422465559874,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004994408726661832,
|
|
"loss": 6.166,
|
|
"mean_token_accuracy": 0.1501057654619217,
|
|
"num_tokens": 6564836.0,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"entropy": 6.245743179321289,
|
|
"epoch": 0.3112857647474391,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004994386517475472,
|
|
"loss": 6.1713,
|
|
"mean_token_accuracy": 0.1496300369501114,
|
|
"num_tokens": 6574997.0,
|
|
"step": 3525
|
|
},
|
|
{
|
|
"entropy": 6.246868562698364,
|
|
"epoch": 0.3117273048392794,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004994364264322856,
|
|
"loss": 6.1715,
|
|
"mean_token_accuracy": 0.14895060658454895,
|
|
"num_tokens": 6584589.0,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"entropy": 6.162112808227539,
|
|
"epoch": 0.31216884493111974,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004994341967204421,
|
|
"loss": 6.1199,
|
|
"mean_token_accuracy": 0.15284974724054337,
|
|
"num_tokens": 6592898.0,
|
|
"step": 3535
|
|
},
|
|
{
|
|
"entropy": 6.326304626464844,
|
|
"epoch": 0.31261038502296007,
|
|
"grad_norm": 0.984375,
|
|
"learning_rate": 0.0004994319626120603,
|
|
"loss": 6.1268,
|
|
"mean_token_accuracy": 0.16072259843349457,
|
|
"num_tokens": 6602454.0,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"entropy": 6.1308966159820555,
|
|
"epoch": 0.3130519251148004,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004994297241071841,
|
|
"loss": 6.1467,
|
|
"mean_token_accuracy": 0.1551404133439064,
|
|
"num_tokens": 6612454.0,
|
|
"step": 3545
|
|
},
|
|
{
|
|
"entropy": 6.29251446723938,
|
|
"epoch": 0.3134934652066408,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000499427481205857,
|
|
"loss": 6.1423,
|
|
"mean_token_accuracy": 0.15539032369852065,
|
|
"num_tokens": 6621286.0,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"entropy": 6.253758335113526,
|
|
"epoch": 0.3139350052984811,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004994252339081234,
|
|
"loss": 6.0651,
|
|
"mean_token_accuracy": 0.15716515630483627,
|
|
"num_tokens": 6629684.0,
|
|
"step": 3555
|
|
},
|
|
{
|
|
"entropy": 6.167656517028808,
|
|
"epoch": 0.31437654539032145,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000499422982214027,
|
|
"loss": 6.1775,
|
|
"mean_token_accuracy": 0.15429823398590087,
|
|
"num_tokens": 6638836.0,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"entropy": 6.254745101928711,
|
|
"epoch": 0.3148180854821618,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004994207261236121,
|
|
"loss": 6.1189,
|
|
"mean_token_accuracy": 0.14906142503023148,
|
|
"num_tokens": 6647337.0,
|
|
"step": 3565
|
|
},
|
|
{
|
|
"entropy": 6.145832633972168,
|
|
"epoch": 0.3152596255740021,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004994184656369227,
|
|
"loss": 6.1026,
|
|
"mean_token_accuracy": 0.145336801558733,
|
|
"num_tokens": 6657246.0,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"entropy": 6.310924911499024,
|
|
"epoch": 0.31570116566584244,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004994162007540033,
|
|
"loss": 6.2505,
|
|
"mean_token_accuracy": 0.14196238815784454,
|
|
"num_tokens": 6667981.0,
|
|
"step": 3575
|
|
},
|
|
{
|
|
"entropy": 6.2675220489501955,
|
|
"epoch": 0.3161427057576828,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004994139314748981,
|
|
"loss": 6.0974,
|
|
"mean_token_accuracy": 0.15776362121105195,
|
|
"num_tokens": 6677462.0,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"entropy": 6.107239675521851,
|
|
"epoch": 0.31658424584952316,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004994116577996517,
|
|
"loss": 6.1152,
|
|
"mean_token_accuracy": 0.15776521414518357,
|
|
"num_tokens": 6687225.0,
|
|
"step": 3585
|
|
},
|
|
{
|
|
"entropy": 6.2434509754180905,
|
|
"epoch": 0.3170257859413635,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004994093797283084,
|
|
"loss": 6.2122,
|
|
"mean_token_accuracy": 0.1487804666161537,
|
|
"num_tokens": 6696345.0,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"entropy": 6.313143348693847,
|
|
"epoch": 0.3174673260332038,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004994070972609132,
|
|
"loss": 6.1867,
|
|
"mean_token_accuracy": 0.14744829311966895,
|
|
"num_tokens": 6706207.0,
|
|
"step": 3595
|
|
},
|
|
{
|
|
"entropy": 6.24591326713562,
|
|
"epoch": 0.31790886612504415,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004994048103975103,
|
|
"loss": 6.0904,
|
|
"mean_token_accuracy": 0.15303746610879898,
|
|
"num_tokens": 6714430.0,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"entropy": 6.204165840148926,
|
|
"epoch": 0.3183504062168845,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000499402519138145,
|
|
"loss": 6.1197,
|
|
"mean_token_accuracy": 0.1557806834578514,
|
|
"num_tokens": 6724139.0,
|
|
"step": 3605
|
|
},
|
|
{
|
|
"entropy": 6.151504611968994,
|
|
"epoch": 0.3187919463087248,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004994002234828619,
|
|
"loss": 6.1566,
|
|
"mean_token_accuracy": 0.1508852459490299,
|
|
"num_tokens": 6733525.0,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"entropy": 6.321462392807007,
|
|
"epoch": 0.31923348640056515,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000499397923431706,
|
|
"loss": 6.2666,
|
|
"mean_token_accuracy": 0.1465996690094471,
|
|
"num_tokens": 6743738.0,
|
|
"step": 3615
|
|
},
|
|
{
|
|
"entropy": 6.337309741973877,
|
|
"epoch": 0.31967502649240553,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004993956189847226,
|
|
"loss": 6.1012,
|
|
"mean_token_accuracy": 0.15459158420562744,
|
|
"num_tokens": 6752900.0,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"entropy": 6.150611686706543,
|
|
"epoch": 0.32011656658424587,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004993933101419565,
|
|
"loss": 6.106,
|
|
"mean_token_accuracy": 0.1578878253698349,
|
|
"num_tokens": 6761776.0,
|
|
"step": 3625
|
|
},
|
|
{
|
|
"entropy": 6.23021969795227,
|
|
"epoch": 0.3205581066760862,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004993909969034531,
|
|
"loss": 6.2486,
|
|
"mean_token_accuracy": 0.1432959534227848,
|
|
"num_tokens": 6771543.0,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"entropy": 6.232846355438232,
|
|
"epoch": 0.3209996467679265,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004993886792692576,
|
|
"loss": 6.1261,
|
|
"mean_token_accuracy": 0.15022226572036743,
|
|
"num_tokens": 6780010.0,
|
|
"step": 3635
|
|
},
|
|
{
|
|
"entropy": 6.273227500915527,
|
|
"epoch": 0.32144118685976686,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004993863572394156,
|
|
"loss": 6.186,
|
|
"mean_token_accuracy": 0.14826097190380097,
|
|
"num_tokens": 6788830.0,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"entropy": 6.266057395935059,
|
|
"epoch": 0.3218827269516072,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004993840308139724,
|
|
"loss": 6.163,
|
|
"mean_token_accuracy": 0.15327871441841126,
|
|
"num_tokens": 6799121.0,
|
|
"step": 3645
|
|
},
|
|
{
|
|
"entropy": 6.232186365127563,
|
|
"epoch": 0.3223242670434475,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004993816999929738,
|
|
"loss": 6.1253,
|
|
"mean_token_accuracy": 0.15162717401981354,
|
|
"num_tokens": 6808519.0,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"entropy": 6.129374217987061,
|
|
"epoch": 0.3227658071352879,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004993793647764651,
|
|
"loss": 6.1462,
|
|
"mean_token_accuracy": 0.16090914756059646,
|
|
"num_tokens": 6817583.0,
|
|
"step": 3655
|
|
},
|
|
{
|
|
"entropy": 6.254270458221436,
|
|
"epoch": 0.32320734722712824,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004993770251644923,
|
|
"loss": 6.1315,
|
|
"mean_token_accuracy": 0.15279360860586166,
|
|
"num_tokens": 6827732.0,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"entropy": 6.188901424407959,
|
|
"epoch": 0.32364888731896857,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004993746811571013,
|
|
"loss": 6.1101,
|
|
"mean_token_accuracy": 0.15757904648780824,
|
|
"num_tokens": 6836903.0,
|
|
"step": 3665
|
|
},
|
|
{
|
|
"entropy": 6.073100471496582,
|
|
"epoch": 0.3240904274108089,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004993723327543379,
|
|
"loss": 6.119,
|
|
"mean_token_accuracy": 0.15876784324645996,
|
|
"num_tokens": 6845684.0,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"entropy": 6.200423860549927,
|
|
"epoch": 0.32453196750264923,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000499369979956248,
|
|
"loss": 6.1035,
|
|
"mean_token_accuracy": 0.14955462887883186,
|
|
"num_tokens": 6854650.0,
|
|
"step": 3675
|
|
},
|
|
{
|
|
"entropy": 6.267183446884156,
|
|
"epoch": 0.32497350759448956,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004993676227628779,
|
|
"loss": 6.0977,
|
|
"mean_token_accuracy": 0.1460177183151245,
|
|
"num_tokens": 6864340.0,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"entropy": 6.208888530731201,
|
|
"epoch": 0.3254150476863299,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004993652611742736,
|
|
"loss": 6.1343,
|
|
"mean_token_accuracy": 0.15321332961320877,
|
|
"num_tokens": 6873244.0,
|
|
"step": 3685
|
|
},
|
|
{
|
|
"entropy": 6.15152006149292,
|
|
"epoch": 0.3258565877781703,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004993628951904815,
|
|
"loss": 6.0176,
|
|
"mean_token_accuracy": 0.15752123296260834,
|
|
"num_tokens": 6882392.0,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"entropy": 6.2300208568572994,
|
|
"epoch": 0.3262981278700106,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004993605248115479,
|
|
"loss": 6.2446,
|
|
"mean_token_accuracy": 0.1435020685195923,
|
|
"num_tokens": 6891515.0,
|
|
"step": 3695
|
|
},
|
|
{
|
|
"entropy": 6.330176734924317,
|
|
"epoch": 0.32673966796185094,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004993581500375191,
|
|
"loss": 6.1177,
|
|
"mean_token_accuracy": 0.14757276177406312,
|
|
"num_tokens": 6900104.0,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"entropy": 6.2001354694366455,
|
|
"epoch": 0.3271812080536913,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004993557708684417,
|
|
"loss": 6.2227,
|
|
"mean_token_accuracy": 0.14519331306219102,
|
|
"num_tokens": 6910443.0,
|
|
"step": 3705
|
|
},
|
|
{
|
|
"entropy": 6.208211469650268,
|
|
"epoch": 0.3276227481455316,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004993533873043625,
|
|
"loss": 6.1062,
|
|
"mean_token_accuracy": 0.15348025262355805,
|
|
"num_tokens": 6919882.0,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"entropy": 6.281739521026611,
|
|
"epoch": 0.32806428823737194,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000499350999345328,
|
|
"loss": 6.2142,
|
|
"mean_token_accuracy": 0.14149869233369827,
|
|
"num_tokens": 6929757.0,
|
|
"step": 3715
|
|
},
|
|
{
|
|
"entropy": 6.222444868087768,
|
|
"epoch": 0.32850582832921227,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000499348606991385,
|
|
"loss": 6.201,
|
|
"mean_token_accuracy": 0.1466899633407593,
|
|
"num_tokens": 6938578.0,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"entropy": 6.181999588012696,
|
|
"epoch": 0.32894736842105265,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004993462102425805,
|
|
"loss": 6.0957,
|
|
"mean_token_accuracy": 0.14744215086102486,
|
|
"num_tokens": 6947920.0,
|
|
"step": 3725
|
|
},
|
|
{
|
|
"entropy": 6.170785808563233,
|
|
"epoch": 0.329388908512893,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004993438090989612,
|
|
"loss": 6.0325,
|
|
"mean_token_accuracy": 0.15373560339212416,
|
|
"num_tokens": 6957291.0,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"entropy": 6.204143905639649,
|
|
"epoch": 0.3298304486047333,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004993414035605743,
|
|
"loss": 6.0944,
|
|
"mean_token_accuracy": 0.16135938167572023,
|
|
"num_tokens": 6966504.0,
|
|
"step": 3735
|
|
},
|
|
{
|
|
"entropy": 6.26701283454895,
|
|
"epoch": 0.33027198869657365,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004993389936274669,
|
|
"loss": 6.1381,
|
|
"mean_token_accuracy": 0.15607366263866423,
|
|
"num_tokens": 6976234.0,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"entropy": 6.179308748245239,
|
|
"epoch": 0.330713528788414,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004993365792996862,
|
|
"loss": 6.1882,
|
|
"mean_token_accuracy": 0.1475161299109459,
|
|
"num_tokens": 6986372.0,
|
|
"step": 3745
|
|
},
|
|
{
|
|
"entropy": 6.258656454086304,
|
|
"epoch": 0.3311550688802543,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004993341605772795,
|
|
"loss": 6.0768,
|
|
"mean_token_accuracy": 0.155860435962677,
|
|
"num_tokens": 6995553.0,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"entropy": 6.148722696304321,
|
|
"epoch": 0.33159660897209464,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004993317374602941,
|
|
"loss": 5.9902,
|
|
"mean_token_accuracy": 0.16303742080926895,
|
|
"num_tokens": 7005675.0,
|
|
"step": 3755
|
|
},
|
|
{
|
|
"entropy": 6.081928777694702,
|
|
"epoch": 0.33203814906393503,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004993293099487777,
|
|
"loss": 6.1198,
|
|
"mean_token_accuracy": 0.14885973036289216,
|
|
"num_tokens": 7014953.0,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"entropy": 6.263726997375488,
|
|
"epoch": 0.33247968915577536,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004993268780427776,
|
|
"loss": 6.1445,
|
|
"mean_token_accuracy": 0.1564795732498169,
|
|
"num_tokens": 7025075.0,
|
|
"step": 3765
|
|
},
|
|
{
|
|
"entropy": 6.201370096206665,
|
|
"epoch": 0.3329212292476157,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004993244417423416,
|
|
"loss": 6.1224,
|
|
"mean_token_accuracy": 0.15349680185317993,
|
|
"num_tokens": 7034286.0,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"entropy": 6.201808977127075,
|
|
"epoch": 0.333362769339456,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004993220010475174,
|
|
"loss": 6.1767,
|
|
"mean_token_accuracy": 0.14727241545915604,
|
|
"num_tokens": 7043957.0,
|
|
"step": 3775
|
|
},
|
|
{
|
|
"entropy": 6.2423442840576175,
|
|
"epoch": 0.33380430943129635,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004993195559583526,
|
|
"loss": 6.091,
|
|
"mean_token_accuracy": 0.15680659636855127,
|
|
"num_tokens": 7053170.0,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"entropy": 6.163934850692749,
|
|
"epoch": 0.3342458495231367,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004993171064748954,
|
|
"loss": 6.0909,
|
|
"mean_token_accuracy": 0.15337430387735368,
|
|
"num_tokens": 7062469.0,
|
|
"step": 3785
|
|
},
|
|
{
|
|
"entropy": 6.245288276672364,
|
|
"epoch": 0.334687389614977,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004993146525971937,
|
|
"loss": 6.0751,
|
|
"mean_token_accuracy": 0.15703266561031343,
|
|
"num_tokens": 7071463.0,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"entropy": 6.30460000038147,
|
|
"epoch": 0.3351289297068174,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004993121943252955,
|
|
"loss": 6.1685,
|
|
"mean_token_accuracy": 0.1484996944665909,
|
|
"num_tokens": 7081574.0,
|
|
"step": 3795
|
|
},
|
|
{
|
|
"entropy": 6.146984243392945,
|
|
"epoch": 0.33557046979865773,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004993097316592489,
|
|
"loss": 6.0826,
|
|
"mean_token_accuracy": 0.15749146193265914,
|
|
"num_tokens": 7090835.0,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"entropy": 6.072944211959839,
|
|
"epoch": 0.33601200989049806,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004993072645991023,
|
|
"loss": 6.0092,
|
|
"mean_token_accuracy": 0.14467609971761702,
|
|
"num_tokens": 7100521.0,
|
|
"step": 3805
|
|
},
|
|
{
|
|
"entropy": 6.172339153289795,
|
|
"epoch": 0.3364535499823384,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000499304793144904,
|
|
"loss": 6.108,
|
|
"mean_token_accuracy": 0.15104963332414628,
|
|
"num_tokens": 7109011.0,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"entropy": 6.258471536636352,
|
|
"epoch": 0.3368950900741787,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004993023172967022,
|
|
"loss": 6.1451,
|
|
"mean_token_accuracy": 0.14512295573949813,
|
|
"num_tokens": 7119128.0,
|
|
"step": 3815
|
|
},
|
|
{
|
|
"entropy": 6.266326761245727,
|
|
"epoch": 0.33733663016601906,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004992998370545458,
|
|
"loss": 6.2073,
|
|
"mean_token_accuracy": 0.14363647550344466,
|
|
"num_tokens": 7128313.0,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"entropy": 6.234633827209473,
|
|
"epoch": 0.3377781702578594,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004992973524184831,
|
|
"loss": 6.1686,
|
|
"mean_token_accuracy": 0.14797609224915503,
|
|
"num_tokens": 7137567.0,
|
|
"step": 3825
|
|
},
|
|
{
|
|
"entropy": 6.209169197082519,
|
|
"epoch": 0.3382197103496998,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004992948633885627,
|
|
"loss": 6.1355,
|
|
"mean_token_accuracy": 0.15170362889766692,
|
|
"num_tokens": 7147254.0,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"entropy": 6.123815822601318,
|
|
"epoch": 0.3386612504415401,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004992923699648335,
|
|
"loss": 6.055,
|
|
"mean_token_accuracy": 0.15891509354114533,
|
|
"num_tokens": 7156215.0,
|
|
"step": 3835
|
|
},
|
|
{
|
|
"entropy": 6.286255836486816,
|
|
"epoch": 0.33910279053338044,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004992898721473445,
|
|
"loss": 6.1341,
|
|
"mean_token_accuracy": 0.14742862731218337,
|
|
"num_tokens": 7165653.0,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"entropy": 6.107093811035156,
|
|
"epoch": 0.33954433062522077,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004992873699361444,
|
|
"loss": 6.0116,
|
|
"mean_token_accuracy": 0.15364348739385605,
|
|
"num_tokens": 7174797.0,
|
|
"step": 3845
|
|
},
|
|
{
|
|
"entropy": 6.231772947311401,
|
|
"epoch": 0.3399858707170611,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004992848633312822,
|
|
"loss": 6.076,
|
|
"mean_token_accuracy": 0.15550567209720612,
|
|
"num_tokens": 7184040.0,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"entropy": 6.206028461456299,
|
|
"epoch": 0.34042741080890143,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004992823523328071,
|
|
"loss": 6.0162,
|
|
"mean_token_accuracy": 0.156949782371521,
|
|
"num_tokens": 7193533.0,
|
|
"step": 3855
|
|
},
|
|
{
|
|
"entropy": 6.040725946426392,
|
|
"epoch": 0.34086895090074176,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004992798369407684,
|
|
"loss": 6.1271,
|
|
"mean_token_accuracy": 0.1540011927485466,
|
|
"num_tokens": 7203355.0,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"entropy": 6.231026697158813,
|
|
"epoch": 0.34131049099258215,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004992773171552152,
|
|
"loss": 6.0662,
|
|
"mean_token_accuracy": 0.15278246700763704,
|
|
"num_tokens": 7212569.0,
|
|
"step": 3865
|
|
},
|
|
{
|
|
"entropy": 6.1147346019744875,
|
|
"epoch": 0.3417520310844225,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004992747929761968,
|
|
"loss": 6.0862,
|
|
"mean_token_accuracy": 0.1568584769964218,
|
|
"num_tokens": 7221886.0,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"entropy": 6.20992078781128,
|
|
"epoch": 0.3421935711762628,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004992722644037628,
|
|
"loss": 6.1399,
|
|
"mean_token_accuracy": 0.14878576919436454,
|
|
"num_tokens": 7232065.0,
|
|
"step": 3875
|
|
},
|
|
{
|
|
"entropy": 6.2117961883544925,
|
|
"epoch": 0.34263511126810314,
|
|
"grad_norm": 0.98828125,
|
|
"learning_rate": 0.0004992697314379628,
|
|
"loss": 6.0802,
|
|
"mean_token_accuracy": 0.1546872690320015,
|
|
"num_tokens": 7242287.0,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"entropy": 6.163637351989746,
|
|
"epoch": 0.3430766513599435,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004992671940788462,
|
|
"loss": 6.0744,
|
|
"mean_token_accuracy": 0.15107770562171935,
|
|
"num_tokens": 7250638.0,
|
|
"step": 3885
|
|
},
|
|
{
|
|
"entropy": 6.146958065032959,
|
|
"epoch": 0.3435181914517838,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004992646523264628,
|
|
"loss": 6.0492,
|
|
"mean_token_accuracy": 0.1611901268362999,
|
|
"num_tokens": 7259526.0,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"entropy": 6.119075059890747,
|
|
"epoch": 0.34395973154362414,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004992621061808625,
|
|
"loss": 6.0604,
|
|
"mean_token_accuracy": 0.15892861932516097,
|
|
"num_tokens": 7269260.0,
|
|
"step": 3895
|
|
},
|
|
{
|
|
"entropy": 6.161959505081176,
|
|
"epoch": 0.3444012716354645,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000499259555642095,
|
|
"loss": 6.0786,
|
|
"mean_token_accuracy": 0.15186367481946944,
|
|
"num_tokens": 7279061.0,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"entropy": 6.21967043876648,
|
|
"epoch": 0.34484281172730485,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004992570007102104,
|
|
"loss": 6.0831,
|
|
"mean_token_accuracy": 0.15683933049440385,
|
|
"num_tokens": 7289721.0,
|
|
"step": 3905
|
|
},
|
|
{
|
|
"entropy": 6.214530658721924,
|
|
"epoch": 0.3452843518191452,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004992544413852587,
|
|
"loss": 6.2136,
|
|
"mean_token_accuracy": 0.14554179906845094,
|
|
"num_tokens": 7299836.0,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"entropy": 6.225806951522827,
|
|
"epoch": 0.3457258919109855,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00049925187766729,
|
|
"loss": 6.2249,
|
|
"mean_token_accuracy": 0.14765020608901977,
|
|
"num_tokens": 7309306.0,
|
|
"step": 3915
|
|
},
|
|
{
|
|
"entropy": 6.249141550064087,
|
|
"epoch": 0.34616743200282585,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004992493095563545,
|
|
"loss": 6.1238,
|
|
"mean_token_accuracy": 0.16257761269807816,
|
|
"num_tokens": 7318486.0,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"entropy": 6.111485767364502,
|
|
"epoch": 0.3466089720946662,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004992467370525026,
|
|
"loss": 6.083,
|
|
"mean_token_accuracy": 0.1557182028889656,
|
|
"num_tokens": 7327764.0,
|
|
"step": 3925
|
|
},
|
|
{
|
|
"entropy": 6.248839044570923,
|
|
"epoch": 0.3470505121865065,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004992441601557848,
|
|
"loss": 6.168,
|
|
"mean_token_accuracy": 0.15208059847354888,
|
|
"num_tokens": 7337118.0,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"entropy": 6.197002506256103,
|
|
"epoch": 0.3474920522783469,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004992415788662514,
|
|
"loss": 6.0958,
|
|
"mean_token_accuracy": 0.1525207430124283,
|
|
"num_tokens": 7345973.0,
|
|
"step": 3935
|
|
},
|
|
{
|
|
"entropy": 6.194469833374024,
|
|
"epoch": 0.34793359237018723,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004992389931839529,
|
|
"loss": 6.0928,
|
|
"mean_token_accuracy": 0.1534672871232033,
|
|
"num_tokens": 7355776.0,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"entropy": 6.168269205093384,
|
|
"epoch": 0.34837513246202756,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004992364031089401,
|
|
"loss": 6.1161,
|
|
"mean_token_accuracy": 0.1555505856871605,
|
|
"num_tokens": 7364820.0,
|
|
"step": 3945
|
|
},
|
|
{
|
|
"entropy": 6.2373566150665285,
|
|
"epoch": 0.3488166725538679,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004992338086412636,
|
|
"loss": 6.1218,
|
|
"mean_token_accuracy": 0.14930969327688218,
|
|
"num_tokens": 7373881.0,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"entropy": 6.135814476013183,
|
|
"epoch": 0.3492582126457082,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004992312097809744,
|
|
"loss": 6.0892,
|
|
"mean_token_accuracy": 0.15562115162611007,
|
|
"num_tokens": 7383095.0,
|
|
"step": 3955
|
|
},
|
|
{
|
|
"entropy": 6.195784330368042,
|
|
"epoch": 0.34969975273754855,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004992286065281234,
|
|
"loss": 6.1216,
|
|
"mean_token_accuracy": 0.15000923871994018,
|
|
"num_tokens": 7392702.0,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"entropy": 6.247405576705932,
|
|
"epoch": 0.3501412928293889,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004992259988827614,
|
|
"loss": 6.2769,
|
|
"mean_token_accuracy": 0.14470015615224838,
|
|
"num_tokens": 7402560.0,
|
|
"step": 3965
|
|
},
|
|
{
|
|
"entropy": 6.284795522689819,
|
|
"epoch": 0.35058283292122927,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004992233868449397,
|
|
"loss": 6.051,
|
|
"mean_token_accuracy": 0.15953975468873977,
|
|
"num_tokens": 7412375.0,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"entropy": 6.195076513290405,
|
|
"epoch": 0.3510243730130696,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004992207704147093,
|
|
"loss": 6.0928,
|
|
"mean_token_accuracy": 0.15187776535749437,
|
|
"num_tokens": 7420734.0,
|
|
"step": 3975
|
|
},
|
|
{
|
|
"entropy": 6.101401853561401,
|
|
"epoch": 0.35146591310490993,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004992181495921216,
|
|
"loss": 6.0919,
|
|
"mean_token_accuracy": 0.15785269439220428,
|
|
"num_tokens": 7430156.0,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"entropy": 6.222190999984742,
|
|
"epoch": 0.35190745319675026,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004992155243772277,
|
|
"loss": 6.1007,
|
|
"mean_token_accuracy": 0.1574430137872696,
|
|
"num_tokens": 7439189.0,
|
|
"step": 3985
|
|
},
|
|
{
|
|
"entropy": 6.2206672668457035,
|
|
"epoch": 0.3523489932885906,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004992128947700795,
|
|
"loss": 6.1249,
|
|
"mean_token_accuracy": 0.1455397441983223,
|
|
"num_tokens": 7448768.0,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"entropy": 6.180650806427002,
|
|
"epoch": 0.3527905333804309,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000499210260770728,
|
|
"loss": 6.064,
|
|
"mean_token_accuracy": 0.15497558265924455,
|
|
"num_tokens": 7458468.0,
|
|
"step": 3995
|
|
},
|
|
{
|
|
"entropy": 6.150880622863769,
|
|
"epoch": 0.35323207347227126,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000499207622379225,
|
|
"loss": 6.1475,
|
|
"mean_token_accuracy": 0.1480465464293957,
|
|
"num_tokens": 7467351.0,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"entropy": 6.2564455509185795,
|
|
"epoch": 0.35367361356411164,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004992049795956222,
|
|
"loss": 6.1012,
|
|
"mean_token_accuracy": 0.14711347222328186,
|
|
"num_tokens": 7477116.0,
|
|
"step": 4005
|
|
},
|
|
{
|
|
"entropy": 6.234343814849853,
|
|
"epoch": 0.354115153655952,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004992023324199715,
|
|
"loss": 6.0249,
|
|
"mean_token_accuracy": 0.15656134784221648,
|
|
"num_tokens": 7485216.0,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"entropy": 6.075209522247315,
|
|
"epoch": 0.3545566937477923,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004991996808523245,
|
|
"loss": 6.1798,
|
|
"mean_token_accuracy": 0.1469147637486458,
|
|
"num_tokens": 7495084.0,
|
|
"step": 4015
|
|
},
|
|
{
|
|
"entropy": 6.306225442886353,
|
|
"epoch": 0.35499823383963264,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004991970248927332,
|
|
"loss": 6.1973,
|
|
"mean_token_accuracy": 0.14604026302695275,
|
|
"num_tokens": 7503219.0,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"entropy": 6.196821165084839,
|
|
"epoch": 0.35543977393147297,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004991943645412498,
|
|
"loss": 5.9766,
|
|
"mean_token_accuracy": 0.1642112761735916,
|
|
"num_tokens": 7511961.0,
|
|
"step": 4025
|
|
},
|
|
{
|
|
"entropy": 5.985270690917969,
|
|
"epoch": 0.3558813140233133,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004991916997979263,
|
|
"loss": 6.1096,
|
|
"mean_token_accuracy": 0.1546097069978714,
|
|
"num_tokens": 7521927.0,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"entropy": 6.103000640869141,
|
|
"epoch": 0.35632285411515363,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004991890306628149,
|
|
"loss": 6.0451,
|
|
"mean_token_accuracy": 0.15554122179746627,
|
|
"num_tokens": 7531093.0,
|
|
"step": 4035
|
|
},
|
|
{
|
|
"entropy": 6.198269557952881,
|
|
"epoch": 0.356764394206994,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004991863571359678,
|
|
"loss": 6.0078,
|
|
"mean_token_accuracy": 0.15710966363549234,
|
|
"num_tokens": 7539813.0,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"entropy": 6.244335699081421,
|
|
"epoch": 0.35720593429883435,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004991836792174376,
|
|
"loss": 6.1875,
|
|
"mean_token_accuracy": 0.1476308137178421,
|
|
"num_tokens": 7548954.0,
|
|
"step": 4045
|
|
},
|
|
{
|
|
"entropy": 6.100079536437988,
|
|
"epoch": 0.3576474743906747,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004991809969072765,
|
|
"loss": 6.0322,
|
|
"mean_token_accuracy": 0.1548540085554123,
|
|
"num_tokens": 7558275.0,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"entropy": 6.13811764717102,
|
|
"epoch": 0.358089014482515,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004991783102055371,
|
|
"loss": 6.0688,
|
|
"mean_token_accuracy": 0.15873494520783424,
|
|
"num_tokens": 7567870.0,
|
|
"step": 4055
|
|
},
|
|
{
|
|
"entropy": 6.249936914443969,
|
|
"epoch": 0.35853055457435534,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004991756191122723,
|
|
"loss": 6.0385,
|
|
"mean_token_accuracy": 0.15568251237273217,
|
|
"num_tokens": 7577485.0,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"entropy": 6.142808294296264,
|
|
"epoch": 0.3589720946661957,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004991729236275346,
|
|
"loss": 6.0888,
|
|
"mean_token_accuracy": 0.150615693628788,
|
|
"num_tokens": 7587398.0,
|
|
"step": 4065
|
|
},
|
|
{
|
|
"entropy": 6.10182294845581,
|
|
"epoch": 0.359413634758036,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004991702237513768,
|
|
"loss": 6.0896,
|
|
"mean_token_accuracy": 0.1515856146812439,
|
|
"num_tokens": 7596310.0,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"entropy": 6.236705541610718,
|
|
"epoch": 0.3598551748498764,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004991675194838517,
|
|
"loss": 6.1214,
|
|
"mean_token_accuracy": 0.14487750828266144,
|
|
"num_tokens": 7606971.0,
|
|
"step": 4075
|
|
},
|
|
{
|
|
"entropy": 6.157118654251098,
|
|
"epoch": 0.3602967149417167,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004991648108250125,
|
|
"loss": 6.1073,
|
|
"mean_token_accuracy": 0.15116591304540633,
|
|
"num_tokens": 7616636.0,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"entropy": 6.23088059425354,
|
|
"epoch": 0.36073825503355705,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000499162097774912,
|
|
"loss": 6.1325,
|
|
"mean_token_accuracy": 0.15135489255189896,
|
|
"num_tokens": 7625128.0,
|
|
"step": 4085
|
|
},
|
|
{
|
|
"entropy": 6.168088626861572,
|
|
"epoch": 0.3611797951253974,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004991593803336037,
|
|
"loss": 6.0464,
|
|
"mean_token_accuracy": 0.15914286002516748,
|
|
"num_tokens": 7634224.0,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"entropy": 6.109174299240112,
|
|
"epoch": 0.3616213352172377,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004991566585011405,
|
|
"loss": 6.0204,
|
|
"mean_token_accuracy": 0.1594786301255226,
|
|
"num_tokens": 7643392.0,
|
|
"step": 4095
|
|
},
|
|
{
|
|
"entropy": 6.046089458465576,
|
|
"epoch": 0.36206287530907805,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004991539322775758,
|
|
"loss": 6.0402,
|
|
"mean_token_accuracy": 0.15390734672546386,
|
|
"num_tokens": 7652491.0,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"entropy": 6.333108568191529,
|
|
"epoch": 0.3625044154009184,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004991512016629632,
|
|
"loss": 6.2631,
|
|
"mean_token_accuracy": 0.1374726377427578,
|
|
"num_tokens": 7663192.0,
|
|
"step": 4105
|
|
},
|
|
{
|
|
"entropy": 6.233335781097412,
|
|
"epoch": 0.36294595549275877,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004991484666573558,
|
|
"loss": 6.0951,
|
|
"mean_token_accuracy": 0.1491971492767334,
|
|
"num_tokens": 7672801.0,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"entropy": 6.150677967071533,
|
|
"epoch": 0.3633874955845991,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004991457272608077,
|
|
"loss": 6.1107,
|
|
"mean_token_accuracy": 0.1492188058793545,
|
|
"num_tokens": 7683013.0,
|
|
"step": 4115
|
|
},
|
|
{
|
|
"entropy": 6.215946054458618,
|
|
"epoch": 0.3638290356764394,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004991429834733721,
|
|
"loss": 6.1377,
|
|
"mean_token_accuracy": 0.14726671427488328,
|
|
"num_tokens": 7692760.0,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"entropy": 6.212368583679199,
|
|
"epoch": 0.36427057576827976,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000499140235295103,
|
|
"loss": 6.0806,
|
|
"mean_token_accuracy": 0.15191175639629365,
|
|
"num_tokens": 7702005.0,
|
|
"step": 4125
|
|
},
|
|
{
|
|
"entropy": 6.164025735855103,
|
|
"epoch": 0.3647121158601201,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004991374827260542,
|
|
"loss": 6.129,
|
|
"mean_token_accuracy": 0.1487259477376938,
|
|
"num_tokens": 7711604.0,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"entropy": 6.113088750839234,
|
|
"epoch": 0.3651536559519604,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004991347257662795,
|
|
"loss": 6.0471,
|
|
"mean_token_accuracy": 0.153825144469738,
|
|
"num_tokens": 7720434.0,
|
|
"step": 4135
|
|
},
|
|
{
|
|
"entropy": 6.218751907348633,
|
|
"epoch": 0.36559519604380075,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000499131964415833,
|
|
"loss": 6.0877,
|
|
"mean_token_accuracy": 0.1532296895980835,
|
|
"num_tokens": 7730733.0,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"entropy": 6.158688402175903,
|
|
"epoch": 0.36603673613564114,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004991291986747689,
|
|
"loss": 6.1581,
|
|
"mean_token_accuracy": 0.153475009649992,
|
|
"num_tokens": 7740394.0,
|
|
"step": 4145
|
|
},
|
|
{
|
|
"entropy": 6.191565799713135,
|
|
"epoch": 0.36647827622748147,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004991264285431412,
|
|
"loss": 6.1045,
|
|
"mean_token_accuracy": 0.15052587389945984,
|
|
"num_tokens": 7748919.0,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"entropy": 6.293946027755737,
|
|
"epoch": 0.3669198163193218,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004991236540210041,
|
|
"loss": 6.1009,
|
|
"mean_token_accuracy": 0.1537343256175518,
|
|
"num_tokens": 7758591.0,
|
|
"step": 4155
|
|
},
|
|
{
|
|
"entropy": 6.244499015808105,
|
|
"epoch": 0.36736135641116213,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004991208751084122,
|
|
"loss": 6.1485,
|
|
"mean_token_accuracy": 0.1491132453083992,
|
|
"num_tokens": 7768269.0,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"entropy": 6.249349975585938,
|
|
"epoch": 0.36780289650300246,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004991180918054199,
|
|
"loss": 6.1767,
|
|
"mean_token_accuracy": 0.1463681861758232,
|
|
"num_tokens": 7776980.0,
|
|
"step": 4165
|
|
},
|
|
{
|
|
"entropy": 6.171654605865479,
|
|
"epoch": 0.3682444365948428,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004991153041120815,
|
|
"loss": 6.0811,
|
|
"mean_token_accuracy": 0.15733396261930466,
|
|
"num_tokens": 7786664.0,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"entropy": 6.12093939781189,
|
|
"epoch": 0.3686859766866831,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004991125120284519,
|
|
"loss": 5.9525,
|
|
"mean_token_accuracy": 0.1650825932621956,
|
|
"num_tokens": 7794981.0,
|
|
"step": 4175
|
|
},
|
|
{
|
|
"entropy": 6.110823106765747,
|
|
"epoch": 0.3691275167785235,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004991097155545856,
|
|
"loss": 6.0387,
|
|
"mean_token_accuracy": 0.1560913234949112,
|
|
"num_tokens": 7804850.0,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"entropy": 6.106446075439453,
|
|
"epoch": 0.36956905687036384,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004991069146905374,
|
|
"loss": 6.052,
|
|
"mean_token_accuracy": 0.15863640755414962,
|
|
"num_tokens": 7814117.0,
|
|
"step": 4185
|
|
},
|
|
{
|
|
"entropy": 6.183217334747314,
|
|
"epoch": 0.3700105969622042,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004991041094363621,
|
|
"loss": 6.0928,
|
|
"mean_token_accuracy": 0.15649753212928771,
|
|
"num_tokens": 7823384.0,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"entropy": 6.2000589847564695,
|
|
"epoch": 0.3704521370540445,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004991012997921149,
|
|
"loss": 6.0863,
|
|
"mean_token_accuracy": 0.154317145049572,
|
|
"num_tokens": 7834072.0,
|
|
"step": 4195
|
|
},
|
|
{
|
|
"entropy": 6.225698471069336,
|
|
"epoch": 0.37089367714588484,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004990984857578506,
|
|
"loss": 6.0718,
|
|
"mean_token_accuracy": 0.1523341119289398,
|
|
"num_tokens": 7843334.0,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"entropy": 6.040940427780152,
|
|
"epoch": 0.37133521723772517,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004990956673336245,
|
|
"loss": 6.0132,
|
|
"mean_token_accuracy": 0.15985633432865143,
|
|
"num_tokens": 7853187.0,
|
|
"step": 4205
|
|
},
|
|
{
|
|
"entropy": 6.121350336074829,
|
|
"epoch": 0.3717767573295655,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004990928445194917,
|
|
"loss": 6.0836,
|
|
"mean_token_accuracy": 0.1519768550992012,
|
|
"num_tokens": 7862262.0,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"entropy": 6.15352463722229,
|
|
"epoch": 0.3722182974214059,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004990900173155074,
|
|
"loss": 6.0184,
|
|
"mean_token_accuracy": 0.15704632550477982,
|
|
"num_tokens": 7870725.0,
|
|
"step": 4215
|
|
},
|
|
{
|
|
"entropy": 6.124208784103393,
|
|
"epoch": 0.3726598375132462,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004990871857217273,
|
|
"loss": 5.9447,
|
|
"mean_token_accuracy": 0.15963577330112458,
|
|
"num_tokens": 7879492.0,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"entropy": 6.087597894668579,
|
|
"epoch": 0.37310137760508655,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004990843497382066,
|
|
"loss": 6.0649,
|
|
"mean_token_accuracy": 0.15454517751932145,
|
|
"num_tokens": 7888710.0,
|
|
"step": 4225
|
|
},
|
|
{
|
|
"entropy": 6.171204519271851,
|
|
"epoch": 0.3735429176969269,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004990815093650009,
|
|
"loss": 6.0846,
|
|
"mean_token_accuracy": 0.15838453769683838,
|
|
"num_tokens": 7898313.0,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"entropy": 6.163542985916138,
|
|
"epoch": 0.3739844577887672,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004990786646021659,
|
|
"loss": 6.0972,
|
|
"mean_token_accuracy": 0.15132492929697036,
|
|
"num_tokens": 7906726.0,
|
|
"step": 4235
|
|
},
|
|
{
|
|
"entropy": 6.143102693557739,
|
|
"epoch": 0.37442599788060754,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004990758154497573,
|
|
"loss": 6.0083,
|
|
"mean_token_accuracy": 0.16228035539388658,
|
|
"num_tokens": 7915554.0,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"entropy": 6.224862670898437,
|
|
"epoch": 0.3748675379724479,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004990729619078309,
|
|
"loss": 6.1357,
|
|
"mean_token_accuracy": 0.15337878912687303,
|
|
"num_tokens": 7924732.0,
|
|
"step": 4245
|
|
},
|
|
{
|
|
"entropy": 6.230613327026367,
|
|
"epoch": 0.37530907806428826,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004990701039764427,
|
|
"loss": 6.061,
|
|
"mean_token_accuracy": 0.1594015821814537,
|
|
"num_tokens": 7933808.0,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"entropy": 6.101580095291138,
|
|
"epoch": 0.3757506181561286,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004990672416556485,
|
|
"loss": 6.0319,
|
|
"mean_token_accuracy": 0.15845490843057633,
|
|
"num_tokens": 7942558.0,
|
|
"step": 4255
|
|
},
|
|
{
|
|
"entropy": 6.129454803466797,
|
|
"epoch": 0.3761921582479689,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004990643749455045,
|
|
"loss": 6.1321,
|
|
"mean_token_accuracy": 0.14685340449213982,
|
|
"num_tokens": 7951951.0,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"entropy": 6.2848834037780765,
|
|
"epoch": 0.37663369833980925,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004990615038460667,
|
|
"loss": 6.0247,
|
|
"mean_token_accuracy": 0.1576920345425606,
|
|
"num_tokens": 7960543.0,
|
|
"step": 4265
|
|
},
|
|
{
|
|
"entropy": 6.126399230957031,
|
|
"epoch": 0.3770752384316496,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004990586283573916,
|
|
"loss": 6.007,
|
|
"mean_token_accuracy": 0.155757275223732,
|
|
"num_tokens": 7970273.0,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"entropy": 6.158202123641968,
|
|
"epoch": 0.3775167785234899,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004990557484795355,
|
|
"loss": 6.1038,
|
|
"mean_token_accuracy": 0.1517634019255638,
|
|
"num_tokens": 7978888.0,
|
|
"step": 4275
|
|
},
|
|
{
|
|
"entropy": 6.186517333984375,
|
|
"epoch": 0.37795831861533025,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004990528642125545,
|
|
"loss": 6.0938,
|
|
"mean_token_accuracy": 0.15008659660816193,
|
|
"num_tokens": 7988843.0,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"entropy": 6.1609944820404055,
|
|
"epoch": 0.37839985870717063,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004990499755565055,
|
|
"loss": 6.0944,
|
|
"mean_token_accuracy": 0.15281789302825927,
|
|
"num_tokens": 7998403.0,
|
|
"step": 4285
|
|
},
|
|
{
|
|
"entropy": 6.190595436096191,
|
|
"epoch": 0.37884139879901096,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004990470825114448,
|
|
"loss": 6.1108,
|
|
"mean_token_accuracy": 0.15413309782743453,
|
|
"num_tokens": 8007493.0,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"entropy": 6.112000417709351,
|
|
"epoch": 0.3792829388908513,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004990441850774292,
|
|
"loss": 6.0581,
|
|
"mean_token_accuracy": 0.1584509640932083,
|
|
"num_tokens": 8016399.0,
|
|
"step": 4295
|
|
},
|
|
{
|
|
"entropy": 6.215829849243164,
|
|
"epoch": 0.3797244789826916,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004990412832545155,
|
|
"loss": 6.0591,
|
|
"mean_token_accuracy": 0.1544952630996704,
|
|
"num_tokens": 8025891.0,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"entropy": 6.077403736114502,
|
|
"epoch": 0.38016601907453196,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004990383770427603,
|
|
"loss": 6.0097,
|
|
"mean_token_accuracy": 0.15796091556549072,
|
|
"num_tokens": 8034733.0,
|
|
"step": 4305
|
|
},
|
|
{
|
|
"entropy": 6.135886240005493,
|
|
"epoch": 0.3806075591663723,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004990354664422209,
|
|
"loss": 6.0527,
|
|
"mean_token_accuracy": 0.15982279628515245,
|
|
"num_tokens": 8044443.0,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"entropy": 6.160210800170899,
|
|
"epoch": 0.3810490992582126,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004990325514529541,
|
|
"loss": 6.0403,
|
|
"mean_token_accuracy": 0.16555903106927872,
|
|
"num_tokens": 8053920.0,
|
|
"step": 4315
|
|
},
|
|
{
|
|
"entropy": 6.1733404159545895,
|
|
"epoch": 0.381490639350053,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004990296320750169,
|
|
"loss": 6.0242,
|
|
"mean_token_accuracy": 0.15823598951101303,
|
|
"num_tokens": 8063455.0,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"entropy": 6.164728498458862,
|
|
"epoch": 0.38193217944189334,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004990267083084667,
|
|
"loss": 6.1096,
|
|
"mean_token_accuracy": 0.15378031879663467,
|
|
"num_tokens": 8073353.0,
|
|
"step": 4325
|
|
},
|
|
{
|
|
"entropy": 6.216868829727173,
|
|
"epoch": 0.38237371953373367,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004990237801533607,
|
|
"loss": 6.1198,
|
|
"mean_token_accuracy": 0.1568579077720642,
|
|
"num_tokens": 8082164.0,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"entropy": 6.222427320480347,
|
|
"epoch": 0.382815259625574,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004990208476097562,
|
|
"loss": 6.126,
|
|
"mean_token_accuracy": 0.15226055085659027,
|
|
"num_tokens": 8090790.0,
|
|
"step": 4335
|
|
},
|
|
{
|
|
"entropy": 6.293479156494141,
|
|
"epoch": 0.38325679971741433,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004990179106777109,
|
|
"loss": 6.1346,
|
|
"mean_token_accuracy": 0.14660235047340392,
|
|
"num_tokens": 8100412.0,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"entropy": 6.121231889724731,
|
|
"epoch": 0.38369833980925466,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004990149693572819,
|
|
"loss": 6.1328,
|
|
"mean_token_accuracy": 0.14986878782510757,
|
|
"num_tokens": 8110002.0,
|
|
"step": 4345
|
|
},
|
|
{
|
|
"entropy": 6.1944701194763185,
|
|
"epoch": 0.384139879901095,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004990120236485271,
|
|
"loss": 6.0262,
|
|
"mean_token_accuracy": 0.15945157259702683,
|
|
"num_tokens": 8118898.0,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"entropy": 6.093376398086548,
|
|
"epoch": 0.3845814199929354,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004990090735515043,
|
|
"loss": 6.0696,
|
|
"mean_token_accuracy": 0.1521653488278389,
|
|
"num_tokens": 8128207.0,
|
|
"step": 4355
|
|
},
|
|
{
|
|
"entropy": 6.162809753417969,
|
|
"epoch": 0.3850229600847757,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000499006119066271,
|
|
"loss": 6.0659,
|
|
"mean_token_accuracy": 0.15488530248403548,
|
|
"num_tokens": 8137317.0,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"entropy": 6.234450483322144,
|
|
"epoch": 0.38546450017661604,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004990031601928854,
|
|
"loss": 6.0901,
|
|
"mean_token_accuracy": 0.15392402857542037,
|
|
"num_tokens": 8146519.0,
|
|
"step": 4365
|
|
},
|
|
{
|
|
"entropy": 6.081244421005249,
|
|
"epoch": 0.3859060402684564,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004990001969314051,
|
|
"loss": 6.1156,
|
|
"mean_token_accuracy": 0.150378455221653,
|
|
"num_tokens": 8155916.0,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"entropy": 6.188456583023071,
|
|
"epoch": 0.3863475803602967,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004989972292818884,
|
|
"loss": 6.1306,
|
|
"mean_token_accuracy": 0.15578001141548156,
|
|
"num_tokens": 8165251.0,
|
|
"step": 4375
|
|
},
|
|
{
|
|
"entropy": 6.154488277435303,
|
|
"epoch": 0.38678912045213704,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004989942572443934,
|
|
"loss": 6.1046,
|
|
"mean_token_accuracy": 0.154154072701931,
|
|
"num_tokens": 8174761.0,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"entropy": 6.2399732112884525,
|
|
"epoch": 0.38723066054397737,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004989912808189784,
|
|
"loss": 6.0624,
|
|
"mean_token_accuracy": 0.1586390733718872,
|
|
"num_tokens": 8182980.0,
|
|
"step": 4385
|
|
},
|
|
{
|
|
"entropy": 6.286262083053589,
|
|
"epoch": 0.38767220063581775,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004989883000057013,
|
|
"loss": 6.1013,
|
|
"mean_token_accuracy": 0.15117012858390808,
|
|
"num_tokens": 8193306.0,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"entropy": 6.023512887954712,
|
|
"epoch": 0.3881137407276581,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000498985314804621,
|
|
"loss": 5.9827,
|
|
"mean_token_accuracy": 0.16275162994861603,
|
|
"num_tokens": 8203446.0,
|
|
"step": 4395
|
|
},
|
|
{
|
|
"entropy": 6.129458618164063,
|
|
"epoch": 0.3885552808194984,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004989823252157958,
|
|
"loss": 6.0411,
|
|
"mean_token_accuracy": 0.14927417337894439,
|
|
"num_tokens": 8212414.0,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"entropy": 6.107169580459595,
|
|
"epoch": 0.38899682091133875,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004989793312392841,
|
|
"loss": 5.9633,
|
|
"mean_token_accuracy": 0.16139311194419861,
|
|
"num_tokens": 8221509.0,
|
|
"step": 4405
|
|
},
|
|
{
|
|
"entropy": 6.121396112442016,
|
|
"epoch": 0.3894383610031791,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004989763328751448,
|
|
"loss": 6.0825,
|
|
"mean_token_accuracy": 0.15815991312265396,
|
|
"num_tokens": 8230017.0,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"entropy": 6.1887977600097654,
|
|
"epoch": 0.3898799010950194,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004989733301234365,
|
|
"loss": 6.0249,
|
|
"mean_token_accuracy": 0.15967779457569123,
|
|
"num_tokens": 8238853.0,
|
|
"step": 4415
|
|
},
|
|
{
|
|
"entropy": 6.224899578094482,
|
|
"epoch": 0.39032144118685974,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000498970322984218,
|
|
"loss": 6.1023,
|
|
"mean_token_accuracy": 0.1519481733441353,
|
|
"num_tokens": 8248112.0,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"entropy": 6.203225469589233,
|
|
"epoch": 0.39076298127870013,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004989673114575483,
|
|
"loss": 6.0589,
|
|
"mean_token_accuracy": 0.1536906696856022,
|
|
"num_tokens": 8257166.0,
|
|
"step": 4425
|
|
},
|
|
{
|
|
"entropy": 6.128660726547241,
|
|
"epoch": 0.39120452137054046,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004989642955434863,
|
|
"loss": 6.0714,
|
|
"mean_token_accuracy": 0.1515656217932701,
|
|
"num_tokens": 8266229.0,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"entropy": 6.163424253463745,
|
|
"epoch": 0.3916460614623808,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004989612752420912,
|
|
"loss": 6.039,
|
|
"mean_token_accuracy": 0.15465213656425475,
|
|
"num_tokens": 8275102.0,
|
|
"step": 4435
|
|
},
|
|
{
|
|
"entropy": 6.022084140777588,
|
|
"epoch": 0.3920876015542211,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000498958250553422,
|
|
"loss": 5.9631,
|
|
"mean_token_accuracy": 0.16303362101316451,
|
|
"num_tokens": 8283847.0,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"entropy": 6.193567132949829,
|
|
"epoch": 0.39252914164606145,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004989552214775381,
|
|
"loss": 6.0871,
|
|
"mean_token_accuracy": 0.15131851583719252,
|
|
"num_tokens": 8292622.0,
|
|
"step": 4445
|
|
},
|
|
{
|
|
"entropy": 6.194511032104492,
|
|
"epoch": 0.3929706817379018,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004989521880144988,
|
|
"loss": 5.9982,
|
|
"mean_token_accuracy": 0.17004551142454147,
|
|
"num_tokens": 8301026.0,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"entropy": 6.184074640274048,
|
|
"epoch": 0.3934122218297421,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004989491501643635,
|
|
"loss": 6.2578,
|
|
"mean_token_accuracy": 0.14575279951095582,
|
|
"num_tokens": 8310977.0,
|
|
"step": 4455
|
|
},
|
|
{
|
|
"entropy": 6.20936131477356,
|
|
"epoch": 0.3938537619215825,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004989461079271916,
|
|
"loss": 6.0296,
|
|
"mean_token_accuracy": 0.16169211864471436,
|
|
"num_tokens": 8319391.0,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"entropy": 6.169527339935303,
|
|
"epoch": 0.39429530201342283,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004989430613030429,
|
|
"loss": 5.9922,
|
|
"mean_token_accuracy": 0.15554805994033813,
|
|
"num_tokens": 8328639.0,
|
|
"step": 4465
|
|
},
|
|
{
|
|
"entropy": 6.117289400100708,
|
|
"epoch": 0.39473684210526316,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.000498940010291977,
|
|
"loss": 6.096,
|
|
"mean_token_accuracy": 0.15155849754810333,
|
|
"num_tokens": 8338190.0,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"entropy": 6.100512361526489,
|
|
"epoch": 0.3951783821971035,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004989369548940536,
|
|
"loss": 6.0395,
|
|
"mean_token_accuracy": 0.15572706907987593,
|
|
"num_tokens": 8346874.0,
|
|
"step": 4475
|
|
},
|
|
{
|
|
"entropy": 6.190318775177002,
|
|
"epoch": 0.3956199222889438,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004989338951093327,
|
|
"loss": 6.0178,
|
|
"mean_token_accuracy": 0.16222874522209169,
|
|
"num_tokens": 8356446.0,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"entropy": 6.1148622035980225,
|
|
"epoch": 0.39606146238078416,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004989308309378741,
|
|
"loss": 6.0199,
|
|
"mean_token_accuracy": 0.15970418155193328,
|
|
"num_tokens": 8365694.0,
|
|
"step": 4485
|
|
},
|
|
{
|
|
"entropy": 6.157066297531128,
|
|
"epoch": 0.3965030024726245,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004989277623797379,
|
|
"loss": 6.0754,
|
|
"mean_token_accuracy": 0.1534503474831581,
|
|
"num_tokens": 8374282.0,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"entropy": 6.160027027130127,
|
|
"epoch": 0.3969445425644649,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004989246894349841,
|
|
"loss": 6.0372,
|
|
"mean_token_accuracy": 0.1563117504119873,
|
|
"num_tokens": 8383315.0,
|
|
"step": 4495
|
|
},
|
|
{
|
|
"entropy": 6.214316082000733,
|
|
"epoch": 0.3973860826563052,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004989216121036732,
|
|
"loss": 6.0236,
|
|
"mean_token_accuracy": 0.15833714008331298,
|
|
"num_tokens": 8392263.0,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"entropy": 6.106395578384399,
|
|
"epoch": 0.39782762274814554,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004989185303858651,
|
|
"loss": 6.0696,
|
|
"mean_token_accuracy": 0.14981550127267837,
|
|
"num_tokens": 8400734.0,
|
|
"step": 4505
|
|
},
|
|
{
|
|
"entropy": 6.156090450286865,
|
|
"epoch": 0.39826916283998587,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004989154442816203,
|
|
"loss": 6.0793,
|
|
"mean_token_accuracy": 0.15283239632844925,
|
|
"num_tokens": 8410635.0,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"entropy": 6.205539274215698,
|
|
"epoch": 0.3987107029318262,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004989123537909994,
|
|
"loss": 6.0664,
|
|
"mean_token_accuracy": 0.15485348254442216,
|
|
"num_tokens": 8420111.0,
|
|
"step": 4515
|
|
},
|
|
{
|
|
"entropy": 6.103611946105957,
|
|
"epoch": 0.39915224302366653,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004989092589140629,
|
|
"loss": 6.0177,
|
|
"mean_token_accuracy": 0.1484901040792465,
|
|
"num_tokens": 8429459.0,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"entropy": 6.117227792739868,
|
|
"epoch": 0.39959378311550686,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004989061596508712,
|
|
"loss": 6.0403,
|
|
"mean_token_accuracy": 0.16086821481585503,
|
|
"num_tokens": 8438083.0,
|
|
"step": 4525
|
|
},
|
|
{
|
|
"entropy": 6.07325234413147,
|
|
"epoch": 0.40003532320734725,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004989030560014853,
|
|
"loss": 6.0506,
|
|
"mean_token_accuracy": 0.15744656324386597,
|
|
"num_tokens": 8447713.0,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"entropy": 6.1523223400115965,
|
|
"epoch": 0.4004768632991876,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004988999479659657,
|
|
"loss": 6.0226,
|
|
"mean_token_accuracy": 0.158653824031353,
|
|
"num_tokens": 8457394.0,
|
|
"step": 4535
|
|
},
|
|
{
|
|
"entropy": 6.117883396148682,
|
|
"epoch": 0.4009184033910279,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004988968355443737,
|
|
"loss": 5.9913,
|
|
"mean_token_accuracy": 0.16181344538927078,
|
|
"num_tokens": 8467333.0,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"entropy": 6.124732971191406,
|
|
"epoch": 0.40135994348286824,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004988937187367699,
|
|
"loss": 6.1032,
|
|
"mean_token_accuracy": 0.15110900700092317,
|
|
"num_tokens": 8477530.0,
|
|
"step": 4545
|
|
},
|
|
{
|
|
"entropy": 6.217570829391479,
|
|
"epoch": 0.4018014835747086,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004988905975432154,
|
|
"loss": 6.1802,
|
|
"mean_token_accuracy": 0.15269524306058885,
|
|
"num_tokens": 8486861.0,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"entropy": 6.170765733718872,
|
|
"epoch": 0.4022430236665489,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004988874719637715,
|
|
"loss": 5.9867,
|
|
"mean_token_accuracy": 0.158355513215065,
|
|
"num_tokens": 8496541.0,
|
|
"step": 4555
|
|
},
|
|
{
|
|
"entropy": 6.052766561508179,
|
|
"epoch": 0.40268456375838924,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004988843419984994,
|
|
"loss": 6.064,
|
|
"mean_token_accuracy": 0.15494307354092599,
|
|
"num_tokens": 8505667.0,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"entropy": 6.24229063987732,
|
|
"epoch": 0.4031261038502296,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004988812076474604,
|
|
"loss": 6.052,
|
|
"mean_token_accuracy": 0.15319542214274406,
|
|
"num_tokens": 8515133.0,
|
|
"step": 4565
|
|
},
|
|
{
|
|
"entropy": 6.170625066757202,
|
|
"epoch": 0.40356764394206995,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004988780689107158,
|
|
"loss": 6.007,
|
|
"mean_token_accuracy": 0.1620650038123131,
|
|
"num_tokens": 8524797.0,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"entropy": 6.0161412239074705,
|
|
"epoch": 0.4040091840339103,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004988749257883271,
|
|
"loss": 6.0288,
|
|
"mean_token_accuracy": 0.15016194060444832,
|
|
"num_tokens": 8534667.0,
|
|
"step": 4575
|
|
},
|
|
{
|
|
"entropy": 6.143600559234619,
|
|
"epoch": 0.4044507241257506,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000498871778280356,
|
|
"loss": 6.0166,
|
|
"mean_token_accuracy": 0.15943924337625504,
|
|
"num_tokens": 8543874.0,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"entropy": 6.1106805324554445,
|
|
"epoch": 0.40489226421759095,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004988686263868641,
|
|
"loss": 6.0353,
|
|
"mean_token_accuracy": 0.1529652863740921,
|
|
"num_tokens": 8553620.0,
|
|
"step": 4585
|
|
},
|
|
{
|
|
"entropy": 6.17731556892395,
|
|
"epoch": 0.4053338043094313,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004988654701079131,
|
|
"loss": 6.1113,
|
|
"mean_token_accuracy": 0.15474483817815782,
|
|
"num_tokens": 8563857.0,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"entropy": 6.2202249526977536,
|
|
"epoch": 0.4057753444012716,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004988623094435649,
|
|
"loss": 6.0898,
|
|
"mean_token_accuracy": 0.1531184583902359,
|
|
"num_tokens": 8572677.0,
|
|
"step": 4595
|
|
},
|
|
{
|
|
"entropy": 6.140727758407593,
|
|
"epoch": 0.406216884493112,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004988591443938813,
|
|
"loss": 6.016,
|
|
"mean_token_accuracy": 0.15535678565502167,
|
|
"num_tokens": 8581907.0,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"entropy": 6.138199281692505,
|
|
"epoch": 0.4066584245849523,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004988559749589244,
|
|
"loss": 6.0609,
|
|
"mean_token_accuracy": 0.15539143681526185,
|
|
"num_tokens": 8591701.0,
|
|
"step": 4605
|
|
},
|
|
{
|
|
"entropy": 6.19168872833252,
|
|
"epoch": 0.40709996467679266,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004988528011387563,
|
|
"loss": 6.1128,
|
|
"mean_token_accuracy": 0.15955741629004477,
|
|
"num_tokens": 8601054.0,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"entropy": 6.115007019042968,
|
|
"epoch": 0.407541504768633,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004988496229334392,
|
|
"loss": 6.0089,
|
|
"mean_token_accuracy": 0.16271338164806365,
|
|
"num_tokens": 8610346.0,
|
|
"step": 4615
|
|
},
|
|
{
|
|
"entropy": 6.082312822341919,
|
|
"epoch": 0.4079830448604733,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004988464403430352,
|
|
"loss": 6.0114,
|
|
"mean_token_accuracy": 0.1530932977795601,
|
|
"num_tokens": 8620823.0,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"entropy": 6.228973913192749,
|
|
"epoch": 0.40842458495231365,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004988432533676067,
|
|
"loss": 6.1457,
|
|
"mean_token_accuracy": 0.14871701523661612,
|
|
"num_tokens": 8630184.0,
|
|
"step": 4625
|
|
},
|
|
{
|
|
"entropy": 6.265304517745972,
|
|
"epoch": 0.408866125044154,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004988400620072163,
|
|
"loss": 6.1541,
|
|
"mean_token_accuracy": 0.14276653826236724,
|
|
"num_tokens": 8640064.0,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"entropy": 6.173982954025268,
|
|
"epoch": 0.40930766513599437,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004988368662619263,
|
|
"loss": 6.0413,
|
|
"mean_token_accuracy": 0.15715423077344895,
|
|
"num_tokens": 8650503.0,
|
|
"step": 4635
|
|
},
|
|
{
|
|
"entropy": 6.191125011444091,
|
|
"epoch": 0.4097492052278347,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004988336661317994,
|
|
"loss": 6.0652,
|
|
"mean_token_accuracy": 0.15463789254426957,
|
|
"num_tokens": 8659125.0,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"entropy": 6.157907199859619,
|
|
"epoch": 0.41019074531967503,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004988304616168984,
|
|
"loss": 6.0841,
|
|
"mean_token_accuracy": 0.15441161543130874,
|
|
"num_tokens": 8668193.0,
|
|
"step": 4645
|
|
},
|
|
{
|
|
"entropy": 6.116786527633667,
|
|
"epoch": 0.41063228541151536,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004988272527172858,
|
|
"loss": 6.0384,
|
|
"mean_token_accuracy": 0.15524010509252548,
|
|
"num_tokens": 8677515.0,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"entropy": 6.21679277420044,
|
|
"epoch": 0.4110738255033557,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004988240394330246,
|
|
"loss": 6.0764,
|
|
"mean_token_accuracy": 0.14973534047603607,
|
|
"num_tokens": 8687549.0,
|
|
"step": 4655
|
|
},
|
|
{
|
|
"entropy": 6.157021522521973,
|
|
"epoch": 0.411515365595196,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004988208217641778,
|
|
"loss": 6.0639,
|
|
"mean_token_accuracy": 0.15594548732042313,
|
|
"num_tokens": 8697371.0,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"entropy": 6.13969612121582,
|
|
"epoch": 0.41195690568703636,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004988175997108086,
|
|
"loss": 6.0735,
|
|
"mean_token_accuracy": 0.15609194859862327,
|
|
"num_tokens": 8707193.0,
|
|
"step": 4665
|
|
},
|
|
{
|
|
"entropy": 6.1602945804595945,
|
|
"epoch": 0.41239844577887674,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004988143732729797,
|
|
"loss": 6.0644,
|
|
"mean_token_accuracy": 0.15450926274061202,
|
|
"num_tokens": 8716052.0,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"entropy": 6.081143188476562,
|
|
"epoch": 0.4128399858707171,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004988111424507546,
|
|
"loss": 6.0209,
|
|
"mean_token_accuracy": 0.1590244859457016,
|
|
"num_tokens": 8726140.0,
|
|
"step": 4675
|
|
},
|
|
{
|
|
"entropy": 6.134521245956421,
|
|
"epoch": 0.4132815259625574,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004988079072441964,
|
|
"loss": 6.0028,
|
|
"mean_token_accuracy": 0.16453344523906707,
|
|
"num_tokens": 8735299.0,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"entropy": 6.157465314865112,
|
|
"epoch": 0.41372306605439774,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004988046676533687,
|
|
"loss": 6.0908,
|
|
"mean_token_accuracy": 0.15370513945817948,
|
|
"num_tokens": 8744686.0,
|
|
"step": 4685
|
|
},
|
|
{
|
|
"entropy": 6.140842294692993,
|
|
"epoch": 0.41416460614623807,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004988014236783347,
|
|
"loss": 6.063,
|
|
"mean_token_accuracy": 0.1632717102766037,
|
|
"num_tokens": 8754942.0,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"entropy": 6.232805824279785,
|
|
"epoch": 0.4146061462380784,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004987981753191582,
|
|
"loss": 6.0523,
|
|
"mean_token_accuracy": 0.15246548503637314,
|
|
"num_tokens": 8764054.0,
|
|
"step": 4695
|
|
},
|
|
{
|
|
"entropy": 6.0288361549377445,
|
|
"epoch": 0.41504768632991873,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004987949225759027,
|
|
"loss": 5.9405,
|
|
"mean_token_accuracy": 0.15906094312667846,
|
|
"num_tokens": 8773050.0,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"entropy": 6.1974766731262205,
|
|
"epoch": 0.4154892264217591,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004987916654486321,
|
|
"loss": 6.0544,
|
|
"mean_token_accuracy": 0.1537775442004204,
|
|
"num_tokens": 8782476.0,
|
|
"step": 4705
|
|
},
|
|
{
|
|
"entropy": 6.254334783554077,
|
|
"epoch": 0.41593076651359945,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004987884039374099,
|
|
"loss": 6.1212,
|
|
"mean_token_accuracy": 0.1502728283405304,
|
|
"num_tokens": 8791147.0,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"entropy": 6.237736749649048,
|
|
"epoch": 0.4163723066054398,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004987851380423001,
|
|
"loss": 6.1069,
|
|
"mean_token_accuracy": 0.1571350358426571,
|
|
"num_tokens": 8801151.0,
|
|
"step": 4715
|
|
},
|
|
{
|
|
"entropy": 6.067666149139404,
|
|
"epoch": 0.4168138466972801,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004987818677633668,
|
|
"loss": 6.0577,
|
|
"mean_token_accuracy": 0.14970119222998618,
|
|
"num_tokens": 8809587.0,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"entropy": 6.103376960754394,
|
|
"epoch": 0.41725538678912044,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000498778593100674,
|
|
"loss": 5.94,
|
|
"mean_token_accuracy": 0.15202507078647615,
|
|
"num_tokens": 8818168.0,
|
|
"step": 4725
|
|
},
|
|
{
|
|
"entropy": 6.131261110305786,
|
|
"epoch": 0.4176969268809608,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004987753140542857,
|
|
"loss": 6.0105,
|
|
"mean_token_accuracy": 0.16188293397426606,
|
|
"num_tokens": 8827477.0,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"entropy": 6.104963493347168,
|
|
"epoch": 0.4181384669728011,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004987720306242664,
|
|
"loss": 5.9847,
|
|
"mean_token_accuracy": 0.16286925673484803,
|
|
"num_tokens": 8837067.0,
|
|
"step": 4735
|
|
},
|
|
{
|
|
"entropy": 6.133073472976685,
|
|
"epoch": 0.4185800070646415,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004987687428106803,
|
|
"loss": 6.0872,
|
|
"mean_token_accuracy": 0.16172488033771515,
|
|
"num_tokens": 8845790.0,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"entropy": 6.185379123687744,
|
|
"epoch": 0.4190215471564818,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004987654506135917,
|
|
"loss": 6.0226,
|
|
"mean_token_accuracy": 0.15944662541151047,
|
|
"num_tokens": 8855242.0,
|
|
"step": 4745
|
|
},
|
|
{
|
|
"entropy": 6.192673587799073,
|
|
"epoch": 0.41946308724832215,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004987621540330652,
|
|
"loss": 6.0723,
|
|
"mean_token_accuracy": 0.15567026063799858,
|
|
"num_tokens": 8864459.0,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"entropy": 6.19939341545105,
|
|
"epoch": 0.4199046273401625,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004987588530691653,
|
|
"loss": 6.1385,
|
|
"mean_token_accuracy": 0.14711003005504608,
|
|
"num_tokens": 8875028.0,
|
|
"step": 4755
|
|
},
|
|
{
|
|
"entropy": 6.135413789749146,
|
|
"epoch": 0.4203461674320028,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004987555477219569,
|
|
"loss": 5.9786,
|
|
"mean_token_accuracy": 0.15931818783283233,
|
|
"num_tokens": 8883857.0,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"entropy": 6.110503196716309,
|
|
"epoch": 0.42078770752384315,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004987522379915045,
|
|
"loss": 5.9705,
|
|
"mean_token_accuracy": 0.15814343243837356,
|
|
"num_tokens": 8893499.0,
|
|
"step": 4765
|
|
},
|
|
{
|
|
"entropy": 6.095895624160766,
|
|
"epoch": 0.4212292476156835,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000498748923877873,
|
|
"loss": 5.9891,
|
|
"mean_token_accuracy": 0.1591852620244026,
|
|
"num_tokens": 8903368.0,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"entropy": 6.126567029953003,
|
|
"epoch": 0.42167078770752386,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004987456053811273,
|
|
"loss": 6.0966,
|
|
"mean_token_accuracy": 0.1539273589849472,
|
|
"num_tokens": 8912701.0,
|
|
"step": 4775
|
|
},
|
|
{
|
|
"entropy": 6.1158490657806395,
|
|
"epoch": 0.4221123277993642,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004987422825013325,
|
|
"loss": 6.0444,
|
|
"mean_token_accuracy": 0.15109995752573013,
|
|
"num_tokens": 8921962.0,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"entropy": 6.1833864688873295,
|
|
"epoch": 0.4225538678912045,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004987389552385536,
|
|
"loss": 6.0307,
|
|
"mean_token_accuracy": 0.15406385958194732,
|
|
"num_tokens": 8931923.0,
|
|
"step": 4785
|
|
},
|
|
{
|
|
"entropy": 6.131430625915527,
|
|
"epoch": 0.42299540798304486,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004987356235928558,
|
|
"loss": 6.0635,
|
|
"mean_token_accuracy": 0.1470622941851616,
|
|
"num_tokens": 8940403.0,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"entropy": 6.131550025939942,
|
|
"epoch": 0.4234369480748852,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004987322875643044,
|
|
"loss": 5.9887,
|
|
"mean_token_accuracy": 0.16142310500144957,
|
|
"num_tokens": 8949377.0,
|
|
"step": 4795
|
|
},
|
|
{
|
|
"entropy": 6.2069591045379635,
|
|
"epoch": 0.4238784881667255,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004987289471529647,
|
|
"loss": 6.1304,
|
|
"mean_token_accuracy": 0.1479768604040146,
|
|
"num_tokens": 8958719.0,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"entropy": 6.165348720550537,
|
|
"epoch": 0.42432002825856585,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004987256023589022,
|
|
"loss": 6.1048,
|
|
"mean_token_accuracy": 0.14835046380758285,
|
|
"num_tokens": 8968226.0,
|
|
"step": 4805
|
|
},
|
|
{
|
|
"entropy": 6.13749942779541,
|
|
"epoch": 0.42476156835040624,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004987222531821824,
|
|
"loss": 6.0528,
|
|
"mean_token_accuracy": 0.14842675924301146,
|
|
"num_tokens": 8976670.0,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"entropy": 6.09095401763916,
|
|
"epoch": 0.42520310844224657,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004987188996228709,
|
|
"loss": 5.9901,
|
|
"mean_token_accuracy": 0.16065036058425902,
|
|
"num_tokens": 8986185.0,
|
|
"step": 4815
|
|
},
|
|
{
|
|
"entropy": 6.155765628814697,
|
|
"epoch": 0.4256446485340869,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004987155416810334,
|
|
"loss": 5.9644,
|
|
"mean_token_accuracy": 0.16488435715436936,
|
|
"num_tokens": 8995124.0,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"entropy": 6.094462633132935,
|
|
"epoch": 0.42608618862592723,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004987121793567356,
|
|
"loss": 6.0704,
|
|
"mean_token_accuracy": 0.15586088821291924,
|
|
"num_tokens": 9004380.0,
|
|
"step": 4825
|
|
},
|
|
{
|
|
"entropy": 6.142639970779419,
|
|
"epoch": 0.42652772871776756,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004987088126500436,
|
|
"loss": 5.963,
|
|
"mean_token_accuracy": 0.16008084118366242,
|
|
"num_tokens": 9013791.0,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"entropy": 6.069281530380249,
|
|
"epoch": 0.4269692688096079,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000498705441561023,
|
|
"loss": 5.9998,
|
|
"mean_token_accuracy": 0.16714757829904556,
|
|
"num_tokens": 9023076.0,
|
|
"step": 4835
|
|
},
|
|
{
|
|
"entropy": 6.315524005889893,
|
|
"epoch": 0.4274108089014482,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004987020660897401,
|
|
"loss": 6.1089,
|
|
"mean_token_accuracy": 0.14946657419204712,
|
|
"num_tokens": 9032720.0,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"entropy": 6.199988222122192,
|
|
"epoch": 0.4278523489932886,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000498698686236261,
|
|
"loss": 6.0577,
|
|
"mean_token_accuracy": 0.1480626255273819,
|
|
"num_tokens": 9042652.0,
|
|
"step": 4845
|
|
},
|
|
{
|
|
"entropy": 6.085298490524292,
|
|
"epoch": 0.42829388908512894,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004986953020006519,
|
|
"loss": 6.1376,
|
|
"mean_token_accuracy": 0.14969860166311263,
|
|
"num_tokens": 9052172.0,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"entropy": 6.134743070602417,
|
|
"epoch": 0.4287354291769693,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004986919133829788,
|
|
"loss": 5.9956,
|
|
"mean_token_accuracy": 0.15985623747110367,
|
|
"num_tokens": 9061798.0,
|
|
"step": 4855
|
|
},
|
|
{
|
|
"entropy": 6.040613985061645,
|
|
"epoch": 0.4291769692688096,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004986885203833086,
|
|
"loss": 5.9499,
|
|
"mean_token_accuracy": 0.15626863837242128,
|
|
"num_tokens": 9070429.0,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"entropy": 6.098531293869018,
|
|
"epoch": 0.42961850936064994,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004986851230017075,
|
|
"loss": 6.0492,
|
|
"mean_token_accuracy": 0.16342882812023163,
|
|
"num_tokens": 9080200.0,
|
|
"step": 4865
|
|
},
|
|
{
|
|
"entropy": 6.163259077072143,
|
|
"epoch": 0.43006004945249027,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004986817212382419,
|
|
"loss": 6.0382,
|
|
"mean_token_accuracy": 0.15508455336093901,
|
|
"num_tokens": 9089555.0,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"entropy": 6.126989889144897,
|
|
"epoch": 0.4305015895443306,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004986783150929786,
|
|
"loss": 6.0263,
|
|
"mean_token_accuracy": 0.15248029232025145,
|
|
"num_tokens": 9099091.0,
|
|
"step": 4875
|
|
},
|
|
{
|
|
"entropy": 6.172487115859985,
|
|
"epoch": 0.430943129636171,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004986749045659845,
|
|
"loss": 6.1075,
|
|
"mean_token_accuracy": 0.14817112535238267,
|
|
"num_tokens": 9109008.0,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"entropy": 6.152895927429199,
|
|
"epoch": 0.4313846697280113,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004986714896573261,
|
|
"loss": 5.9513,
|
|
"mean_token_accuracy": 0.16084639877080917,
|
|
"num_tokens": 9117882.0,
|
|
"step": 4885
|
|
},
|
|
{
|
|
"entropy": 6.139704322814941,
|
|
"epoch": 0.43182620981985165,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004986680703670704,
|
|
"loss": 6.1215,
|
|
"mean_token_accuracy": 0.15487379878759383,
|
|
"num_tokens": 9126860.0,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"entropy": 6.141199541091919,
|
|
"epoch": 0.432267749911692,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004986646466952845,
|
|
"loss": 6.0119,
|
|
"mean_token_accuracy": 0.15635189563035964,
|
|
"num_tokens": 9135819.0,
|
|
"step": 4895
|
|
},
|
|
{
|
|
"entropy": 6.10592885017395,
|
|
"epoch": 0.4327092900035323,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004986612186420353,
|
|
"loss": 5.9934,
|
|
"mean_token_accuracy": 0.15510470867156984,
|
|
"num_tokens": 9145302.0,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"entropy": 6.044864749908447,
|
|
"epoch": 0.43315083009537264,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004986577862073901,
|
|
"loss": 6.0575,
|
|
"mean_token_accuracy": 0.15394357293844224,
|
|
"num_tokens": 9154667.0,
|
|
"step": 4905
|
|
},
|
|
{
|
|
"entropy": 6.2363903522491455,
|
|
"epoch": 0.43359237018721297,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004986543493914159,
|
|
"loss": 6.098,
|
|
"mean_token_accuracy": 0.14959411323070526,
|
|
"num_tokens": 9164562.0,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"entropy": 6.208088731765747,
|
|
"epoch": 0.43403391027905336,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004986509081941805,
|
|
"loss": 6.1071,
|
|
"mean_token_accuracy": 0.15276289731264114,
|
|
"num_tokens": 9174872.0,
|
|
"step": 4915
|
|
},
|
|
{
|
|
"entropy": 6.079789733886718,
|
|
"epoch": 0.4344754503708937,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004986474626157507,
|
|
"loss": 5.862,
|
|
"mean_token_accuracy": 0.17531196177005767,
|
|
"num_tokens": 9184322.0,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"entropy": 6.058652734756469,
|
|
"epoch": 0.434916990462734,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004986440126561945,
|
|
"loss": 5.9894,
|
|
"mean_token_accuracy": 0.15689075142145156,
|
|
"num_tokens": 9194450.0,
|
|
"step": 4925
|
|
},
|
|
{
|
|
"entropy": 6.151858711242676,
|
|
"epoch": 0.43535853055457435,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004986405583155792,
|
|
"loss": 6.0305,
|
|
"mean_token_accuracy": 0.15203123837709426,
|
|
"num_tokens": 9203658.0,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"entropy": 6.100604057312012,
|
|
"epoch": 0.4358000706464147,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004986370995939725,
|
|
"loss": 6.0239,
|
|
"mean_token_accuracy": 0.15403898507356645,
|
|
"num_tokens": 9213609.0,
|
|
"step": 4935
|
|
},
|
|
{
|
|
"entropy": 6.112157011032105,
|
|
"epoch": 0.436241610738255,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004986336364914423,
|
|
"loss": 6.0414,
|
|
"mean_token_accuracy": 0.15200137123465537,
|
|
"num_tokens": 9222704.0,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"entropy": 6.147404766082763,
|
|
"epoch": 0.43668315083009535,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004986301690080564,
|
|
"loss": 6.0472,
|
|
"mean_token_accuracy": 0.1494756668806076,
|
|
"num_tokens": 9231599.0,
|
|
"step": 4945
|
|
},
|
|
{
|
|
"entropy": 6.176404428482056,
|
|
"epoch": 0.43712469092193573,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004986266971438826,
|
|
"loss": 6.0761,
|
|
"mean_token_accuracy": 0.15215079635381698,
|
|
"num_tokens": 9241886.0,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"entropy": 6.142639207839966,
|
|
"epoch": 0.43756623101377606,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000498623220898989,
|
|
"loss": 6.0651,
|
|
"mean_token_accuracy": 0.1492701292037964,
|
|
"num_tokens": 9251085.0,
|
|
"step": 4955
|
|
},
|
|
{
|
|
"entropy": 6.155983543395996,
|
|
"epoch": 0.4380077711056164,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004986197402734436,
|
|
"loss": 6.0272,
|
|
"mean_token_accuracy": 0.15382544845342636,
|
|
"num_tokens": 9259601.0,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"entropy": 6.18567385673523,
|
|
"epoch": 0.4384493111974567,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004986162552673148,
|
|
"loss": 6.0699,
|
|
"mean_token_accuracy": 0.15705521255731583,
|
|
"num_tokens": 9268935.0,
|
|
"step": 4965
|
|
},
|
|
{
|
|
"entropy": 6.133043241500855,
|
|
"epoch": 0.43889085128929706,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004986127658806706,
|
|
"loss": 6.0813,
|
|
"mean_token_accuracy": 0.15293170362710953,
|
|
"num_tokens": 9277647.0,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"entropy": 6.120587873458862,
|
|
"epoch": 0.4393323913811374,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004986092721135796,
|
|
"loss": 6.0199,
|
|
"mean_token_accuracy": 0.15239207521080972,
|
|
"num_tokens": 9286610.0,
|
|
"step": 4975
|
|
},
|
|
{
|
|
"entropy": 6.135066556930542,
|
|
"epoch": 0.4397739314729777,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004986057739661101,
|
|
"loss": 6.1032,
|
|
"mean_token_accuracy": 0.1508208692073822,
|
|
"num_tokens": 9295946.0,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"entropy": 6.1115028858184814,
|
|
"epoch": 0.4402154715648181,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004986022714383307,
|
|
"loss": 6.0042,
|
|
"mean_token_accuracy": 0.1543491631746292,
|
|
"num_tokens": 9304903.0,
|
|
"step": 4985
|
|
},
|
|
{
|
|
"entropy": 6.15944766998291,
|
|
"epoch": 0.44065701165665844,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004985987645303099,
|
|
"loss": 5.9185,
|
|
"mean_token_accuracy": 0.16130532771348954,
|
|
"num_tokens": 9313606.0,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"entropy": 6.016908359527588,
|
|
"epoch": 0.44109855174849877,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004985952532421164,
|
|
"loss": 6.0275,
|
|
"mean_token_accuracy": 0.15621849447488784,
|
|
"num_tokens": 9322815.0,
|
|
"step": 4995
|
|
},
|
|
{
|
|
"entropy": 6.17736701965332,
|
|
"epoch": 0.4415400918403391,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004985917375738193,
|
|
"loss": 5.9593,
|
|
"mean_token_accuracy": 0.15536014586687089,
|
|
"num_tokens": 9332630.0,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"entropy": 6.067361211776733,
|
|
"epoch": 0.44198163193217943,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004985882175254871,
|
|
"loss": 5.9846,
|
|
"mean_token_accuracy": 0.15410226881504058,
|
|
"num_tokens": 9342216.0,
|
|
"step": 5005
|
|
},
|
|
{
|
|
"entropy": 6.146738815307617,
|
|
"epoch": 0.44242317202401976,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004985846930971887,
|
|
"loss": 6.0431,
|
|
"mean_token_accuracy": 0.1559235379099846,
|
|
"num_tokens": 9352295.0,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"entropy": 6.107598447799683,
|
|
"epoch": 0.4428647121158601,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004985811642889937,
|
|
"loss": 6.0348,
|
|
"mean_token_accuracy": 0.15718846172094345,
|
|
"num_tokens": 9361462.0,
|
|
"step": 5015
|
|
},
|
|
{
|
|
"entropy": 6.144611740112305,
|
|
"epoch": 0.4433062522077005,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004985776311009705,
|
|
"loss": 5.9684,
|
|
"mean_token_accuracy": 0.15345567613840103,
|
|
"num_tokens": 9370379.0,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"entropy": 6.103248453140258,
|
|
"epoch": 0.4437477922995408,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004985740935331888,
|
|
"loss": 6.0098,
|
|
"mean_token_accuracy": 0.15823583900928498,
|
|
"num_tokens": 9379255.0,
|
|
"step": 5025
|
|
},
|
|
{
|
|
"entropy": 6.085464525222778,
|
|
"epoch": 0.44418933239138114,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004985705515857177,
|
|
"loss": 5.99,
|
|
"mean_token_accuracy": 0.14899933189153672,
|
|
"num_tokens": 9389313.0,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"entropy": 6.020923233032226,
|
|
"epoch": 0.4446308724832215,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004985670052586268,
|
|
"loss": 5.7916,
|
|
"mean_token_accuracy": 0.17029385417699813,
|
|
"num_tokens": 9397778.0,
|
|
"step": 5035
|
|
},
|
|
{
|
|
"entropy": 6.082318449020386,
|
|
"epoch": 0.4450724125750618,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004985634545519853,
|
|
"loss": 6.0589,
|
|
"mean_token_accuracy": 0.15298160612583162,
|
|
"num_tokens": 9407831.0,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"entropy": 6.0772803783416744,
|
|
"epoch": 0.44551395266690214,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004985598994658629,
|
|
"loss": 5.9669,
|
|
"mean_token_accuracy": 0.15956881046295165,
|
|
"num_tokens": 9418458.0,
|
|
"step": 5045
|
|
},
|
|
{
|
|
"entropy": 6.016619396209717,
|
|
"epoch": 0.44595549275874247,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004985563400003291,
|
|
"loss": 5.8707,
|
|
"mean_token_accuracy": 0.17422997653484346,
|
|
"num_tokens": 9426911.0,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"entropy": 6.14707236289978,
|
|
"epoch": 0.44639703285058285,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004985527761554539,
|
|
"loss": 6.0541,
|
|
"mean_token_accuracy": 0.1595864400267601,
|
|
"num_tokens": 9435896.0,
|
|
"step": 5055
|
|
},
|
|
{
|
|
"entropy": 6.207449245452881,
|
|
"epoch": 0.4468385729424232,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000498549207931307,
|
|
"loss": 6.0612,
|
|
"mean_token_accuracy": 0.15011950582265854,
|
|
"num_tokens": 9445272.0,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"entropy": 6.06727409362793,
|
|
"epoch": 0.4472801130342635,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004985456353279581,
|
|
"loss": 6.0228,
|
|
"mean_token_accuracy": 0.15027147233486177,
|
|
"num_tokens": 9455377.0,
|
|
"step": 5065
|
|
},
|
|
{
|
|
"entropy": 6.108924150466919,
|
|
"epoch": 0.44772165312610385,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004985420583454774,
|
|
"loss": 6.0477,
|
|
"mean_token_accuracy": 0.1528654247522354,
|
|
"num_tokens": 9464918.0,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"entropy": 6.118135070800781,
|
|
"epoch": 0.4481631932179442,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004985384769839349,
|
|
"loss": 6.0416,
|
|
"mean_token_accuracy": 0.15041410326957702,
|
|
"num_tokens": 9473322.0,
|
|
"step": 5075
|
|
},
|
|
{
|
|
"entropy": 6.191469526290893,
|
|
"epoch": 0.4486047333097845,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004985348912434008,
|
|
"loss": 5.969,
|
|
"mean_token_accuracy": 0.16189600080251693,
|
|
"num_tokens": 9482255.0,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"entropy": 6.092324352264404,
|
|
"epoch": 0.44904627340162484,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004985313011239452,
|
|
"loss": 5.9959,
|
|
"mean_token_accuracy": 0.1581245869398117,
|
|
"num_tokens": 9491709.0,
|
|
"step": 5085
|
|
},
|
|
{
|
|
"entropy": 6.024058151245117,
|
|
"epoch": 0.4494878134934652,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004985277066256388,
|
|
"loss": 5.9807,
|
|
"mean_token_accuracy": 0.16034325063228608,
|
|
"num_tokens": 9500594.0,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"entropy": 6.112436008453369,
|
|
"epoch": 0.44992935358530556,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004985241077485515,
|
|
"loss": 6.0018,
|
|
"mean_token_accuracy": 0.15738717019557952,
|
|
"num_tokens": 9509088.0,
|
|
"step": 5095
|
|
},
|
|
{
|
|
"entropy": 6.149496126174927,
|
|
"epoch": 0.4503708936771459,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004985205044927541,
|
|
"loss": 5.9725,
|
|
"mean_token_accuracy": 0.15938366055488587,
|
|
"num_tokens": 9517776.0,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"entropy": 6.069735097885132,
|
|
"epoch": 0.4508124337689862,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004985168968583173,
|
|
"loss": 6.0348,
|
|
"mean_token_accuracy": 0.15837667435407637,
|
|
"num_tokens": 9527080.0,
|
|
"step": 5105
|
|
},
|
|
{
|
|
"entropy": 6.106137084960937,
|
|
"epoch": 0.45125397386082655,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004985132848453114,
|
|
"loss": 5.8952,
|
|
"mean_token_accuracy": 0.16738586127758026,
|
|
"num_tokens": 9536358.0,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"entropy": 6.04980616569519,
|
|
"epoch": 0.4516955139526669,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004985096684538075,
|
|
"loss": 5.9813,
|
|
"mean_token_accuracy": 0.15662433505058287,
|
|
"num_tokens": 9545528.0,
|
|
"step": 5115
|
|
},
|
|
{
|
|
"entropy": 6.060785722732544,
|
|
"epoch": 0.4521370540445072,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004985060476838763,
|
|
"loss": 6.0113,
|
|
"mean_token_accuracy": 0.15891691744327546,
|
|
"num_tokens": 9554433.0,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"entropy": 6.073312520980835,
|
|
"epoch": 0.4525785941363476,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004985024225355887,
|
|
"loss": 5.9656,
|
|
"mean_token_accuracy": 0.1543935567140579,
|
|
"num_tokens": 9563932.0,
|
|
"step": 5125
|
|
},
|
|
{
|
|
"entropy": 6.100546646118164,
|
|
"epoch": 0.45302013422818793,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004984987930090158,
|
|
"loss": 5.9552,
|
|
"mean_token_accuracy": 0.15897123962640763,
|
|
"num_tokens": 9572829.0,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"entropy": 6.025969839096069,
|
|
"epoch": 0.45346167432002826,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004984951591042285,
|
|
"loss": 6.0045,
|
|
"mean_token_accuracy": 0.16426790058612822,
|
|
"num_tokens": 9583597.0,
|
|
"step": 5135
|
|
},
|
|
{
|
|
"entropy": 6.24551477432251,
|
|
"epoch": 0.4539032144118686,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004984915208212983,
|
|
"loss": 6.0912,
|
|
"mean_token_accuracy": 0.15156230181455613,
|
|
"num_tokens": 9593114.0,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"entropy": 6.167973184585572,
|
|
"epoch": 0.4543447545037089,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004984878781602964,
|
|
"loss": 5.9722,
|
|
"mean_token_accuracy": 0.1515617176890373,
|
|
"num_tokens": 9601454.0,
|
|
"step": 5145
|
|
},
|
|
{
|
|
"entropy": 6.117550992965699,
|
|
"epoch": 0.45478629459554926,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004984842311212939,
|
|
"loss": 6.0266,
|
|
"mean_token_accuracy": 0.1547075927257538,
|
|
"num_tokens": 9611994.0,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"entropy": 6.099160146713257,
|
|
"epoch": 0.4552278346873896,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004984805797043625,
|
|
"loss": 5.9819,
|
|
"mean_token_accuracy": 0.1554704263806343,
|
|
"num_tokens": 9621318.0,
|
|
"step": 5155
|
|
},
|
|
{
|
|
"entropy": 6.086650943756103,
|
|
"epoch": 0.45566937477923,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004984769239095736,
|
|
"loss": 5.9871,
|
|
"mean_token_accuracy": 0.1622908428311348,
|
|
"num_tokens": 9630270.0,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"entropy": 6.161284017562866,
|
|
"epoch": 0.4561109148710703,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004984732637369989,
|
|
"loss": 6.0164,
|
|
"mean_token_accuracy": 0.15295830443501474,
|
|
"num_tokens": 9640391.0,
|
|
"step": 5165
|
|
},
|
|
{
|
|
"entropy": 6.111026573181152,
|
|
"epoch": 0.45655245496291064,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004984695991867099,
|
|
"loss": 6.0302,
|
|
"mean_token_accuracy": 0.15423648655414582,
|
|
"num_tokens": 9648827.0,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"entropy": 6.066336679458618,
|
|
"epoch": 0.45699399505475097,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004984659302587788,
|
|
"loss": 5.9651,
|
|
"mean_token_accuracy": 0.1540757015347481,
|
|
"num_tokens": 9657940.0,
|
|
"step": 5175
|
|
},
|
|
{
|
|
"entropy": 6.132708024978638,
|
|
"epoch": 0.4574355351465913,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000498462256953277,
|
|
"loss": 6.0864,
|
|
"mean_token_accuracy": 0.15138714611530305,
|
|
"num_tokens": 9668296.0,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"entropy": 6.187513446807861,
|
|
"epoch": 0.45787707523843163,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004984585792702767,
|
|
"loss": 6.0401,
|
|
"mean_token_accuracy": 0.14838093519210815,
|
|
"num_tokens": 9677914.0,
|
|
"step": 5185
|
|
},
|
|
{
|
|
"entropy": 6.183677721023559,
|
|
"epoch": 0.45831861533027196,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004984548972098501,
|
|
"loss": 6.174,
|
|
"mean_token_accuracy": 0.13970668166875838,
|
|
"num_tokens": 9687339.0,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"entropy": 6.133589363098144,
|
|
"epoch": 0.45876015542211235,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.000498451210772069,
|
|
"loss": 5.9454,
|
|
"mean_token_accuracy": 0.16496401354670526,
|
|
"num_tokens": 9696889.0,
|
|
"step": 5195
|
|
},
|
|
{
|
|
"entropy": 6.0567436695098875,
|
|
"epoch": 0.4592016955139527,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004984475199570058,
|
|
"loss": 5.9136,
|
|
"mean_token_accuracy": 0.1610390767455101,
|
|
"num_tokens": 9705498.0,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"entropy": 6.099144554138183,
|
|
"epoch": 0.459643235605793,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0004984438247647329,
|
|
"loss": 5.9902,
|
|
"mean_token_accuracy": 0.15816803127527237,
|
|
"num_tokens": 9713936.0,
|
|
"step": 5205
|
|
},
|
|
{
|
|
"entropy": 6.152586174011231,
|
|
"epoch": 0.46008477569763334,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004984401251953223,
|
|
"loss": 6.06,
|
|
"mean_token_accuracy": 0.15645882338285447,
|
|
"num_tokens": 9723924.0,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"entropy": 6.179743766784668,
|
|
"epoch": 0.4605263157894737,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004984364212488469,
|
|
"loss": 6.0873,
|
|
"mean_token_accuracy": 0.15282048285007477,
|
|
"num_tokens": 9733427.0,
|
|
"step": 5215
|
|
},
|
|
{
|
|
"entropy": 6.035888767242431,
|
|
"epoch": 0.460967855881314,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004984327129253789,
|
|
"loss": 5.8837,
|
|
"mean_token_accuracy": 0.16696648448705673,
|
|
"num_tokens": 9742054.0,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"entropy": 6.049908781051636,
|
|
"epoch": 0.46140939597315433,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004984290002249914,
|
|
"loss": 6.063,
|
|
"mean_token_accuracy": 0.1488596171140671,
|
|
"num_tokens": 9752447.0,
|
|
"step": 5225
|
|
},
|
|
{
|
|
"entropy": 6.131543159484863,
|
|
"epoch": 0.4618509360649947,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004984252831477567,
|
|
"loss": 5.8886,
|
|
"mean_token_accuracy": 0.16800648123025894,
|
|
"num_tokens": 9760878.0,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"entropy": 5.97862868309021,
|
|
"epoch": 0.46229247615683505,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004984215616937477,
|
|
"loss": 5.9523,
|
|
"mean_token_accuracy": 0.1603931352496147,
|
|
"num_tokens": 9770200.0,
|
|
"step": 5235
|
|
},
|
|
{
|
|
"entropy": 6.214956617355346,
|
|
"epoch": 0.4627340162486754,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004984178358630374,
|
|
"loss": 6.0469,
|
|
"mean_token_accuracy": 0.15920519679784775,
|
|
"num_tokens": 9780303.0,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"entropy": 6.130382776260376,
|
|
"epoch": 0.4631755563405157,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004984141056556989,
|
|
"loss": 5.9529,
|
|
"mean_token_accuracy": 0.16305534839630126,
|
|
"num_tokens": 9790248.0,
|
|
"step": 5245
|
|
},
|
|
{
|
|
"entropy": 6.11019229888916,
|
|
"epoch": 0.46361709643235605,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004984103710718051,
|
|
"loss": 6.0503,
|
|
"mean_token_accuracy": 0.1460764303803444,
|
|
"num_tokens": 9799345.0,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"entropy": 6.091036796569824,
|
|
"epoch": 0.4640586365241964,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000498406632111429,
|
|
"loss": 5.9308,
|
|
"mean_token_accuracy": 0.15862552225589752,
|
|
"num_tokens": 9808561.0,
|
|
"step": 5255
|
|
},
|
|
{
|
|
"entropy": 6.106261253356934,
|
|
"epoch": 0.4645001766160367,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004984028887746443,
|
|
"loss": 5.9739,
|
|
"mean_token_accuracy": 0.16280067563056946,
|
|
"num_tokens": 9818324.0,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"entropy": 5.984898710250855,
|
|
"epoch": 0.4649417167078771,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004983991410615239,
|
|
"loss": 5.9645,
|
|
"mean_token_accuracy": 0.16320008635520936,
|
|
"num_tokens": 9827935.0,
|
|
"step": 5265
|
|
},
|
|
{
|
|
"entropy": 6.089527177810669,
|
|
"epoch": 0.4653832567997174,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004983953889721414,
|
|
"loss": 5.9782,
|
|
"mean_token_accuracy": 0.15890799909830094,
|
|
"num_tokens": 9837118.0,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"entropy": 6.149358510971069,
|
|
"epoch": 0.46582479689155776,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004983916325065703,
|
|
"loss": 6.0455,
|
|
"mean_token_accuracy": 0.1533641681075096,
|
|
"num_tokens": 9846197.0,
|
|
"step": 5275
|
|
},
|
|
{
|
|
"entropy": 6.196761894226074,
|
|
"epoch": 0.4662663369833981,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004983878716648842,
|
|
"loss": 6.107,
|
|
"mean_token_accuracy": 0.14858163744211197,
|
|
"num_tokens": 9856120.0,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"entropy": 6.119464254379272,
|
|
"epoch": 0.4667078770752384,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004983841064471567,
|
|
"loss": 6.056,
|
|
"mean_token_accuracy": 0.1537718027830124,
|
|
"num_tokens": 9865599.0,
|
|
"step": 5285
|
|
},
|
|
{
|
|
"entropy": 6.096899557113647,
|
|
"epoch": 0.46714941716707875,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004983803368534617,
|
|
"loss": 5.9688,
|
|
"mean_token_accuracy": 0.15628019720315933,
|
|
"num_tokens": 9876471.0,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"entropy": 6.196185064315796,
|
|
"epoch": 0.4675909572589191,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004983765628838728,
|
|
"loss": 6.0124,
|
|
"mean_token_accuracy": 0.15632506608963012,
|
|
"num_tokens": 9887680.0,
|
|
"step": 5295
|
|
},
|
|
{
|
|
"entropy": 6.205327701568604,
|
|
"epoch": 0.46803249735075947,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004983727845384641,
|
|
"loss": 6.0798,
|
|
"mean_token_accuracy": 0.15046066045761108,
|
|
"num_tokens": 9897366.0,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"entropy": 6.112229824066162,
|
|
"epoch": 0.4684740374425998,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004983690018173096,
|
|
"loss": 6.0473,
|
|
"mean_token_accuracy": 0.15282203108072281,
|
|
"num_tokens": 9907437.0,
|
|
"step": 5305
|
|
},
|
|
{
|
|
"entropy": 6.03530478477478,
|
|
"epoch": 0.46891557753444013,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004983652147204834,
|
|
"loss": 5.8493,
|
|
"mean_token_accuracy": 0.16888897120952606,
|
|
"num_tokens": 9915766.0,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"entropy": 6.039136505126953,
|
|
"epoch": 0.46935711762628046,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004983614232480598,
|
|
"loss": 6.0557,
|
|
"mean_token_accuracy": 0.1539039731025696,
|
|
"num_tokens": 9925389.0,
|
|
"step": 5315
|
|
},
|
|
{
|
|
"entropy": 6.1289163589477536,
|
|
"epoch": 0.4697986577181208,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004983576274001127,
|
|
"loss": 6.0973,
|
|
"mean_token_accuracy": 0.15069840773940085,
|
|
"num_tokens": 9935798.0,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"entropy": 6.164807653427124,
|
|
"epoch": 0.4702401978099611,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000498353827176717,
|
|
"loss": 5.9409,
|
|
"mean_token_accuracy": 0.16221913695335388,
|
|
"num_tokens": 9945579.0,
|
|
"step": 5325
|
|
},
|
|
{
|
|
"entropy": 6.1347403049469,
|
|
"epoch": 0.4706817379018015,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004983500225779466,
|
|
"loss": 5.9683,
|
|
"mean_token_accuracy": 0.15579652935266494,
|
|
"num_tokens": 9955248.0,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"entropy": 6.085616779327393,
|
|
"epoch": 0.47112327799364184,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004983462136038764,
|
|
"loss": 5.9757,
|
|
"mean_token_accuracy": 0.16684675961732864,
|
|
"num_tokens": 9965078.0,
|
|
"step": 5335
|
|
},
|
|
{
|
|
"entropy": 6.127761745452881,
|
|
"epoch": 0.4715648180854822,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004983424002545809,
|
|
"loss": 5.9417,
|
|
"mean_token_accuracy": 0.16493815779685975,
|
|
"num_tokens": 9973932.0,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"entropy": 6.168215322494507,
|
|
"epoch": 0.4720063581773225,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004983385825301348,
|
|
"loss": 6.0002,
|
|
"mean_token_accuracy": 0.1548996612429619,
|
|
"num_tokens": 9983315.0,
|
|
"step": 5345
|
|
},
|
|
{
|
|
"entropy": 6.058854579925537,
|
|
"epoch": 0.47244789826916284,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004983347604306129,
|
|
"loss": 5.9641,
|
|
"mean_token_accuracy": 0.15613725483417512,
|
|
"num_tokens": 9992987.0,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"entropy": 6.019079732894897,
|
|
"epoch": 0.47288943836100317,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004983309339560899,
|
|
"loss": 6.0237,
|
|
"mean_token_accuracy": 0.15275818705558777,
|
|
"num_tokens": 10002268.0,
|
|
"step": 5355
|
|
},
|
|
{
|
|
"entropy": 6.169502782821655,
|
|
"epoch": 0.4733309784528435,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004983271031066412,
|
|
"loss": 6.0223,
|
|
"mean_token_accuracy": 0.14990446120500564,
|
|
"num_tokens": 10011772.0,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"entropy": 6.224719524383545,
|
|
"epoch": 0.4737725185446839,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004983232678823414,
|
|
"loss": 6.0393,
|
|
"mean_token_accuracy": 0.15731042325496675,
|
|
"num_tokens": 10021069.0,
|
|
"step": 5365
|
|
},
|
|
{
|
|
"entropy": 6.063888311386108,
|
|
"epoch": 0.4742140586365242,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004983194282832657,
|
|
"loss": 5.9557,
|
|
"mean_token_accuracy": 0.161125111579895,
|
|
"num_tokens": 10029706.0,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"entropy": 5.960548067092896,
|
|
"epoch": 0.47465559872836455,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004983155843094895,
|
|
"loss": 5.8997,
|
|
"mean_token_accuracy": 0.16574549674987793,
|
|
"num_tokens": 10039633.0,
|
|
"step": 5375
|
|
},
|
|
{
|
|
"entropy": 6.088515663146973,
|
|
"epoch": 0.4750971388202049,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004983117359610881,
|
|
"loss": 5.9683,
|
|
"mean_token_accuracy": 0.1593565970659256,
|
|
"num_tokens": 10048675.0,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"entropy": 6.142480039596558,
|
|
"epoch": 0.4755386789120452,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004983078832381367,
|
|
"loss": 5.9571,
|
|
"mean_token_accuracy": 0.16286074072122575,
|
|
"num_tokens": 10057447.0,
|
|
"step": 5385
|
|
},
|
|
{
|
|
"entropy": 6.040254259109497,
|
|
"epoch": 0.47598021900388554,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004983040261407109,
|
|
"loss": 6.0401,
|
|
"mean_token_accuracy": 0.15268651247024537,
|
|
"num_tokens": 10067020.0,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"entropy": 6.183468341827393,
|
|
"epoch": 0.47642175909572587,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004983001646688863,
|
|
"loss": 6.0737,
|
|
"mean_token_accuracy": 0.15229557305574418,
|
|
"num_tokens": 10076466.0,
|
|
"step": 5395
|
|
},
|
|
{
|
|
"entropy": 6.156057071685791,
|
|
"epoch": 0.47686329918756626,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004982962988227383,
|
|
"loss": 5.9383,
|
|
"mean_token_accuracy": 0.1562927931547165,
|
|
"num_tokens": 10085384.0,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"entropy": 6.085453033447266,
|
|
"epoch": 0.4773048392794066,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.000498292428602343,
|
|
"loss": 6.0543,
|
|
"mean_token_accuracy": 0.15518272593617438,
|
|
"num_tokens": 10094015.0,
|
|
"step": 5405
|
|
},
|
|
{
|
|
"entropy": 6.131287050247193,
|
|
"epoch": 0.4777463793712469,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004982885540077758,
|
|
"loss": 5.9489,
|
|
"mean_token_accuracy": 0.16021449863910675,
|
|
"num_tokens": 10103107.0,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"entropy": 6.061293983459473,
|
|
"epoch": 0.47818791946308725,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004982846750391129,
|
|
"loss": 5.883,
|
|
"mean_token_accuracy": 0.165081886947155,
|
|
"num_tokens": 10112408.0,
|
|
"step": 5415
|
|
},
|
|
{
|
|
"entropy": 6.103114128112793,
|
|
"epoch": 0.4786294595549276,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004982807916964303,
|
|
"loss": 5.9587,
|
|
"mean_token_accuracy": 0.15787963271141053,
|
|
"num_tokens": 10121605.0,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"entropy": 6.012549638748169,
|
|
"epoch": 0.4790709996467679,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.000498276903979804,
|
|
"loss": 5.9534,
|
|
"mean_token_accuracy": 0.1598804622888565,
|
|
"num_tokens": 10130484.0,
|
|
"step": 5425
|
|
},
|
|
{
|
|
"entropy": 5.984193134307861,
|
|
"epoch": 0.47951253973860825,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00049827301188931,
|
|
"loss": 5.906,
|
|
"mean_token_accuracy": 0.16575224399566652,
|
|
"num_tokens": 10140002.0,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"entropy": 6.203652048110962,
|
|
"epoch": 0.47995407983044863,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004982691154250247,
|
|
"loss": 6.0681,
|
|
"mean_token_accuracy": 0.1568043977022171,
|
|
"num_tokens": 10150287.0,
|
|
"step": 5435
|
|
},
|
|
{
|
|
"entropy": 6.098856639862061,
|
|
"epoch": 0.48039561992228896,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004982652145870245,
|
|
"loss": 5.9027,
|
|
"mean_token_accuracy": 0.16656555682420732,
|
|
"num_tokens": 10160615.0,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"entropy": 5.997969436645508,
|
|
"epoch": 0.4808371600141293,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004982613093753856,
|
|
"loss": 5.9701,
|
|
"mean_token_accuracy": 0.16098668649792672,
|
|
"num_tokens": 10170271.0,
|
|
"step": 5445
|
|
},
|
|
{
|
|
"entropy": 6.051625394821167,
|
|
"epoch": 0.4812787001059696,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004982573997901847,
|
|
"loss": 5.9498,
|
|
"mean_token_accuracy": 0.16217143833637238,
|
|
"num_tokens": 10179663.0,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"entropy": 6.167967081069946,
|
|
"epoch": 0.48172024019780996,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004982534858314982,
|
|
"loss": 6.0302,
|
|
"mean_token_accuracy": 0.1533224031329155,
|
|
"num_tokens": 10188608.0,
|
|
"step": 5455
|
|
},
|
|
{
|
|
"entropy": 6.123821449279785,
|
|
"epoch": 0.4821617802896503,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004982495674994031,
|
|
"loss": 5.9888,
|
|
"mean_token_accuracy": 0.1592209592461586,
|
|
"num_tokens": 10198462.0,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"entropy": 5.982554721832275,
|
|
"epoch": 0.4826033203814906,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004982456447939758,
|
|
"loss": 5.9323,
|
|
"mean_token_accuracy": 0.16437650620937347,
|
|
"num_tokens": 10208362.0,
|
|
"step": 5465
|
|
},
|
|
{
|
|
"entropy": 6.075630807876587,
|
|
"epoch": 0.483044860473331,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004982417177152933,
|
|
"loss": 5.955,
|
|
"mean_token_accuracy": 0.1612927421927452,
|
|
"num_tokens": 10217915.0,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"entropy": 6.136414337158203,
|
|
"epoch": 0.48348640056517134,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004982377862634325,
|
|
"loss": 6.0141,
|
|
"mean_token_accuracy": 0.16026580333709717,
|
|
"num_tokens": 10226913.0,
|
|
"step": 5475
|
|
},
|
|
{
|
|
"entropy": 6.0771276473999025,
|
|
"epoch": 0.48392794065701167,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004982338504384705,
|
|
"loss": 5.9819,
|
|
"mean_token_accuracy": 0.15748471468687059,
|
|
"num_tokens": 10236516.0,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"entropy": 6.1290271282196045,
|
|
"epoch": 0.484369480748852,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004982299102404843,
|
|
"loss": 5.9977,
|
|
"mean_token_accuracy": 0.15856396406888962,
|
|
"num_tokens": 10245492.0,
|
|
"step": 5485
|
|
},
|
|
{
|
|
"entropy": 5.977679443359375,
|
|
"epoch": 0.48481102084069233,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.000498225965669551,
|
|
"loss": 5.8679,
|
|
"mean_token_accuracy": 0.17316461503505706,
|
|
"num_tokens": 10254094.0,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"entropy": 6.03317437171936,
|
|
"epoch": 0.48525256093253266,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004982220167257482,
|
|
"loss": 5.9069,
|
|
"mean_token_accuracy": 0.1637963816523552,
|
|
"num_tokens": 10263645.0,
|
|
"step": 5495
|
|
},
|
|
{
|
|
"entropy": 6.013968801498413,
|
|
"epoch": 0.485694101024373,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004982180634091529,
|
|
"loss": 5.8919,
|
|
"mean_token_accuracy": 0.16816270351409912,
|
|
"num_tokens": 10273416.0,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"entropy": 6.0709892272949215,
|
|
"epoch": 0.4861356411162134,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004982141057198427,
|
|
"loss": 5.9987,
|
|
"mean_token_accuracy": 0.15827764123678206,
|
|
"num_tokens": 10283194.0,
|
|
"step": 5505
|
|
},
|
|
{
|
|
"entropy": 6.133906984329224,
|
|
"epoch": 0.4865771812080537,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004982101436578952,
|
|
"loss": 5.9737,
|
|
"mean_token_accuracy": 0.15922945886850357,
|
|
"num_tokens": 10292188.0,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"entropy": 6.0081017971038815,
|
|
"epoch": 0.48701872129989404,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004982061772233878,
|
|
"loss": 5.9009,
|
|
"mean_token_accuracy": 0.17360175549983978,
|
|
"num_tokens": 10301539.0,
|
|
"step": 5515
|
|
},
|
|
{
|
|
"entropy": 6.000823640823365,
|
|
"epoch": 0.4874602613917344,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004982022064163984,
|
|
"loss": 5.9093,
|
|
"mean_token_accuracy": 0.16490670889616013,
|
|
"num_tokens": 10310379.0,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"entropy": 6.19787015914917,
|
|
"epoch": 0.4879018014835747,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004981982312370047,
|
|
"loss": 6.1108,
|
|
"mean_token_accuracy": 0.1488179437816143,
|
|
"num_tokens": 10320622.0,
|
|
"step": 5525
|
|
},
|
|
{
|
|
"entropy": 6.114509153366089,
|
|
"epoch": 0.48834334157541504,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004981942516852847,
|
|
"loss": 5.9356,
|
|
"mean_token_accuracy": 0.15839738100767137,
|
|
"num_tokens": 10330525.0,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"entropy": 6.041282749176025,
|
|
"epoch": 0.48878488166725537,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004981902677613161,
|
|
"loss": 5.9671,
|
|
"mean_token_accuracy": 0.1586508110165596,
|
|
"num_tokens": 10340288.0,
|
|
"step": 5535
|
|
},
|
|
{
|
|
"entropy": 6.1574572086334225,
|
|
"epoch": 0.48922642175909575,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004981862794651771,
|
|
"loss": 6.0482,
|
|
"mean_token_accuracy": 0.15438254177570343,
|
|
"num_tokens": 10349836.0,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"entropy": 6.124531698226929,
|
|
"epoch": 0.4896679618509361,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004981822867969459,
|
|
"loss": 5.9835,
|
|
"mean_token_accuracy": 0.1561306193470955,
|
|
"num_tokens": 10359481.0,
|
|
"step": 5545
|
|
},
|
|
{
|
|
"entropy": 6.077211523056031,
|
|
"epoch": 0.4901095019427764,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004981782897567006,
|
|
"loss": 5.9606,
|
|
"mean_token_accuracy": 0.16075108498334884,
|
|
"num_tokens": 10369837.0,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"entropy": 6.108371019363403,
|
|
"epoch": 0.49055104203461675,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004981742883445195,
|
|
"loss": 5.927,
|
|
"mean_token_accuracy": 0.16440120637416838,
|
|
"num_tokens": 10379276.0,
|
|
"step": 5555
|
|
},
|
|
{
|
|
"entropy": 6.127094459533692,
|
|
"epoch": 0.4909925821264571,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000498170282560481,
|
|
"loss": 6.0562,
|
|
"mean_token_accuracy": 0.15287164598703384,
|
|
"num_tokens": 10388878.0,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"entropy": 6.158031272888183,
|
|
"epoch": 0.4914341222182974,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004981662724046637,
|
|
"loss": 6.0416,
|
|
"mean_token_accuracy": 0.15418365895748137,
|
|
"num_tokens": 10399148.0,
|
|
"step": 5565
|
|
},
|
|
{
|
|
"entropy": 6.072212409973145,
|
|
"epoch": 0.49187566231013774,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000498162257877146,
|
|
"loss": 5.9705,
|
|
"mean_token_accuracy": 0.16276238560676576,
|
|
"num_tokens": 10409002.0,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"entropy": 6.052406740188599,
|
|
"epoch": 0.4923172024019781,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004981582389780065,
|
|
"loss": 6.0182,
|
|
"mean_token_accuracy": 0.15543360412120819,
|
|
"num_tokens": 10418083.0,
|
|
"step": 5575
|
|
},
|
|
{
|
|
"entropy": 6.2516388416290285,
|
|
"epoch": 0.49275874249381846,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004981542157073241,
|
|
"loss": 6.0216,
|
|
"mean_token_accuracy": 0.15745319724082946,
|
|
"num_tokens": 10428134.0,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"entropy": 6.102121114730835,
|
|
"epoch": 0.4932002825856588,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004981501880651775,
|
|
"loss": 6.0144,
|
|
"mean_token_accuracy": 0.15700092390179635,
|
|
"num_tokens": 10437436.0,
|
|
"step": 5585
|
|
},
|
|
{
|
|
"entropy": 6.011652183532715,
|
|
"epoch": 0.4936418226774991,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004981461560516457,
|
|
"loss": 5.9437,
|
|
"mean_token_accuracy": 0.15731440335512162,
|
|
"num_tokens": 10446176.0,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"entropy": 6.185711860656738,
|
|
"epoch": 0.49408336276933945,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004981421196668075,
|
|
"loss": 6.1086,
|
|
"mean_token_accuracy": 0.15340851247310638,
|
|
"num_tokens": 10455953.0,
|
|
"step": 5595
|
|
},
|
|
{
|
|
"entropy": 6.20690336227417,
|
|
"epoch": 0.4945249028611798,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004981380789107422,
|
|
"loss": 5.9589,
|
|
"mean_token_accuracy": 0.15672969669103623,
|
|
"num_tokens": 10465129.0,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"entropy": 6.003274393081665,
|
|
"epoch": 0.4949664429530201,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004981340337835287,
|
|
"loss": 5.9608,
|
|
"mean_token_accuracy": 0.1587096706032753,
|
|
"num_tokens": 10474376.0,
|
|
"step": 5605
|
|
},
|
|
{
|
|
"entropy": 6.04667649269104,
|
|
"epoch": 0.4954079830448605,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004981299842852464,
|
|
"loss": 6.02,
|
|
"mean_token_accuracy": 0.1601344585418701,
|
|
"num_tokens": 10484849.0,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"entropy": 6.119048452377319,
|
|
"epoch": 0.49584952313670083,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004981259304159747,
|
|
"loss": 5.9104,
|
|
"mean_token_accuracy": 0.17004732936620712,
|
|
"num_tokens": 10493950.0,
|
|
"step": 5615
|
|
},
|
|
{
|
|
"entropy": 6.0551595211029055,
|
|
"epoch": 0.49629106322854116,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004981218721757929,
|
|
"loss": 5.877,
|
|
"mean_token_accuracy": 0.16873401552438735,
|
|
"num_tokens": 10502890.0,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"entropy": 5.9873803615570065,
|
|
"epoch": 0.4967326033203815,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004981178095647805,
|
|
"loss": 5.8707,
|
|
"mean_token_accuracy": 0.160433566570282,
|
|
"num_tokens": 10511702.0,
|
|
"step": 5625
|
|
},
|
|
{
|
|
"entropy": 6.003235149383545,
|
|
"epoch": 0.4971741434122218,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004981137425830171,
|
|
"loss": 5.9481,
|
|
"mean_token_accuracy": 0.16074998080730438,
|
|
"num_tokens": 10520212.0,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"entropy": 6.066579341888428,
|
|
"epoch": 0.49761568350406216,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004981096712305825,
|
|
"loss": 5.971,
|
|
"mean_token_accuracy": 0.1613484501838684,
|
|
"num_tokens": 10529652.0,
|
|
"step": 5635
|
|
},
|
|
{
|
|
"entropy": 6.099881601333618,
|
|
"epoch": 0.4980572235959025,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004981055955075561,
|
|
"loss": 6.0225,
|
|
"mean_token_accuracy": 0.158574703335762,
|
|
"num_tokens": 10537980.0,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"entropy": 6.089739847183227,
|
|
"epoch": 0.4984987636877429,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004981015154140181,
|
|
"loss": 5.8995,
|
|
"mean_token_accuracy": 0.16764541566371918,
|
|
"num_tokens": 10546854.0,
|
|
"step": 5645
|
|
},
|
|
{
|
|
"entropy": 5.998034429550171,
|
|
"epoch": 0.4989403037795832,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004980974309500483,
|
|
"loss": 5.923,
|
|
"mean_token_accuracy": 0.16142944097518921,
|
|
"num_tokens": 10555691.0,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"entropy": 6.080042123794556,
|
|
"epoch": 0.49938184387142354,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004980933421157267,
|
|
"loss": 6.0191,
|
|
"mean_token_accuracy": 0.1503999724984169,
|
|
"num_tokens": 10564750.0,
|
|
"step": 5655
|
|
},
|
|
{
|
|
"entropy": 6.1378124237060545,
|
|
"epoch": 0.49982338396326387,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004980892489111334,
|
|
"loss": 6.0389,
|
|
"mean_token_accuracy": 0.15747978240251542,
|
|
"num_tokens": 10573515.0,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"entropy": 6.055855131149292,
|
|
"epoch": 0.5002649240551043,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004980851513363486,
|
|
"loss": 5.9161,
|
|
"mean_token_accuracy": 0.1634524017572403,
|
|
"num_tokens": 10582730.0,
|
|
"step": 5665
|
|
},
|
|
{
|
|
"entropy": 5.963370323181152,
|
|
"epoch": 0.5007064641469445,
|
|
"grad_norm": 5.15625,
|
|
"learning_rate": 0.0004980810493914526,
|
|
"loss": 5.8141,
|
|
"mean_token_accuracy": 0.18174145370721817,
|
|
"num_tokens": 10591267.0,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"entropy": 6.040016841888428,
|
|
"epoch": 0.5011480042387849,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004980769430765256,
|
|
"loss": 5.918,
|
|
"mean_token_accuracy": 0.15779468268156052,
|
|
"num_tokens": 10599776.0,
|
|
"step": 5675
|
|
},
|
|
{
|
|
"entropy": 6.018471956253052,
|
|
"epoch": 0.5015895443306252,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004980728323916484,
|
|
"loss": 5.9294,
|
|
"mean_token_accuracy": 0.15198549777269363,
|
|
"num_tokens": 10609670.0,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"entropy": 6.0253712177276615,
|
|
"epoch": 0.5020310844224656,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004980687173369009,
|
|
"loss": 5.8857,
|
|
"mean_token_accuracy": 0.16054306030273438,
|
|
"num_tokens": 10619032.0,
|
|
"step": 5685
|
|
},
|
|
{
|
|
"entropy": 6.13008975982666,
|
|
"epoch": 0.5024726245143059,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004980645979123644,
|
|
"loss": 5.9696,
|
|
"mean_token_accuracy": 0.15761574804782869,
|
|
"num_tokens": 10628851.0,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"entropy": 6.1254706382751465,
|
|
"epoch": 0.5029141646061462,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004980604741181192,
|
|
"loss": 5.9384,
|
|
"mean_token_accuracy": 0.1661163553595543,
|
|
"num_tokens": 10638335.0,
|
|
"step": 5695
|
|
},
|
|
{
|
|
"entropy": 6.104246044158936,
|
|
"epoch": 0.5033557046979866,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004980563459542461,
|
|
"loss": 6.0288,
|
|
"mean_token_accuracy": 0.15877759456634521,
|
|
"num_tokens": 10648450.0,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"entropy": 6.129976463317871,
|
|
"epoch": 0.5037972447898269,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004980522134208261,
|
|
"loss": 6.0499,
|
|
"mean_token_accuracy": 0.15289226770401002,
|
|
"num_tokens": 10658700.0,
|
|
"step": 5705
|
|
},
|
|
{
|
|
"entropy": 6.125590181350708,
|
|
"epoch": 0.5042387848816673,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004980480765179401,
|
|
"loss": 6.0498,
|
|
"mean_token_accuracy": 0.15848255753517151,
|
|
"num_tokens": 10667952.0,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"entropy": 6.015667390823364,
|
|
"epoch": 0.5046803249735076,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004980439352456692,
|
|
"loss": 5.8856,
|
|
"mean_token_accuracy": 0.16176477670669556,
|
|
"num_tokens": 10677228.0,
|
|
"step": 5715
|
|
},
|
|
{
|
|
"entropy": 6.1053516387939455,
|
|
"epoch": 0.505121865065348,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004980397896040944,
|
|
"loss": 5.974,
|
|
"mean_token_accuracy": 0.16118671298027037,
|
|
"num_tokens": 10686183.0,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"entropy": 6.190397262573242,
|
|
"epoch": 0.5055634051571882,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004980356395932969,
|
|
"loss": 5.9997,
|
|
"mean_token_accuracy": 0.15813823491334916,
|
|
"num_tokens": 10695293.0,
|
|
"step": 5725
|
|
},
|
|
{
|
|
"entropy": 6.0877281665802006,
|
|
"epoch": 0.5060049452490286,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004980314852133581,
|
|
"loss": 5.9647,
|
|
"mean_token_accuracy": 0.1628525137901306,
|
|
"num_tokens": 10704853.0,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"entropy": 6.098628950119019,
|
|
"epoch": 0.506446485340869,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004980273264643594,
|
|
"loss": 5.9596,
|
|
"mean_token_accuracy": 0.15651024580001832,
|
|
"num_tokens": 10714307.0,
|
|
"step": 5735
|
|
},
|
|
{
|
|
"entropy": 5.91804838180542,
|
|
"epoch": 0.5068880254327093,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004980231633463822,
|
|
"loss": 5.8195,
|
|
"mean_token_accuracy": 0.16056904792785645,
|
|
"num_tokens": 10723513.0,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"entropy": 6.041822719573974,
|
|
"epoch": 0.5073295655245497,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004980189958595081,
|
|
"loss": 6.009,
|
|
"mean_token_accuracy": 0.15504895150661469,
|
|
"num_tokens": 10732809.0,
|
|
"step": 5745
|
|
},
|
|
{
|
|
"entropy": 6.180126619338989,
|
|
"epoch": 0.5077711056163899,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004980148240038186,
|
|
"loss": 6.0285,
|
|
"mean_token_accuracy": 0.16252532303333284,
|
|
"num_tokens": 10742960.0,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"entropy": 6.167002391815186,
|
|
"epoch": 0.5082126457082303,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004980106477793957,
|
|
"loss": 6.0962,
|
|
"mean_token_accuracy": 0.15922853201627732,
|
|
"num_tokens": 10752835.0,
|
|
"step": 5755
|
|
},
|
|
{
|
|
"entropy": 6.090708923339844,
|
|
"epoch": 0.5086541858000706,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004980064671863209,
|
|
"loss": 5.9568,
|
|
"mean_token_accuracy": 0.16255878955125808,
|
|
"num_tokens": 10762139.0,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"entropy": 6.121759986877441,
|
|
"epoch": 0.509095725891911,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004980022822246763,
|
|
"loss": 6.0284,
|
|
"mean_token_accuracy": 0.1527095004916191,
|
|
"num_tokens": 10771961.0,
|
|
"step": 5765
|
|
},
|
|
{
|
|
"entropy": 6.189744853973389,
|
|
"epoch": 0.5095372659837514,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004979980928945439,
|
|
"loss": 6.0799,
|
|
"mean_token_accuracy": 0.14973168522119523,
|
|
"num_tokens": 10780163.0,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"entropy": 6.200748443603516,
|
|
"epoch": 0.5099788060755917,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004979938991960056,
|
|
"loss": 6.0327,
|
|
"mean_token_accuracy": 0.156923408806324,
|
|
"num_tokens": 10788339.0,
|
|
"step": 5775
|
|
},
|
|
{
|
|
"entropy": 6.119477415084839,
|
|
"epoch": 0.510420346167432,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004979897011291436,
|
|
"loss": 6.0257,
|
|
"mean_token_accuracy": 0.15160492956638336,
|
|
"num_tokens": 10797315.0,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"entropy": 6.097667026519775,
|
|
"epoch": 0.5108618862592723,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004979854986940402,
|
|
"loss": 5.9344,
|
|
"mean_token_accuracy": 0.15801928341388702,
|
|
"num_tokens": 10806864.0,
|
|
"step": 5785
|
|
},
|
|
{
|
|
"entropy": 6.078370189666748,
|
|
"epoch": 0.5113034263511127,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004979812918907777,
|
|
"loss": 5.971,
|
|
"mean_token_accuracy": 0.15703734010457993,
|
|
"num_tokens": 10816417.0,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"entropy": 6.111645221710205,
|
|
"epoch": 0.511744966442953,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004979770807194385,
|
|
"loss": 6.0491,
|
|
"mean_token_accuracy": 0.15200948417186738,
|
|
"num_tokens": 10825692.0,
|
|
"step": 5795
|
|
},
|
|
{
|
|
"entropy": 6.125649499893188,
|
|
"epoch": 0.5121865065347934,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004979728651801051,
|
|
"loss": 6.0184,
|
|
"mean_token_accuracy": 0.1593027725815773,
|
|
"num_tokens": 10834652.0,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"entropy": 6.142418766021729,
|
|
"epoch": 0.5126280466266337,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004979686452728602,
|
|
"loss": 6.0045,
|
|
"mean_token_accuracy": 0.15610153079032899,
|
|
"num_tokens": 10844491.0,
|
|
"step": 5805
|
|
},
|
|
{
|
|
"entropy": 6.076530647277832,
|
|
"epoch": 0.513069586718474,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004979644209977863,
|
|
"loss": 5.9576,
|
|
"mean_token_accuracy": 0.16532048285007478,
|
|
"num_tokens": 10853948.0,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"entropy": 6.112010383605957,
|
|
"epoch": 0.5135111268103144,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004979601923549661,
|
|
"loss": 5.9547,
|
|
"mean_token_accuracy": 0.16036146879196167,
|
|
"num_tokens": 10861745.0,
|
|
"step": 5815
|
|
},
|
|
{
|
|
"entropy": 6.1195940494537355,
|
|
"epoch": 0.5139526669021547,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004979559593444826,
|
|
"loss": 6.0383,
|
|
"mean_token_accuracy": 0.1518269196152687,
|
|
"num_tokens": 10871239.0,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"entropy": 6.060505533218384,
|
|
"epoch": 0.5143942069939951,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004979517219664187,
|
|
"loss": 5.8893,
|
|
"mean_token_accuracy": 0.16240942627191543,
|
|
"num_tokens": 10880883.0,
|
|
"step": 5825
|
|
},
|
|
{
|
|
"entropy": 6.087244081497192,
|
|
"epoch": 0.5148357470858353,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004979474802208572,
|
|
"loss": 5.9569,
|
|
"mean_token_accuracy": 0.16193697676062585,
|
|
"num_tokens": 10890002.0,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"entropy": 6.015149307250977,
|
|
"epoch": 0.5152772871776757,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004979432341078816,
|
|
"loss": 5.8864,
|
|
"mean_token_accuracy": 0.15554073452949524,
|
|
"num_tokens": 10898299.0,
|
|
"step": 5835
|
|
},
|
|
{
|
|
"entropy": 6.089050912857056,
|
|
"epoch": 0.5157188272695161,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004979389836275746,
|
|
"loss": 6.0774,
|
|
"mean_token_accuracy": 0.14996080696582795,
|
|
"num_tokens": 10907918.0,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"entropy": 6.133493900299072,
|
|
"epoch": 0.5161603673613564,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004979347287800198,
|
|
"loss": 5.9596,
|
|
"mean_token_accuracy": 0.15501177459955215,
|
|
"num_tokens": 10917318.0,
|
|
"step": 5845
|
|
},
|
|
{
|
|
"entropy": 6.060728597640991,
|
|
"epoch": 0.5166019074531968,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004979304695653005,
|
|
"loss": 5.9144,
|
|
"mean_token_accuracy": 0.16151465103030205,
|
|
"num_tokens": 10926338.0,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"entropy": 5.939118814468384,
|
|
"epoch": 0.5170434475450371,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004979262059835001,
|
|
"loss": 5.9432,
|
|
"mean_token_accuracy": 0.16033163890242577,
|
|
"num_tokens": 10935244.0,
|
|
"step": 5855
|
|
},
|
|
{
|
|
"entropy": 6.143280649185181,
|
|
"epoch": 0.5174849876368774,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004979219380347021,
|
|
"loss": 5.9905,
|
|
"mean_token_accuracy": 0.15961788594722748,
|
|
"num_tokens": 10943764.0,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"entropy": 6.160701942443848,
|
|
"epoch": 0.5179265277287177,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004979176657189901,
|
|
"loss": 5.9547,
|
|
"mean_token_accuracy": 0.16057219803333284,
|
|
"num_tokens": 10952580.0,
|
|
"step": 5865
|
|
},
|
|
{
|
|
"entropy": 5.9959807872772215,
|
|
"epoch": 0.5183680678205581,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004979133890364477,
|
|
"loss": 5.9825,
|
|
"mean_token_accuracy": 0.1650010645389557,
|
|
"num_tokens": 10961745.0,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"entropy": 5.959691524505615,
|
|
"epoch": 0.5188096079123985,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000497909107987159,
|
|
"loss": 5.9087,
|
|
"mean_token_accuracy": 0.15640757530927657,
|
|
"num_tokens": 10971018.0,
|
|
"step": 5875
|
|
},
|
|
{
|
|
"entropy": 6.110277318954468,
|
|
"epoch": 0.5192511480042388,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004979048225712074,
|
|
"loss": 5.9408,
|
|
"mean_token_accuracy": 0.1647150531411171,
|
|
"num_tokens": 10980070.0,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"entropy": 6.112304973602295,
|
|
"epoch": 0.5196926880960792,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004979005327886772,
|
|
"loss": 5.9115,
|
|
"mean_token_accuracy": 0.15777890086174012,
|
|
"num_tokens": 10989090.0,
|
|
"step": 5885
|
|
},
|
|
{
|
|
"entropy": 6.028247594833374,
|
|
"epoch": 0.5201342281879194,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004978962386396525,
|
|
"loss": 5.9676,
|
|
"mean_token_accuracy": 0.16010279804468155,
|
|
"num_tokens": 10998652.0,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"entropy": 5.936372900009156,
|
|
"epoch": 0.5205757682797598,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004978919401242171,
|
|
"loss": 5.8702,
|
|
"mean_token_accuracy": 0.17004906684160231,
|
|
"num_tokens": 11006323.0,
|
|
"step": 5895
|
|
},
|
|
{
|
|
"entropy": 5.953699111938477,
|
|
"epoch": 0.5210173083716001,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004978876372424553,
|
|
"loss": 5.8013,
|
|
"mean_token_accuracy": 0.17337664365768432,
|
|
"num_tokens": 11015054.0,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"entropy": 6.08972134590149,
|
|
"epoch": 0.5214588484634405,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004978833299944515,
|
|
"loss": 5.9875,
|
|
"mean_token_accuracy": 0.15530108660459518,
|
|
"num_tokens": 11025787.0,
|
|
"step": 5905
|
|
},
|
|
{
|
|
"entropy": 6.201382303237915,
|
|
"epoch": 0.5219003885552809,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004978790183802901,
|
|
"loss": 6.0957,
|
|
"mean_token_accuracy": 0.15121813490986824,
|
|
"num_tokens": 11036091.0,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"entropy": 6.078470230102539,
|
|
"epoch": 0.5223419286471211,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004978747024000554,
|
|
"loss": 6.0798,
|
|
"mean_token_accuracy": 0.15142182558774947,
|
|
"num_tokens": 11046442.0,
|
|
"step": 5915
|
|
},
|
|
{
|
|
"entropy": 6.137741947174073,
|
|
"epoch": 0.5227834687389615,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004978703820538321,
|
|
"loss": 6.0767,
|
|
"mean_token_accuracy": 0.15347778424620628,
|
|
"num_tokens": 11056907.0,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"entropy": 6.088362789154052,
|
|
"epoch": 0.5232250088308018,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004978660573417048,
|
|
"loss": 5.9226,
|
|
"mean_token_accuracy": 0.165810264647007,
|
|
"num_tokens": 11065298.0,
|
|
"step": 5925
|
|
},
|
|
{
|
|
"entropy": 6.02788200378418,
|
|
"epoch": 0.5236665489226422,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.000497861728263758,
|
|
"loss": 5.9277,
|
|
"mean_token_accuracy": 0.16316543370485306,
|
|
"num_tokens": 11074509.0,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"entropy": 6.140137195587158,
|
|
"epoch": 0.5241080890144825,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004978573948200769,
|
|
"loss": 5.9728,
|
|
"mean_token_accuracy": 0.16266270875930786,
|
|
"num_tokens": 11082433.0,
|
|
"step": 5935
|
|
},
|
|
{
|
|
"entropy": 6.069718503952027,
|
|
"epoch": 0.5245496291063229,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004978530570107461,
|
|
"loss": 5.9336,
|
|
"mean_token_accuracy": 0.16215700507164002,
|
|
"num_tokens": 11090838.0,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"entropy": 6.054668140411377,
|
|
"epoch": 0.5249911691981632,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004978487148358506,
|
|
"loss": 5.9288,
|
|
"mean_token_accuracy": 0.1637731358408928,
|
|
"num_tokens": 11100048.0,
|
|
"step": 5945
|
|
},
|
|
{
|
|
"entropy": 5.982475614547729,
|
|
"epoch": 0.5254327092900035,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004978443682954756,
|
|
"loss": 5.9389,
|
|
"mean_token_accuracy": 0.16071433573961258,
|
|
"num_tokens": 11109871.0,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"entropy": 6.093014669418335,
|
|
"epoch": 0.5258742493818439,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004978400173897061,
|
|
"loss": 5.924,
|
|
"mean_token_accuracy": 0.1567403718829155,
|
|
"num_tokens": 11119450.0,
|
|
"step": 5955
|
|
},
|
|
{
|
|
"entropy": 6.094651889801026,
|
|
"epoch": 0.5263157894736842,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004978356621186275,
|
|
"loss": 5.8755,
|
|
"mean_token_accuracy": 0.17035853564739228,
|
|
"num_tokens": 11128321.0,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"entropy": 6.019528484344482,
|
|
"epoch": 0.5267573295655246,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004978313024823249,
|
|
"loss": 5.9397,
|
|
"mean_token_accuracy": 0.15585954636335372,
|
|
"num_tokens": 11138657.0,
|
|
"step": 5965
|
|
},
|
|
{
|
|
"entropy": 6.009906387329101,
|
|
"epoch": 0.5271988696573648,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004978269384808839,
|
|
"loss": 5.9209,
|
|
"mean_token_accuracy": 0.16706853806972505,
|
|
"num_tokens": 11147704.0,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"entropy": 6.089446401596069,
|
|
"epoch": 0.5276404097492052,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004978225701143898,
|
|
"loss": 6.0138,
|
|
"mean_token_accuracy": 0.1562186986207962,
|
|
"num_tokens": 11156979.0,
|
|
"step": 5975
|
|
},
|
|
{
|
|
"entropy": 6.104236221313476,
|
|
"epoch": 0.5280819498410456,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004978181973829284,
|
|
"loss": 5.8985,
|
|
"mean_token_accuracy": 0.15810604989528657,
|
|
"num_tokens": 11165539.0,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"entropy": 6.11846137046814,
|
|
"epoch": 0.5285234899328859,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004978138202865851,
|
|
"loss": 5.9758,
|
|
"mean_token_accuracy": 0.1503559224307537,
|
|
"num_tokens": 11175911.0,
|
|
"step": 5985
|
|
},
|
|
{
|
|
"entropy": 6.050204277038574,
|
|
"epoch": 0.5289650300247263,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004978094388254459,
|
|
"loss": 5.9883,
|
|
"mean_token_accuracy": 0.15720559507608414,
|
|
"num_tokens": 11185365.0,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"entropy": 6.095414018630981,
|
|
"epoch": 0.5294065701165666,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004978050529995965,
|
|
"loss": 5.8979,
|
|
"mean_token_accuracy": 0.16393667906522752,
|
|
"num_tokens": 11195318.0,
|
|
"step": 5995
|
|
},
|
|
{
|
|
"entropy": 6.074192810058594,
|
|
"epoch": 0.5298481102084069,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004978006628091228,
|
|
"loss": 6.0079,
|
|
"mean_token_accuracy": 0.15748494416475295,
|
|
"num_tokens": 11205246.0,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 0.5298481102084069,
|
|
"eval_entropy": 5.929435700441406,
|
|
"eval_loss": 5.984138011932373,
|
|
"eval_mean_token_accuracy": 0.16603976909265278,
|
|
"eval_num_tokens": 11205246.0,
|
|
"eval_runtime": 26.1397,
|
|
"eval_samples_per_second": 1351.009,
|
|
"eval_steps_per_second": 168.9,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"entropy": 6.097824764251709,
|
|
"epoch": 0.5302896503002472,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004977962682541108,
|
|
"loss": 5.9636,
|
|
"mean_token_accuracy": 0.15681301951408386,
|
|
"num_tokens": 11215183.0,
|
|
"step": 6005
|
|
},
|
|
{
|
|
"entropy": 6.0872814655303955,
|
|
"epoch": 0.5307311903920876,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004977918693346467,
|
|
"loss": 5.9167,
|
|
"mean_token_accuracy": 0.15939552038908006,
|
|
"num_tokens": 11224127.0,
|
|
"step": 6010
|
|
},
|
|
{
|
|
"entropy": 6.113217401504516,
|
|
"epoch": 0.531172730483928,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004977874660508165,
|
|
"loss": 5.9723,
|
|
"mean_token_accuracy": 0.16238286793231965,
|
|
"num_tokens": 11233014.0,
|
|
"step": 6015
|
|
},
|
|
{
|
|
"entropy": 6.072064352035523,
|
|
"epoch": 0.5316142705757683,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004977830584027065,
|
|
"loss": 5.9707,
|
|
"mean_token_accuracy": 0.16605914533138275,
|
|
"num_tokens": 11242251.0,
|
|
"step": 6020
|
|
},
|
|
{
|
|
"entropy": 6.053228425979614,
|
|
"epoch": 0.5320558106676087,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004977786463904033,
|
|
"loss": 5.9956,
|
|
"mean_token_accuracy": 0.1588194712996483,
|
|
"num_tokens": 11251947.0,
|
|
"step": 6025
|
|
},
|
|
{
|
|
"entropy": 6.073520278930664,
|
|
"epoch": 0.5324973507594489,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004977742300139929,
|
|
"loss": 6.0406,
|
|
"mean_token_accuracy": 0.1539533868432045,
|
|
"num_tokens": 11261125.0,
|
|
"step": 6030
|
|
},
|
|
{
|
|
"entropy": 6.160229969024658,
|
|
"epoch": 0.5329388908512893,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004977698092735621,
|
|
"loss": 5.9402,
|
|
"mean_token_accuracy": 0.16421253234148026,
|
|
"num_tokens": 11270334.0,
|
|
"step": 6035
|
|
},
|
|
{
|
|
"entropy": 5.976454734802246,
|
|
"epoch": 0.5333804309431296,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004977653841691975,
|
|
"loss": 5.9399,
|
|
"mean_token_accuracy": 0.15946147739887237,
|
|
"num_tokens": 11280497.0,
|
|
"step": 6040
|
|
},
|
|
{
|
|
"entropy": 6.087053728103638,
|
|
"epoch": 0.53382197103497,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004977609547009857,
|
|
"loss": 5.8659,
|
|
"mean_token_accuracy": 0.16440399140119552,
|
|
"num_tokens": 11288636.0,
|
|
"step": 6045
|
|
},
|
|
{
|
|
"entropy": 6.129591178894043,
|
|
"epoch": 0.5342635111268104,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004977565208690134,
|
|
"loss": 5.9827,
|
|
"mean_token_accuracy": 0.15665870159864426,
|
|
"num_tokens": 11297519.0,
|
|
"step": 6050
|
|
},
|
|
{
|
|
"entropy": 6.083321666717529,
|
|
"epoch": 0.5347050512186506,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004977520826733677,
|
|
"loss": 5.9588,
|
|
"mean_token_accuracy": 0.1661783814430237,
|
|
"num_tokens": 11306280.0,
|
|
"step": 6055
|
|
},
|
|
{
|
|
"entropy": 6.075170660018921,
|
|
"epoch": 0.535146591310491,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004977476401141353,
|
|
"loss": 5.9527,
|
|
"mean_token_accuracy": 0.15530341863632202,
|
|
"num_tokens": 11315366.0,
|
|
"step": 6060
|
|
},
|
|
{
|
|
"entropy": 6.076067924499512,
|
|
"epoch": 0.5355881314023313,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004977431931914032,
|
|
"loss": 5.9595,
|
|
"mean_token_accuracy": 0.15634357184171677,
|
|
"num_tokens": 11324664.0,
|
|
"step": 6065
|
|
},
|
|
{
|
|
"entropy": 6.069175720214844,
|
|
"epoch": 0.5360296714941717,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004977387419052586,
|
|
"loss": 5.9339,
|
|
"mean_token_accuracy": 0.1536666676402092,
|
|
"num_tokens": 11334118.0,
|
|
"step": 6070
|
|
},
|
|
{
|
|
"entropy": 6.2017174243927,
|
|
"epoch": 0.536471211586012,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000497734286255789,
|
|
"loss": 6.0281,
|
|
"mean_token_accuracy": 0.15356491655111312,
|
|
"num_tokens": 11342684.0,
|
|
"step": 6075
|
|
},
|
|
{
|
|
"entropy": 6.07715802192688,
|
|
"epoch": 0.5369127516778524,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004977298262430813,
|
|
"loss": 5.9634,
|
|
"mean_token_accuracy": 0.16605139821767806,
|
|
"num_tokens": 11351298.0,
|
|
"step": 6080
|
|
},
|
|
{
|
|
"entropy": 6.011256170272827,
|
|
"epoch": 0.5373542917696927,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004977253618672229,
|
|
"loss": 5.9153,
|
|
"mean_token_accuracy": 0.16742231994867324,
|
|
"num_tokens": 11360264.0,
|
|
"step": 6085
|
|
},
|
|
{
|
|
"entropy": 5.972763013839722,
|
|
"epoch": 0.537795831861533,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004977208931283013,
|
|
"loss": 5.894,
|
|
"mean_token_accuracy": 0.16407648473978043,
|
|
"num_tokens": 11369815.0,
|
|
"step": 6090
|
|
},
|
|
{
|
|
"entropy": 6.075697612762451,
|
|
"epoch": 0.5382373719533734,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004977164200264042,
|
|
"loss": 5.9606,
|
|
"mean_token_accuracy": 0.16177425235509874,
|
|
"num_tokens": 11379131.0,
|
|
"step": 6095
|
|
},
|
|
{
|
|
"entropy": 6.092071437835694,
|
|
"epoch": 0.5386789120452137,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004977119425616191,
|
|
"loss": 5.8475,
|
|
"mean_token_accuracy": 0.16356392502784728,
|
|
"num_tokens": 11387692.0,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"entropy": 6.020279884338379,
|
|
"epoch": 0.5391204521370541,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004977074607340336,
|
|
"loss": 5.9208,
|
|
"mean_token_accuracy": 0.16179520785808563,
|
|
"num_tokens": 11396278.0,
|
|
"step": 6105
|
|
},
|
|
{
|
|
"entropy": 5.9607339859008786,
|
|
"epoch": 0.5395619922288943,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004977029745437357,
|
|
"loss": 5.8561,
|
|
"mean_token_accuracy": 0.16862278580665588,
|
|
"num_tokens": 11404790.0,
|
|
"step": 6110
|
|
},
|
|
{
|
|
"entropy": 6.074794054031372,
|
|
"epoch": 0.5400035323207347,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000497698483990813,
|
|
"loss": 5.9448,
|
|
"mean_token_accuracy": 0.15622814297676085,
|
|
"num_tokens": 11414614.0,
|
|
"step": 6115
|
|
},
|
|
{
|
|
"entropy": 6.05021300315857,
|
|
"epoch": 0.5404450724125751,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004976939890753539,
|
|
"loss": 5.9789,
|
|
"mean_token_accuracy": 0.15662158280611038,
|
|
"num_tokens": 11423696.0,
|
|
"step": 6120
|
|
},
|
|
{
|
|
"entropy": 5.987365674972534,
|
|
"epoch": 0.5408866125044154,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000497689489797446,
|
|
"loss": 5.9184,
|
|
"mean_token_accuracy": 0.1647222951054573,
|
|
"num_tokens": 11432669.0,
|
|
"step": 6125
|
|
},
|
|
{
|
|
"entropy": 6.090789747238159,
|
|
"epoch": 0.5413281525962558,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004976849861571778,
|
|
"loss": 5.9464,
|
|
"mean_token_accuracy": 0.15996651202440262,
|
|
"num_tokens": 11442132.0,
|
|
"step": 6130
|
|
},
|
|
{
|
|
"entropy": 6.047543382644653,
|
|
"epoch": 0.541769692688096,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004976804781546373,
|
|
"loss": 5.9234,
|
|
"mean_token_accuracy": 0.1532900132238865,
|
|
"num_tokens": 11451821.0,
|
|
"step": 6135
|
|
},
|
|
{
|
|
"entropy": 6.068201351165771,
|
|
"epoch": 0.5422112327799364,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004976759657899129,
|
|
"loss": 5.9426,
|
|
"mean_token_accuracy": 0.16034639477729798,
|
|
"num_tokens": 11460242.0,
|
|
"step": 6140
|
|
},
|
|
{
|
|
"entropy": 5.969129514694214,
|
|
"epoch": 0.5426527728717767,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000497671449063093,
|
|
"loss": 5.9507,
|
|
"mean_token_accuracy": 0.16280617713928222,
|
|
"num_tokens": 11469588.0,
|
|
"step": 6145
|
|
},
|
|
{
|
|
"entropy": 6.102783298492431,
|
|
"epoch": 0.5430943129636171,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000497666927974266,
|
|
"loss": 6.0042,
|
|
"mean_token_accuracy": 0.15202597826719283,
|
|
"num_tokens": 11480401.0,
|
|
"step": 6150
|
|
},
|
|
{
|
|
"entropy": 6.1373790264129635,
|
|
"epoch": 0.5435358530554575,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004976624025235205,
|
|
"loss": 5.938,
|
|
"mean_token_accuracy": 0.16415768265724182,
|
|
"num_tokens": 11489117.0,
|
|
"step": 6155
|
|
},
|
|
{
|
|
"entropy": 5.957103109359741,
|
|
"epoch": 0.5439773931472978,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004976578727109453,
|
|
"loss": 5.871,
|
|
"mean_token_accuracy": 0.16513819396495819,
|
|
"num_tokens": 11497612.0,
|
|
"step": 6160
|
|
},
|
|
{
|
|
"entropy": 5.988688659667969,
|
|
"epoch": 0.5444189332391381,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004976533385366289,
|
|
"loss": 5.9763,
|
|
"mean_token_accuracy": 0.15876150280237197,
|
|
"num_tokens": 11507024.0,
|
|
"step": 6165
|
|
},
|
|
{
|
|
"entropy": 6.076825761795044,
|
|
"epoch": 0.5448604733309784,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004976488000006602,
|
|
"loss": 5.9042,
|
|
"mean_token_accuracy": 0.1657929614186287,
|
|
"num_tokens": 11516873.0,
|
|
"step": 6170
|
|
},
|
|
{
|
|
"entropy": 6.166196537017822,
|
|
"epoch": 0.5453020134228188,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004976442571031282,
|
|
"loss": 5.9943,
|
|
"mean_token_accuracy": 0.16310707181692125,
|
|
"num_tokens": 11526128.0,
|
|
"step": 6175
|
|
},
|
|
{
|
|
"entropy": 5.990091466903687,
|
|
"epoch": 0.5457435535146591,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004976397098441218,
|
|
"loss": 6.0027,
|
|
"mean_token_accuracy": 0.16429250240325927,
|
|
"num_tokens": 11535030.0,
|
|
"step": 6180
|
|
},
|
|
{
|
|
"entropy": 5.9520035743713375,
|
|
"epoch": 0.5461850936064995,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004976351582237301,
|
|
"loss": 5.885,
|
|
"mean_token_accuracy": 0.16240569949150085,
|
|
"num_tokens": 11543113.0,
|
|
"step": 6185
|
|
},
|
|
{
|
|
"entropy": 6.077959585189819,
|
|
"epoch": 0.5466266336983399,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004976306022420423,
|
|
"loss": 5.948,
|
|
"mean_token_accuracy": 0.15816203504800797,
|
|
"num_tokens": 11552744.0,
|
|
"step": 6190
|
|
},
|
|
{
|
|
"entropy": 6.0480293273925785,
|
|
"epoch": 0.5470681737901801,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004976260418991476,
|
|
"loss": 5.9521,
|
|
"mean_token_accuracy": 0.15910596996545792,
|
|
"num_tokens": 11561274.0,
|
|
"step": 6195
|
|
},
|
|
{
|
|
"entropy": 6.120669841766357,
|
|
"epoch": 0.5475097138820205,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004976214771951354,
|
|
"loss": 5.9849,
|
|
"mean_token_accuracy": 0.15998436361551285,
|
|
"num_tokens": 11570485.0,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"entropy": 6.119882488250733,
|
|
"epoch": 0.5479512539738608,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004976169081300949,
|
|
"loss": 5.9707,
|
|
"mean_token_accuracy": 0.15495690554380417,
|
|
"num_tokens": 11579380.0,
|
|
"step": 6205
|
|
},
|
|
{
|
|
"entropy": 6.016612339019775,
|
|
"epoch": 0.5483927940657012,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004976123347041159,
|
|
"loss": 5.9419,
|
|
"mean_token_accuracy": 0.15940989702939987,
|
|
"num_tokens": 11588686.0,
|
|
"step": 6210
|
|
},
|
|
{
|
|
"entropy": 5.955654191970825,
|
|
"epoch": 0.5488343341575415,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004976077569172879,
|
|
"loss": 5.8544,
|
|
"mean_token_accuracy": 0.16704583019018174,
|
|
"num_tokens": 11598547.0,
|
|
"step": 6215
|
|
},
|
|
{
|
|
"entropy": 6.076641321182251,
|
|
"epoch": 0.5492758742493818,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004976031747697006,
|
|
"loss": 5.9632,
|
|
"mean_token_accuracy": 0.16434568166732788,
|
|
"num_tokens": 11608407.0,
|
|
"step": 6220
|
|
},
|
|
{
|
|
"entropy": 6.077345609664917,
|
|
"epoch": 0.5497174143412222,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004975985882614435,
|
|
"loss": 5.9317,
|
|
"mean_token_accuracy": 0.15894413441419603,
|
|
"num_tokens": 11618641.0,
|
|
"step": 6225
|
|
},
|
|
{
|
|
"entropy": 6.108517217636108,
|
|
"epoch": 0.5501589544330625,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004975939973926068,
|
|
"loss": 5.9658,
|
|
"mean_token_accuracy": 0.15470803529024124,
|
|
"num_tokens": 11627082.0,
|
|
"step": 6230
|
|
},
|
|
{
|
|
"entropy": 6.035842847824097,
|
|
"epoch": 0.5506004945249029,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004975894021632803,
|
|
"loss": 5.8591,
|
|
"mean_token_accuracy": 0.16572816520929337,
|
|
"num_tokens": 11636045.0,
|
|
"step": 6235
|
|
},
|
|
{
|
|
"entropy": 6.017991161346435,
|
|
"epoch": 0.5510420346167432,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000497584802573554,
|
|
"loss": 5.9476,
|
|
"mean_token_accuracy": 0.1594747096300125,
|
|
"num_tokens": 11646516.0,
|
|
"step": 6240
|
|
},
|
|
{
|
|
"entropy": 5.969410848617554,
|
|
"epoch": 0.5514835747085836,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.000497580198623518,
|
|
"loss": 5.8793,
|
|
"mean_token_accuracy": 0.1672035813331604,
|
|
"num_tokens": 11655512.0,
|
|
"step": 6245
|
|
},
|
|
{
|
|
"entropy": 6.07194766998291,
|
|
"epoch": 0.5519251148004238,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004975755903132623,
|
|
"loss": 5.9988,
|
|
"mean_token_accuracy": 0.16115525662899016,
|
|
"num_tokens": 11665641.0,
|
|
"step": 6250
|
|
},
|
|
{
|
|
"entropy": 6.070268392562866,
|
|
"epoch": 0.5523666548922642,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004975709776428776,
|
|
"loss": 5.9626,
|
|
"mean_token_accuracy": 0.14910453855991362,
|
|
"num_tokens": 11674585.0,
|
|
"step": 6255
|
|
},
|
|
{
|
|
"entropy": 6.0138458728790285,
|
|
"epoch": 0.5528081949841046,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.000497566360612454,
|
|
"loss": 5.8206,
|
|
"mean_token_accuracy": 0.1709229715168476,
|
|
"num_tokens": 11683498.0,
|
|
"step": 6260
|
|
},
|
|
{
|
|
"entropy": 6.020624494552612,
|
|
"epoch": 0.5532497350759449,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.000497561739222082,
|
|
"loss": 5.8708,
|
|
"mean_token_accuracy": 0.16468418687582015,
|
|
"num_tokens": 11692663.0,
|
|
"step": 6265
|
|
},
|
|
{
|
|
"entropy": 6.071487998962402,
|
|
"epoch": 0.5536912751677853,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004975571134718521,
|
|
"loss": 6.0272,
|
|
"mean_token_accuracy": 0.1560715228319168,
|
|
"num_tokens": 11701906.0,
|
|
"step": 6270
|
|
},
|
|
{
|
|
"entropy": 6.1155585765838625,
|
|
"epoch": 0.5541328152596255,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004975524833618548,
|
|
"loss": 5.939,
|
|
"mean_token_accuracy": 0.16798952966928482,
|
|
"num_tokens": 11710932.0,
|
|
"step": 6275
|
|
},
|
|
{
|
|
"entropy": 6.000746726989746,
|
|
"epoch": 0.5545743553514659,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.000497547848892181,
|
|
"loss": 5.9412,
|
|
"mean_token_accuracy": 0.16227794140577317,
|
|
"num_tokens": 11719953.0,
|
|
"step": 6280
|
|
},
|
|
{
|
|
"entropy": 6.077520656585693,
|
|
"epoch": 0.5550158954433062,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004975432100629215,
|
|
"loss": 5.8256,
|
|
"mean_token_accuracy": 0.16723808497190476,
|
|
"num_tokens": 11728706.0,
|
|
"step": 6285
|
|
},
|
|
{
|
|
"entropy": 6.107032203674317,
|
|
"epoch": 0.5554574355351466,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.000497538566874167,
|
|
"loss": 6.0226,
|
|
"mean_token_accuracy": 0.15363354086875916,
|
|
"num_tokens": 11738537.0,
|
|
"step": 6290
|
|
},
|
|
{
|
|
"entropy": 6.031191444396972,
|
|
"epoch": 0.555898975626987,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004975339193260084,
|
|
"loss": 5.839,
|
|
"mean_token_accuracy": 0.16909542083740234,
|
|
"num_tokens": 11747503.0,
|
|
"step": 6295
|
|
},
|
|
{
|
|
"entropy": 6.056135463714599,
|
|
"epoch": 0.5563405157188273,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004975292674185371,
|
|
"loss": 5.9422,
|
|
"mean_token_accuracy": 0.1652400091290474,
|
|
"num_tokens": 11756501.0,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"entropy": 6.039711570739746,
|
|
"epoch": 0.5567820558106676,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004975246111518439,
|
|
"loss": 5.9083,
|
|
"mean_token_accuracy": 0.15950924158096313,
|
|
"num_tokens": 11765483.0,
|
|
"step": 6305
|
|
},
|
|
{
|
|
"entropy": 6.124545240402222,
|
|
"epoch": 0.5572235959025079,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004975199505260202,
|
|
"loss": 6.0394,
|
|
"mean_token_accuracy": 0.14992669820785523,
|
|
"num_tokens": 11774941.0,
|
|
"step": 6310
|
|
},
|
|
{
|
|
"entropy": 6.157734012603759,
|
|
"epoch": 0.5576651359943483,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004975152855411571,
|
|
"loss": 6.0228,
|
|
"mean_token_accuracy": 0.15619456619024277,
|
|
"num_tokens": 11785080.0,
|
|
"step": 6315
|
|
},
|
|
{
|
|
"entropy": 6.149894714355469,
|
|
"epoch": 0.5581066760861886,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004975106161973462,
|
|
"loss": 5.898,
|
|
"mean_token_accuracy": 0.16184557229280472,
|
|
"num_tokens": 11793933.0,
|
|
"step": 6320
|
|
},
|
|
{
|
|
"entropy": 6.058248376846313,
|
|
"epoch": 0.558548216178029,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.000497505942494679,
|
|
"loss": 5.9241,
|
|
"mean_token_accuracy": 0.16193547546863557,
|
|
"num_tokens": 11803592.0,
|
|
"step": 6325
|
|
},
|
|
{
|
|
"entropy": 5.964681005477905,
|
|
"epoch": 0.5589897562698694,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004975012644332467,
|
|
"loss": 5.8295,
|
|
"mean_token_accuracy": 0.1708283841609955,
|
|
"num_tokens": 11812397.0,
|
|
"step": 6330
|
|
},
|
|
{
|
|
"entropy": 5.99154577255249,
|
|
"epoch": 0.5594312963617096,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004974965820131413,
|
|
"loss": 5.9663,
|
|
"mean_token_accuracy": 0.15441266000270842,
|
|
"num_tokens": 11821734.0,
|
|
"step": 6335
|
|
},
|
|
{
|
|
"entropy": 6.028355646133423,
|
|
"epoch": 0.55987283645355,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004974918952344544,
|
|
"loss": 5.8438,
|
|
"mean_token_accuracy": 0.1648928314447403,
|
|
"num_tokens": 11832107.0,
|
|
"step": 6340
|
|
},
|
|
{
|
|
"entropy": 6.0491269588470455,
|
|
"epoch": 0.5603143765453903,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004974872040972777,
|
|
"loss": 5.894,
|
|
"mean_token_accuracy": 0.15779831409454345,
|
|
"num_tokens": 11841983.0,
|
|
"step": 6345
|
|
},
|
|
{
|
|
"entropy": 5.933441162109375,
|
|
"epoch": 0.5607559166372307,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004974825086017033,
|
|
"loss": 5.8688,
|
|
"mean_token_accuracy": 0.1653630644083023,
|
|
"num_tokens": 11851836.0,
|
|
"step": 6350
|
|
},
|
|
{
|
|
"entropy": 6.113220262527466,
|
|
"epoch": 0.561197456729071,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000497477808747823,
|
|
"loss": 5.9227,
|
|
"mean_token_accuracy": 0.16370657682418824,
|
|
"num_tokens": 11861109.0,
|
|
"step": 6355
|
|
},
|
|
{
|
|
"entropy": 6.086468172073364,
|
|
"epoch": 0.5616389968209113,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004974731045357292,
|
|
"loss": 6.0077,
|
|
"mean_token_accuracy": 0.15728640407323838,
|
|
"num_tokens": 11871015.0,
|
|
"step": 6360
|
|
},
|
|
{
|
|
"entropy": 5.971633386611939,
|
|
"epoch": 0.5620805369127517,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004974683959655136,
|
|
"loss": 5.9026,
|
|
"mean_token_accuracy": 0.16690124869346618,
|
|
"num_tokens": 11881306.0,
|
|
"step": 6365
|
|
},
|
|
{
|
|
"entropy": 6.044920015335083,
|
|
"epoch": 0.562522077004592,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004974636830372686,
|
|
"loss": 5.8321,
|
|
"mean_token_accuracy": 0.1697683110833168,
|
|
"num_tokens": 11890251.0,
|
|
"step": 6370
|
|
},
|
|
{
|
|
"entropy": 6.0341572761535645,
|
|
"epoch": 0.5629636170964324,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004974589657510866,
|
|
"loss": 5.9548,
|
|
"mean_token_accuracy": 0.1595368355512619,
|
|
"num_tokens": 11899662.0,
|
|
"step": 6375
|
|
},
|
|
{
|
|
"entropy": 6.0119726181030275,
|
|
"epoch": 0.5634051571882727,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004974542441070601,
|
|
"loss": 5.9088,
|
|
"mean_token_accuracy": 0.164400115609169,
|
|
"num_tokens": 11908699.0,
|
|
"step": 6380
|
|
},
|
|
{
|
|
"entropy": 6.074244689941406,
|
|
"epoch": 0.563846697280113,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004974495181052814,
|
|
"loss": 5.9153,
|
|
"mean_token_accuracy": 0.16292343586683272,
|
|
"num_tokens": 11916637.0,
|
|
"step": 6385
|
|
},
|
|
{
|
|
"entropy": 6.057909202575684,
|
|
"epoch": 0.5642882373719533,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004974447877458433,
|
|
"loss": 5.978,
|
|
"mean_token_accuracy": 0.15960414558649064,
|
|
"num_tokens": 11926063.0,
|
|
"step": 6390
|
|
},
|
|
{
|
|
"entropy": 6.042461633682251,
|
|
"epoch": 0.5647297774637937,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004974400530288381,
|
|
"loss": 5.8396,
|
|
"mean_token_accuracy": 0.16759493798017502,
|
|
"num_tokens": 11934861.0,
|
|
"step": 6395
|
|
},
|
|
{
|
|
"entropy": 6.064510822296143,
|
|
"epoch": 0.5651713175556341,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004974353139543589,
|
|
"loss": 5.9472,
|
|
"mean_token_accuracy": 0.1592862457036972,
|
|
"num_tokens": 11944403.0,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"entropy": 6.09717435836792,
|
|
"epoch": 0.5656128576474744,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004974305705224983,
|
|
"loss": 6.0273,
|
|
"mean_token_accuracy": 0.16373891979455948,
|
|
"num_tokens": 11953306.0,
|
|
"step": 6405
|
|
},
|
|
{
|
|
"entropy": 6.048258686065674,
|
|
"epoch": 0.5660543977393148,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004974258227333493,
|
|
"loss": 5.9229,
|
|
"mean_token_accuracy": 0.16490714997053146,
|
|
"num_tokens": 11962727.0,
|
|
"step": 6410
|
|
},
|
|
{
|
|
"entropy": 6.083757448196411,
|
|
"epoch": 0.566495937831155,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004974210705870052,
|
|
"loss": 6.0653,
|
|
"mean_token_accuracy": 0.1529408350586891,
|
|
"num_tokens": 11972375.0,
|
|
"step": 6415
|
|
},
|
|
{
|
|
"entropy": 5.9368589401245115,
|
|
"epoch": 0.5669374779229954,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004974163140835586,
|
|
"loss": 5.8357,
|
|
"mean_token_accuracy": 0.16640966832637788,
|
|
"num_tokens": 11981989.0,
|
|
"step": 6420
|
|
},
|
|
{
|
|
"entropy": 6.001610279083252,
|
|
"epoch": 0.5673790180148357,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004974115532231029,
|
|
"loss": 5.8865,
|
|
"mean_token_accuracy": 0.16079047322273254,
|
|
"num_tokens": 11990865.0,
|
|
"step": 6425
|
|
},
|
|
{
|
|
"entropy": 6.038493490219116,
|
|
"epoch": 0.5678205581066761,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004974067880057315,
|
|
"loss": 5.8903,
|
|
"mean_token_accuracy": 0.16845848858356477,
|
|
"num_tokens": 12000697.0,
|
|
"step": 6430
|
|
},
|
|
{
|
|
"entropy": 6.008808898925781,
|
|
"epoch": 0.5682620981985165,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004974020184315375,
|
|
"loss": 5.9645,
|
|
"mean_token_accuracy": 0.16037520170211791,
|
|
"num_tokens": 12010560.0,
|
|
"step": 6435
|
|
},
|
|
{
|
|
"entropy": 6.135006332397461,
|
|
"epoch": 0.5687036382903567,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004973972445006144,
|
|
"loss": 5.9804,
|
|
"mean_token_accuracy": 0.16373104155063628,
|
|
"num_tokens": 12020760.0,
|
|
"step": 6440
|
|
},
|
|
{
|
|
"entropy": 5.968979454040527,
|
|
"epoch": 0.5691451783821971,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004973924662130559,
|
|
"loss": 5.8425,
|
|
"mean_token_accuracy": 0.1618216186761856,
|
|
"num_tokens": 12030467.0,
|
|
"step": 6445
|
|
},
|
|
{
|
|
"entropy": 6.0167624950408936,
|
|
"epoch": 0.5695867184740374,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004973876835689554,
|
|
"loss": 5.9458,
|
|
"mean_token_accuracy": 0.15894586592912674,
|
|
"num_tokens": 12039897.0,
|
|
"step": 6450
|
|
},
|
|
{
|
|
"entropy": 6.1160314083099365,
|
|
"epoch": 0.5700282585658778,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004973828965684066,
|
|
"loss": 5.9589,
|
|
"mean_token_accuracy": 0.1576131209731102,
|
|
"num_tokens": 12048997.0,
|
|
"step": 6455
|
|
},
|
|
{
|
|
"entropy": 6.03399133682251,
|
|
"epoch": 0.5704697986577181,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004973781052115035,
|
|
"loss": 5.8459,
|
|
"mean_token_accuracy": 0.17132855206727982,
|
|
"num_tokens": 12057493.0,
|
|
"step": 6460
|
|
},
|
|
{
|
|
"entropy": 5.896988725662231,
|
|
"epoch": 0.5709113387495585,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004973733094983396,
|
|
"loss": 5.917,
|
|
"mean_token_accuracy": 0.16548199504613875,
|
|
"num_tokens": 12066329.0,
|
|
"step": 6465
|
|
},
|
|
{
|
|
"entropy": 6.044803476333618,
|
|
"epoch": 0.5713528788413988,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004973685094290092,
|
|
"loss": 5.966,
|
|
"mean_token_accuracy": 0.15556841790676118,
|
|
"num_tokens": 12076022.0,
|
|
"step": 6470
|
|
},
|
|
{
|
|
"entropy": 6.063698959350586,
|
|
"epoch": 0.5717944189332391,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000497363705003606,
|
|
"loss": 5.9132,
|
|
"mean_token_accuracy": 0.15902747064828873,
|
|
"num_tokens": 12085750.0,
|
|
"step": 6475
|
|
},
|
|
{
|
|
"entropy": 6.0139764785766605,
|
|
"epoch": 0.5722359590250795,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004973588962222244,
|
|
"loss": 5.9232,
|
|
"mean_token_accuracy": 0.16362606585025788,
|
|
"num_tokens": 12094935.0,
|
|
"step": 6480
|
|
},
|
|
{
|
|
"entropy": 5.97671914100647,
|
|
"epoch": 0.5726774991169198,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004973540830849585,
|
|
"loss": 5.8763,
|
|
"mean_token_accuracy": 0.16199066191911698,
|
|
"num_tokens": 12104970.0,
|
|
"step": 6485
|
|
},
|
|
{
|
|
"entropy": 6.06100058555603,
|
|
"epoch": 0.5731190392087602,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004973492655919025,
|
|
"loss": 5.9691,
|
|
"mean_token_accuracy": 0.16144821047782898,
|
|
"num_tokens": 12116195.0,
|
|
"step": 6490
|
|
},
|
|
{
|
|
"entropy": 6.093713283538818,
|
|
"epoch": 0.5735605793006004,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004973444437431508,
|
|
"loss": 5.9069,
|
|
"mean_token_accuracy": 0.16268889904022216,
|
|
"num_tokens": 12125485.0,
|
|
"step": 6495
|
|
},
|
|
{
|
|
"entropy": 5.996689748764038,
|
|
"epoch": 0.5740021193924408,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004973396175387981,
|
|
"loss": 5.9323,
|
|
"mean_token_accuracy": 0.16517322808504104,
|
|
"num_tokens": 12135474.0,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"entropy": 6.063704156875611,
|
|
"epoch": 0.5744436594842812,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004973347869789385,
|
|
"loss": 5.9733,
|
|
"mean_token_accuracy": 0.15778698474168779,
|
|
"num_tokens": 12144945.0,
|
|
"step": 6505
|
|
},
|
|
{
|
|
"entropy": 6.049719285964966,
|
|
"epoch": 0.5748851995761215,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004973299520636671,
|
|
"loss": 5.9195,
|
|
"mean_token_accuracy": 0.1623150423169136,
|
|
"num_tokens": 12154893.0,
|
|
"step": 6510
|
|
},
|
|
{
|
|
"entropy": 6.064568901062012,
|
|
"epoch": 0.5753267396679619,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004973251127930782,
|
|
"loss": 5.991,
|
|
"mean_token_accuracy": 0.15995137244462967,
|
|
"num_tokens": 12164047.0,
|
|
"step": 6515
|
|
},
|
|
{
|
|
"entropy": 6.106995487213135,
|
|
"epoch": 0.5757682797598022,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004973202691672669,
|
|
"loss": 5.971,
|
|
"mean_token_accuracy": 0.15734548568725587,
|
|
"num_tokens": 12172965.0,
|
|
"step": 6520
|
|
},
|
|
{
|
|
"entropy": 6.065262413024902,
|
|
"epoch": 0.5762098198516425,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004973154211863279,
|
|
"loss": 5.9858,
|
|
"mean_token_accuracy": 0.15848653614521027,
|
|
"num_tokens": 12183534.0,
|
|
"step": 6525
|
|
},
|
|
{
|
|
"entropy": 6.019828367233276,
|
|
"epoch": 0.5766513599434828,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004973105688503563,
|
|
"loss": 5.9443,
|
|
"mean_token_accuracy": 0.16066363602876663,
|
|
"num_tokens": 12194370.0,
|
|
"step": 6530
|
|
},
|
|
{
|
|
"entropy": 6.086677503585816,
|
|
"epoch": 0.5770929000353232,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000497305712159447,
|
|
"loss": 5.953,
|
|
"mean_token_accuracy": 0.15858365148305892,
|
|
"num_tokens": 12203765.0,
|
|
"step": 6535
|
|
},
|
|
{
|
|
"entropy": 6.064221906661987,
|
|
"epoch": 0.5775344401271636,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004973008511136954,
|
|
"loss": 5.8947,
|
|
"mean_token_accuracy": 0.16712932288646698,
|
|
"num_tokens": 12212889.0,
|
|
"step": 6540
|
|
},
|
|
{
|
|
"entropy": 6.027991771697998,
|
|
"epoch": 0.5779759802190039,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004972959857131963,
|
|
"loss": 5.961,
|
|
"mean_token_accuracy": 0.15869880318641663,
|
|
"num_tokens": 12222055.0,
|
|
"step": 6545
|
|
},
|
|
{
|
|
"entropy": 6.09398717880249,
|
|
"epoch": 0.5784175203108443,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004972911159580454,
|
|
"loss": 5.9716,
|
|
"mean_token_accuracy": 0.15896429717540742,
|
|
"num_tokens": 12232092.0,
|
|
"step": 6550
|
|
},
|
|
{
|
|
"entropy": 6.056658220291138,
|
|
"epoch": 0.5788590604026845,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.000497286241848338,
|
|
"loss": 5.9823,
|
|
"mean_token_accuracy": 0.15855540782213212,
|
|
"num_tokens": 12241411.0,
|
|
"step": 6555
|
|
},
|
|
{
|
|
"entropy": 6.101971817016602,
|
|
"epoch": 0.5793006004945249,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004972813633841696,
|
|
"loss": 5.8669,
|
|
"mean_token_accuracy": 0.16081757098436356,
|
|
"num_tokens": 12249735.0,
|
|
"step": 6560
|
|
},
|
|
{
|
|
"entropy": 6.011325693130493,
|
|
"epoch": 0.5797421405863652,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004972764805656355,
|
|
"loss": 5.8807,
|
|
"mean_token_accuracy": 0.16948403120040895,
|
|
"num_tokens": 12259132.0,
|
|
"step": 6565
|
|
},
|
|
{
|
|
"entropy": 5.9479166030883786,
|
|
"epoch": 0.5801836806782056,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004972715933928318,
|
|
"loss": 5.9587,
|
|
"mean_token_accuracy": 0.1681016907095909,
|
|
"num_tokens": 12269401.0,
|
|
"step": 6570
|
|
},
|
|
{
|
|
"entropy": 6.1060340881347654,
|
|
"epoch": 0.580625220770046,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004972667018658539,
|
|
"loss": 5.9837,
|
|
"mean_token_accuracy": 0.15532615929841995,
|
|
"num_tokens": 12279163.0,
|
|
"step": 6575
|
|
},
|
|
{
|
|
"entropy": 6.030529546737671,
|
|
"epoch": 0.5810667608618862,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004972618059847976,
|
|
"loss": 5.872,
|
|
"mean_token_accuracy": 0.16901613771915436,
|
|
"num_tokens": 12287942.0,
|
|
"step": 6580
|
|
},
|
|
{
|
|
"entropy": 5.988468074798584,
|
|
"epoch": 0.5815083009537266,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004972569057497591,
|
|
"loss": 5.8788,
|
|
"mean_token_accuracy": 0.16304004341363906,
|
|
"num_tokens": 12296717.0,
|
|
"step": 6585
|
|
},
|
|
{
|
|
"entropy": 6.02855339050293,
|
|
"epoch": 0.5819498410455669,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004972520011608341,
|
|
"loss": 5.8562,
|
|
"mean_token_accuracy": 0.17002730965614318,
|
|
"num_tokens": 12305983.0,
|
|
"step": 6590
|
|
},
|
|
{
|
|
"entropy": 5.991772890090942,
|
|
"epoch": 0.5823913811374073,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004972470922181189,
|
|
"loss": 5.932,
|
|
"mean_token_accuracy": 0.15614658892154692,
|
|
"num_tokens": 12315338.0,
|
|
"step": 6595
|
|
},
|
|
{
|
|
"entropy": 6.037394952774048,
|
|
"epoch": 0.5828329212292476,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004972421789217094,
|
|
"loss": 6.0198,
|
|
"mean_token_accuracy": 0.15138231888413428,
|
|
"num_tokens": 12324828.0,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"entropy": 6.185899257659912,
|
|
"epoch": 0.583274461321088,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004972372612717022,
|
|
"loss": 5.893,
|
|
"mean_token_accuracy": 0.1675851821899414,
|
|
"num_tokens": 12332844.0,
|
|
"step": 6605
|
|
},
|
|
{
|
|
"entropy": 6.062373638153076,
|
|
"epoch": 0.5837160014129283,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004972323392681934,
|
|
"loss": 5.9112,
|
|
"mean_token_accuracy": 0.1606480211019516,
|
|
"num_tokens": 12342062.0,
|
|
"step": 6610
|
|
},
|
|
{
|
|
"entropy": 5.956027936935425,
|
|
"epoch": 0.5841575415047686,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004972274129112794,
|
|
"loss": 5.9835,
|
|
"mean_token_accuracy": 0.1534557655453682,
|
|
"num_tokens": 12352071.0,
|
|
"step": 6615
|
|
},
|
|
{
|
|
"entropy": 6.217044639587402,
|
|
"epoch": 0.584599081596609,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004972224822010567,
|
|
"loss": 6.0609,
|
|
"mean_token_accuracy": 0.15077690929174423,
|
|
"num_tokens": 12362025.0,
|
|
"step": 6620
|
|
},
|
|
{
|
|
"entropy": 6.157152462005615,
|
|
"epoch": 0.5850406216884493,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004972175471376222,
|
|
"loss": 6.043,
|
|
"mean_token_accuracy": 0.15202385187149048,
|
|
"num_tokens": 12371547.0,
|
|
"step": 6625
|
|
},
|
|
{
|
|
"entropy": 6.0660265445709225,
|
|
"epoch": 0.5854821617802897,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004972126077210722,
|
|
"loss": 5.9528,
|
|
"mean_token_accuracy": 0.156717449426651,
|
|
"num_tokens": 12381385.0,
|
|
"step": 6630
|
|
},
|
|
{
|
|
"entropy": 6.107764101028442,
|
|
"epoch": 0.5859237018721299,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004972076639515035,
|
|
"loss": 5.9961,
|
|
"mean_token_accuracy": 0.1575634926557541,
|
|
"num_tokens": 12391302.0,
|
|
"step": 6635
|
|
},
|
|
{
|
|
"entropy": 5.931721448898315,
|
|
"epoch": 0.5863652419639703,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004972027158290131,
|
|
"loss": 5.8827,
|
|
"mean_token_accuracy": 0.16473877429962158,
|
|
"num_tokens": 12400388.0,
|
|
"step": 6640
|
|
},
|
|
{
|
|
"entropy": 6.0112042903900145,
|
|
"epoch": 0.5868067820558107,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000497197763353698,
|
|
"loss": 5.9466,
|
|
"mean_token_accuracy": 0.15398259609937667,
|
|
"num_tokens": 12409695.0,
|
|
"step": 6645
|
|
},
|
|
{
|
|
"entropy": 6.130546712875367,
|
|
"epoch": 0.587248322147651,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.000497192806525655,
|
|
"loss": 5.8142,
|
|
"mean_token_accuracy": 0.1708827257156372,
|
|
"num_tokens": 12418728.0,
|
|
"step": 6650
|
|
},
|
|
{
|
|
"entropy": 6.000955820083618,
|
|
"epoch": 0.5876898622394914,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004971878453449811,
|
|
"loss": 5.9827,
|
|
"mean_token_accuracy": 0.16348764598369597,
|
|
"num_tokens": 12428072.0,
|
|
"step": 6655
|
|
},
|
|
{
|
|
"entropy": 6.079076623916626,
|
|
"epoch": 0.5881314023313317,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000497182879811774,
|
|
"loss": 5.9081,
|
|
"mean_token_accuracy": 0.1675763249397278,
|
|
"num_tokens": 12436624.0,
|
|
"step": 6660
|
|
},
|
|
{
|
|
"entropy": 6.015807867050171,
|
|
"epoch": 0.588572942423172,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004971779099261304,
|
|
"loss": 5.9174,
|
|
"mean_token_accuracy": 0.1616780251264572,
|
|
"num_tokens": 12446370.0,
|
|
"step": 6665
|
|
},
|
|
{
|
|
"entropy": 6.032747650146485,
|
|
"epoch": 0.5890144825150123,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000497172935688148,
|
|
"loss": 5.9717,
|
|
"mean_token_accuracy": 0.15881636887788772,
|
|
"num_tokens": 12456705.0,
|
|
"step": 6670
|
|
},
|
|
{
|
|
"entropy": 6.0455207347869875,
|
|
"epoch": 0.5894560226068527,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004971679570979242,
|
|
"loss": 5.9891,
|
|
"mean_token_accuracy": 0.16214286386966706,
|
|
"num_tokens": 12466150.0,
|
|
"step": 6675
|
|
},
|
|
{
|
|
"entropy": 6.019035196304321,
|
|
"epoch": 0.5898975626986931,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004971629741555565,
|
|
"loss": 5.8321,
|
|
"mean_token_accuracy": 0.16692282557487487,
|
|
"num_tokens": 12475051.0,
|
|
"step": 6680
|
|
},
|
|
{
|
|
"entropy": 6.051440095901489,
|
|
"epoch": 0.5903391027905334,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004971579868611423,
|
|
"loss": 5.908,
|
|
"mean_token_accuracy": 0.16642434149980545,
|
|
"num_tokens": 12485427.0,
|
|
"step": 6685
|
|
},
|
|
{
|
|
"entropy": 5.968716716766357,
|
|
"epoch": 0.5907806428823738,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004971529952147797,
|
|
"loss": 5.866,
|
|
"mean_token_accuracy": 0.16762274205684663,
|
|
"num_tokens": 12494489.0,
|
|
"step": 6690
|
|
},
|
|
{
|
|
"entropy": 6.060288572311402,
|
|
"epoch": 0.591222182974214,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.000497147999216566,
|
|
"loss": 5.8809,
|
|
"mean_token_accuracy": 0.16746745109558106,
|
|
"num_tokens": 12503656.0,
|
|
"step": 6695
|
|
},
|
|
{
|
|
"entropy": 6.088530397415161,
|
|
"epoch": 0.5916637230660544,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004971429988665996,
|
|
"loss": 5.9356,
|
|
"mean_token_accuracy": 0.15473153740167617,
|
|
"num_tokens": 12512940.0,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"entropy": 6.049561214447022,
|
|
"epoch": 0.5921052631578947,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.000497137994164978,
|
|
"loss": 5.8734,
|
|
"mean_token_accuracy": 0.16000210046768187,
|
|
"num_tokens": 12523186.0,
|
|
"step": 6705
|
|
},
|
|
{
|
|
"entropy": 6.0405772686004635,
|
|
"epoch": 0.5925468032497351,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004971329851117996,
|
|
"loss": 5.9207,
|
|
"mean_token_accuracy": 0.1611115738749504,
|
|
"num_tokens": 12533018.0,
|
|
"step": 6710
|
|
},
|
|
{
|
|
"entropy": 6.150906324386597,
|
|
"epoch": 0.5929883433415755,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004971279717071622,
|
|
"loss": 5.9854,
|
|
"mean_token_accuracy": 0.15813910067081452,
|
|
"num_tokens": 12542889.0,
|
|
"step": 6715
|
|
},
|
|
{
|
|
"entropy": 6.118391656875611,
|
|
"epoch": 0.5934298834334157,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004971229539511643,
|
|
"loss": 5.9951,
|
|
"mean_token_accuracy": 0.15295567288994788,
|
|
"num_tokens": 12553625.0,
|
|
"step": 6720
|
|
},
|
|
{
|
|
"entropy": 5.998600721359253,
|
|
"epoch": 0.5938714235252561,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000497117931843904,
|
|
"loss": 5.9109,
|
|
"mean_token_accuracy": 0.16377592980861663,
|
|
"num_tokens": 12563012.0,
|
|
"step": 6725
|
|
},
|
|
{
|
|
"entropy": 6.154142999649048,
|
|
"epoch": 0.5943129636170964,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004971129053854797,
|
|
"loss": 6.0368,
|
|
"mean_token_accuracy": 0.14983948394656182,
|
|
"num_tokens": 12572892.0,
|
|
"step": 6730
|
|
},
|
|
{
|
|
"entropy": 6.081307983398437,
|
|
"epoch": 0.5947545037089368,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00049710787457599,
|
|
"loss": 5.8738,
|
|
"mean_token_accuracy": 0.16873954683542253,
|
|
"num_tokens": 12581550.0,
|
|
"step": 6735
|
|
},
|
|
{
|
|
"entropy": 6.030810117721558,
|
|
"epoch": 0.5951960438007771,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004971028394155332,
|
|
"loss": 5.9572,
|
|
"mean_token_accuracy": 0.15993183553218843,
|
|
"num_tokens": 12590599.0,
|
|
"step": 6740
|
|
},
|
|
{
|
|
"entropy": 6.106176710128784,
|
|
"epoch": 0.5956375838926175,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004970977999042082,
|
|
"loss": 5.9152,
|
|
"mean_token_accuracy": 0.16037029176950454,
|
|
"num_tokens": 12599497.0,
|
|
"step": 6745
|
|
},
|
|
{
|
|
"entropy": 6.025887393951416,
|
|
"epoch": 0.5960791239844578,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004970927560421137,
|
|
"loss": 5.9749,
|
|
"mean_token_accuracy": 0.16369700878858567,
|
|
"num_tokens": 12608273.0,
|
|
"step": 6750
|
|
},
|
|
{
|
|
"entropy": 5.978644990921021,
|
|
"epoch": 0.5965206640762981,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004970877078293483,
|
|
"loss": 5.8404,
|
|
"mean_token_accuracy": 0.16632604449987412,
|
|
"num_tokens": 12617287.0,
|
|
"step": 6755
|
|
},
|
|
{
|
|
"entropy": 6.0520916938781735,
|
|
"epoch": 0.5969622041681385,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004970826552660111,
|
|
"loss": 5.9189,
|
|
"mean_token_accuracy": 0.1645960807800293,
|
|
"num_tokens": 12626874.0,
|
|
"step": 6760
|
|
},
|
|
{
|
|
"entropy": 6.046804714202881,
|
|
"epoch": 0.5974037442599788,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004970775983522009,
|
|
"loss": 5.9547,
|
|
"mean_token_accuracy": 0.15649548768997193,
|
|
"num_tokens": 12635218.0,
|
|
"step": 6765
|
|
},
|
|
{
|
|
"entropy": 6.0162660598754885,
|
|
"epoch": 0.5978452843518192,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004970725370880168,
|
|
"loss": 5.927,
|
|
"mean_token_accuracy": 0.1587716147303581,
|
|
"num_tokens": 12644395.0,
|
|
"step": 6770
|
|
},
|
|
{
|
|
"entropy": 6.0681376457214355,
|
|
"epoch": 0.5982868244436594,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004970674714735581,
|
|
"loss": 5.9416,
|
|
"mean_token_accuracy": 0.16704253554344178,
|
|
"num_tokens": 12654540.0,
|
|
"step": 6775
|
|
},
|
|
{
|
|
"entropy": 6.038317584991455,
|
|
"epoch": 0.5987283645354998,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004970624015089239,
|
|
"loss": 5.8046,
|
|
"mean_token_accuracy": 0.17055938690900802,
|
|
"num_tokens": 12663546.0,
|
|
"step": 6780
|
|
},
|
|
{
|
|
"entropy": 5.958972406387329,
|
|
"epoch": 0.5991699046273402,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004970573271942135,
|
|
"loss": 5.9125,
|
|
"mean_token_accuracy": 0.1605516865849495,
|
|
"num_tokens": 12672353.0,
|
|
"step": 6785
|
|
},
|
|
{
|
|
"entropy": 5.948368644714355,
|
|
"epoch": 0.5996114447191805,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004970522485295265,
|
|
"loss": 5.8077,
|
|
"mean_token_accuracy": 0.1750718891620636,
|
|
"num_tokens": 12680833.0,
|
|
"step": 6790
|
|
},
|
|
{
|
|
"entropy": 6.008566856384277,
|
|
"epoch": 0.6000529848110209,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004970471655149621,
|
|
"loss": 5.7693,
|
|
"mean_token_accuracy": 0.168387308716774,
|
|
"num_tokens": 12690033.0,
|
|
"step": 6795
|
|
},
|
|
{
|
|
"entropy": 5.891954612731934,
|
|
"epoch": 0.6004945249028611,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00049704207815062,
|
|
"loss": 5.8498,
|
|
"mean_token_accuracy": 0.17069175988435745,
|
|
"num_tokens": 12700127.0,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"entropy": 6.0572412490844725,
|
|
"epoch": 0.6009360649947015,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004970369864366,
|
|
"loss": 5.8864,
|
|
"mean_token_accuracy": 0.1621399462223053,
|
|
"num_tokens": 12709803.0,
|
|
"step": 6805
|
|
},
|
|
{
|
|
"entropy": 6.027445650100708,
|
|
"epoch": 0.6013776050865418,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004970318903730017,
|
|
"loss": 5.8683,
|
|
"mean_token_accuracy": 0.16281817108392715,
|
|
"num_tokens": 12718876.0,
|
|
"step": 6810
|
|
},
|
|
{
|
|
"entropy": 6.07098970413208,
|
|
"epoch": 0.6018191451783822,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004970267899599249,
|
|
"loss": 5.9846,
|
|
"mean_token_accuracy": 0.1596504732966423,
|
|
"num_tokens": 12728745.0,
|
|
"step": 6815
|
|
},
|
|
{
|
|
"entropy": 6.074090528488159,
|
|
"epoch": 0.6022606852702226,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004970216851974695,
|
|
"loss": 5.9778,
|
|
"mean_token_accuracy": 0.15287121534347534,
|
|
"num_tokens": 12738764.0,
|
|
"step": 6820
|
|
},
|
|
{
|
|
"entropy": 5.971881341934204,
|
|
"epoch": 0.6027022253620629,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004970165760857356,
|
|
"loss": 5.9192,
|
|
"mean_token_accuracy": 0.15215612798929215,
|
|
"num_tokens": 12747888.0,
|
|
"step": 6825
|
|
},
|
|
{
|
|
"entropy": 6.147901725769043,
|
|
"epoch": 0.6031437654539032,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004970114626248232,
|
|
"loss": 5.9269,
|
|
"mean_token_accuracy": 0.16107769459486007,
|
|
"num_tokens": 12757763.0,
|
|
"step": 6830
|
|
},
|
|
{
|
|
"entropy": 6.051682806015014,
|
|
"epoch": 0.6035853055457435,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004970063448148326,
|
|
"loss": 5.965,
|
|
"mean_token_accuracy": 0.16029387414455415,
|
|
"num_tokens": 12767924.0,
|
|
"step": 6835
|
|
},
|
|
{
|
|
"entropy": 6.023601818084717,
|
|
"epoch": 0.6040268456375839,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004970012226558639,
|
|
"loss": 5.977,
|
|
"mean_token_accuracy": 0.1597777470946312,
|
|
"num_tokens": 12777140.0,
|
|
"step": 6840
|
|
},
|
|
{
|
|
"entropy": 6.055026721954346,
|
|
"epoch": 0.6044683857294242,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004969960961480176,
|
|
"loss": 5.9521,
|
|
"mean_token_accuracy": 0.16517617404460908,
|
|
"num_tokens": 12786731.0,
|
|
"step": 6845
|
|
},
|
|
{
|
|
"entropy": 6.070271015167236,
|
|
"epoch": 0.6049099258212646,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004969909652913939,
|
|
"loss": 5.8877,
|
|
"mean_token_accuracy": 0.16475060284137727,
|
|
"num_tokens": 12796839.0,
|
|
"step": 6850
|
|
},
|
|
{
|
|
"entropy": 6.030538511276245,
|
|
"epoch": 0.605351465913105,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004969858300860935,
|
|
"loss": 5.9737,
|
|
"mean_token_accuracy": 0.1638708546757698,
|
|
"num_tokens": 12805944.0,
|
|
"step": 6855
|
|
},
|
|
{
|
|
"entropy": 6.017725276947021,
|
|
"epoch": 0.6057930060049452,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004969806905322169,
|
|
"loss": 5.9286,
|
|
"mean_token_accuracy": 0.16286252290010453,
|
|
"num_tokens": 12814692.0,
|
|
"step": 6860
|
|
},
|
|
{
|
|
"entropy": 6.016694974899292,
|
|
"epoch": 0.6062345460967856,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004969755466298649,
|
|
"loss": 5.8609,
|
|
"mean_token_accuracy": 0.1611609011888504,
|
|
"num_tokens": 12824506.0,
|
|
"step": 6865
|
|
},
|
|
{
|
|
"entropy": 6.07229380607605,
|
|
"epoch": 0.6066760861886259,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004969703983791382,
|
|
"loss": 5.9186,
|
|
"mean_token_accuracy": 0.16030610352754593,
|
|
"num_tokens": 12835157.0,
|
|
"step": 6870
|
|
},
|
|
{
|
|
"entropy": 6.005796813964844,
|
|
"epoch": 0.6071176262804663,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004969652457801376,
|
|
"loss": 5.846,
|
|
"mean_token_accuracy": 0.16162244081497193,
|
|
"num_tokens": 12844281.0,
|
|
"step": 6875
|
|
},
|
|
{
|
|
"entropy": 6.094640493392944,
|
|
"epoch": 0.6075591663723066,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004969600888329641,
|
|
"loss": 5.9086,
|
|
"mean_token_accuracy": 0.16073376536369324,
|
|
"num_tokens": 12854486.0,
|
|
"step": 6880
|
|
},
|
|
{
|
|
"entropy": 6.048177909851074,
|
|
"epoch": 0.608000706464147,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004969549275377186,
|
|
"loss": 5.8907,
|
|
"mean_token_accuracy": 0.16041633486747742,
|
|
"num_tokens": 12863574.0,
|
|
"step": 6885
|
|
},
|
|
{
|
|
"entropy": 5.933614730834961,
|
|
"epoch": 0.6084422465559873,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004969497618945024,
|
|
"loss": 5.8647,
|
|
"mean_token_accuracy": 0.16501740068197251,
|
|
"num_tokens": 12872450.0,
|
|
"step": 6890
|
|
},
|
|
{
|
|
"entropy": 5.981774759292603,
|
|
"epoch": 0.6088837866478276,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004969445919034165,
|
|
"loss": 5.8105,
|
|
"mean_token_accuracy": 0.17405697256326674,
|
|
"num_tokens": 12881299.0,
|
|
"step": 6895
|
|
},
|
|
{
|
|
"entropy": 5.927428197860718,
|
|
"epoch": 0.609325326739668,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004969394175645623,
|
|
"loss": 5.7886,
|
|
"mean_token_accuracy": 0.1706787884235382,
|
|
"num_tokens": 12889943.0,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"entropy": 5.946518993377685,
|
|
"epoch": 0.6097668668315083,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004969342388780411,
|
|
"loss": 5.8944,
|
|
"mean_token_accuracy": 0.16340020298957825,
|
|
"num_tokens": 12899497.0,
|
|
"step": 6905
|
|
},
|
|
{
|
|
"entropy": 6.05028657913208,
|
|
"epoch": 0.6102084069233487,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004969290558439546,
|
|
"loss": 5.8661,
|
|
"mean_token_accuracy": 0.16563009470701218,
|
|
"num_tokens": 12908677.0,
|
|
"step": 6910
|
|
},
|
|
{
|
|
"entropy": 6.065933132171631,
|
|
"epoch": 0.6106499470151889,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004969238684624038,
|
|
"loss": 5.9824,
|
|
"mean_token_accuracy": 0.15356329083442688,
|
|
"num_tokens": 12918468.0,
|
|
"step": 6915
|
|
},
|
|
{
|
|
"entropy": 6.017871952056884,
|
|
"epoch": 0.6110914871070293,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004969186767334908,
|
|
"loss": 5.8613,
|
|
"mean_token_accuracy": 0.17257402390241622,
|
|
"num_tokens": 12928527.0,
|
|
"step": 6920
|
|
},
|
|
{
|
|
"entropy": 6.052063798904419,
|
|
"epoch": 0.6115330271988697,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004969134806573172,
|
|
"loss": 5.8645,
|
|
"mean_token_accuracy": 0.1648426666855812,
|
|
"num_tokens": 12937021.0,
|
|
"step": 6925
|
|
},
|
|
{
|
|
"entropy": 5.897130489349365,
|
|
"epoch": 0.61197456729071,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004969082802339846,
|
|
"loss": 5.8116,
|
|
"mean_token_accuracy": 0.17037201374769212,
|
|
"num_tokens": 12945570.0,
|
|
"step": 6930
|
|
},
|
|
{
|
|
"entropy": 5.988294363021851,
|
|
"epoch": 0.6124161073825504,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.000496903075463595,
|
|
"loss": 5.9244,
|
|
"mean_token_accuracy": 0.16518771648406982,
|
|
"num_tokens": 12954594.0,
|
|
"step": 6935
|
|
},
|
|
{
|
|
"entropy": 6.077741241455078,
|
|
"epoch": 0.6128576474743906,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004968978663462504,
|
|
"loss": 5.9046,
|
|
"mean_token_accuracy": 0.16602875143289567,
|
|
"num_tokens": 12963328.0,
|
|
"step": 6940
|
|
},
|
|
{
|
|
"entropy": 5.99314284324646,
|
|
"epoch": 0.613299187566231,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004968926528820529,
|
|
"loss": 5.8949,
|
|
"mean_token_accuracy": 0.16651069819927217,
|
|
"num_tokens": 12972806.0,
|
|
"step": 6945
|
|
},
|
|
{
|
|
"entropy": 6.007574129104614,
|
|
"epoch": 0.6137407276580713,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.0004968874350711044,
|
|
"loss": 5.8937,
|
|
"mean_token_accuracy": 0.1642751067876816,
|
|
"num_tokens": 12981940.0,
|
|
"step": 6950
|
|
},
|
|
{
|
|
"entropy": 6.007917213439941,
|
|
"epoch": 0.6141822677499117,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004968822129135072,
|
|
"loss": 5.8541,
|
|
"mean_token_accuracy": 0.16383134424686432,
|
|
"num_tokens": 12990501.0,
|
|
"step": 6955
|
|
},
|
|
{
|
|
"entropy": 6.093807888031006,
|
|
"epoch": 0.6146238078417521,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004968769864093637,
|
|
"loss": 5.9088,
|
|
"mean_token_accuracy": 0.16012341976165773,
|
|
"num_tokens": 12999734.0,
|
|
"step": 6960
|
|
},
|
|
{
|
|
"entropy": 6.101202297210693,
|
|
"epoch": 0.6150653479335924,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004968717555587762,
|
|
"loss": 5.9861,
|
|
"mean_token_accuracy": 0.15358458906412126,
|
|
"num_tokens": 13009323.0,
|
|
"step": 6965
|
|
},
|
|
{
|
|
"entropy": 6.119562292098999,
|
|
"epoch": 0.6155068880254327,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004968665203618472,
|
|
"loss": 5.9908,
|
|
"mean_token_accuracy": 0.1654104083776474,
|
|
"num_tokens": 13018136.0,
|
|
"step": 6970
|
|
},
|
|
{
|
|
"entropy": 6.0345367908477785,
|
|
"epoch": 0.615948428117273,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004968612808186793,
|
|
"loss": 5.9365,
|
|
"mean_token_accuracy": 0.16047066450119019,
|
|
"num_tokens": 13026494.0,
|
|
"step": 6975
|
|
},
|
|
{
|
|
"entropy": 6.053148508071899,
|
|
"epoch": 0.6163899682091134,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004968560369293751,
|
|
"loss": 5.9433,
|
|
"mean_token_accuracy": 0.16229794472455977,
|
|
"num_tokens": 13036548.0,
|
|
"step": 6980
|
|
},
|
|
{
|
|
"entropy": 6.089382314682007,
|
|
"epoch": 0.6168315083009537,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004968507886940372,
|
|
"loss": 5.954,
|
|
"mean_token_accuracy": 0.16225423216819762,
|
|
"num_tokens": 13045653.0,
|
|
"step": 6985
|
|
},
|
|
{
|
|
"entropy": 6.10148777961731,
|
|
"epoch": 0.6172730483927941,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004968455361127686,
|
|
"loss": 6.0024,
|
|
"mean_token_accuracy": 0.1586548089981079,
|
|
"num_tokens": 13056204.0,
|
|
"step": 6990
|
|
},
|
|
{
|
|
"entropy": 6.0787682056427,
|
|
"epoch": 0.6177145884846345,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004968402791856721,
|
|
"loss": 5.9765,
|
|
"mean_token_accuracy": 0.16385324150323868,
|
|
"num_tokens": 13067215.0,
|
|
"step": 6995
|
|
},
|
|
{
|
|
"entropy": 6.088096475601196,
|
|
"epoch": 0.6181561285764747,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004968350179128507,
|
|
"loss": 5.9221,
|
|
"mean_token_accuracy": 0.1585768058896065,
|
|
"num_tokens": 13076824.0,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"entropy": 5.991998910903931,
|
|
"epoch": 0.6185976686683151,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004968297522944075,
|
|
"loss": 5.8726,
|
|
"mean_token_accuracy": 0.16714178770780563,
|
|
"num_tokens": 13086205.0,
|
|
"step": 7005
|
|
},
|
|
{
|
|
"entropy": 5.973235273361206,
|
|
"epoch": 0.6190392087601554,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004968244823304455,
|
|
"loss": 5.8218,
|
|
"mean_token_accuracy": 0.16783057302236556,
|
|
"num_tokens": 13096381.0,
|
|
"step": 7010
|
|
},
|
|
{
|
|
"entropy": 6.062705564498901,
|
|
"epoch": 0.6194807488519958,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004968192080210683,
|
|
"loss": 5.9077,
|
|
"mean_token_accuracy": 0.1670522004365921,
|
|
"num_tokens": 13106533.0,
|
|
"step": 7015
|
|
},
|
|
{
|
|
"entropy": 5.984185743331909,
|
|
"epoch": 0.619922288943836,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004968139293663787,
|
|
"loss": 5.8625,
|
|
"mean_token_accuracy": 0.1657854288816452,
|
|
"num_tokens": 13116605.0,
|
|
"step": 7020
|
|
},
|
|
{
|
|
"entropy": 5.997422504425049,
|
|
"epoch": 0.6203638290356764,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004968086463664806,
|
|
"loss": 5.8531,
|
|
"mean_token_accuracy": 0.17010919004678726,
|
|
"num_tokens": 13125316.0,
|
|
"step": 7025
|
|
},
|
|
{
|
|
"entropy": 6.074321842193603,
|
|
"epoch": 0.6208053691275168,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000496803359021477,
|
|
"loss": 6.0405,
|
|
"mean_token_accuracy": 0.16099716424942018,
|
|
"num_tokens": 13135595.0,
|
|
"step": 7030
|
|
},
|
|
{
|
|
"entropy": 6.097686433792115,
|
|
"epoch": 0.6212469092193571,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004967980673314719,
|
|
"loss": 5.9492,
|
|
"mean_token_accuracy": 0.15932941138744355,
|
|
"num_tokens": 13144395.0,
|
|
"step": 7035
|
|
},
|
|
{
|
|
"entropy": 6.023861026763916,
|
|
"epoch": 0.6216884493111975,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004967927712965688,
|
|
"loss": 5.8975,
|
|
"mean_token_accuracy": 0.16490717828273774,
|
|
"num_tokens": 13153906.0,
|
|
"step": 7040
|
|
},
|
|
{
|
|
"entropy": 6.061697959899902,
|
|
"epoch": 0.6221299894030378,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004967874709168714,
|
|
"loss": 5.853,
|
|
"mean_token_accuracy": 0.16454191356897355,
|
|
"num_tokens": 13163840.0,
|
|
"step": 7045
|
|
},
|
|
{
|
|
"entropy": 6.029821729660034,
|
|
"epoch": 0.6225715294948782,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004967821661924837,
|
|
"loss": 5.8585,
|
|
"mean_token_accuracy": 0.16540730595588685,
|
|
"num_tokens": 13173224.0,
|
|
"step": 7050
|
|
},
|
|
{
|
|
"entropy": 6.043258905410767,
|
|
"epoch": 0.6230130695867184,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004967768571235094,
|
|
"loss": 5.9062,
|
|
"mean_token_accuracy": 0.15961679369211196,
|
|
"num_tokens": 13182664.0,
|
|
"step": 7055
|
|
},
|
|
{
|
|
"entropy": 5.974499702453613,
|
|
"epoch": 0.6234546096785588,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004967715437100526,
|
|
"loss": 5.8154,
|
|
"mean_token_accuracy": 0.16828686892986297,
|
|
"num_tokens": 13191076.0,
|
|
"step": 7060
|
|
},
|
|
{
|
|
"entropy": 6.139584732055664,
|
|
"epoch": 0.6238961497703992,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004967662259522174,
|
|
"loss": 6.0221,
|
|
"mean_token_accuracy": 0.15660104751586915,
|
|
"num_tokens": 13201075.0,
|
|
"step": 7065
|
|
},
|
|
{
|
|
"entropy": 6.096574211120606,
|
|
"epoch": 0.6243376898622395,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000496760903850108,
|
|
"loss": 5.93,
|
|
"mean_token_accuracy": 0.1620198294520378,
|
|
"num_tokens": 13210602.0,
|
|
"step": 7070
|
|
},
|
|
{
|
|
"entropy": 6.080165433883667,
|
|
"epoch": 0.6247792299540799,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004967555774038286,
|
|
"loss": 5.9076,
|
|
"mean_token_accuracy": 0.15945767164230346,
|
|
"num_tokens": 13219601.0,
|
|
"step": 7075
|
|
},
|
|
{
|
|
"entropy": 6.0232421398162845,
|
|
"epoch": 0.6252207700459201,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004967502466134836,
|
|
"loss": 5.9811,
|
|
"mean_token_accuracy": 0.15633575469255448,
|
|
"num_tokens": 13230675.0,
|
|
"step": 7080
|
|
},
|
|
{
|
|
"entropy": 6.057520818710327,
|
|
"epoch": 0.6256623101377605,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004967449114791773,
|
|
"loss": 5.9156,
|
|
"mean_token_accuracy": 0.16074651181697847,
|
|
"num_tokens": 13240349.0,
|
|
"step": 7085
|
|
},
|
|
{
|
|
"entropy": 6.054627895355225,
|
|
"epoch": 0.6261038502296008,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004967395720010143,
|
|
"loss": 5.9756,
|
|
"mean_token_accuracy": 0.15755234509706498,
|
|
"num_tokens": 13249867.0,
|
|
"step": 7090
|
|
},
|
|
{
|
|
"entropy": 6.089642667770386,
|
|
"epoch": 0.6265453903214412,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004967342281790992,
|
|
"loss": 5.9095,
|
|
"mean_token_accuracy": 0.16455434262752533,
|
|
"num_tokens": 13259143.0,
|
|
"step": 7095
|
|
},
|
|
{
|
|
"entropy": 6.015751934051513,
|
|
"epoch": 0.6269869304132816,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004967288800135368,
|
|
"loss": 5.84,
|
|
"mean_token_accuracy": 0.16161815971136093,
|
|
"num_tokens": 13268673.0,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"entropy": 6.036752605438233,
|
|
"epoch": 0.6274284705051218,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004967235275044315,
|
|
"loss": 5.8735,
|
|
"mean_token_accuracy": 0.16451816260814667,
|
|
"num_tokens": 13278035.0,
|
|
"step": 7105
|
|
},
|
|
{
|
|
"entropy": 6.101490259170532,
|
|
"epoch": 0.6278700105969622,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004967181706518886,
|
|
"loss": 5.9788,
|
|
"mean_token_accuracy": 0.16411617249250413,
|
|
"num_tokens": 13286950.0,
|
|
"step": 7110
|
|
},
|
|
{
|
|
"entropy": 5.9993843078613285,
|
|
"epoch": 0.6283115506888025,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004967128094560128,
|
|
"loss": 5.8609,
|
|
"mean_token_accuracy": 0.17262431532144545,
|
|
"num_tokens": 13296425.0,
|
|
"step": 7115
|
|
},
|
|
{
|
|
"entropy": 6.1313232421875,
|
|
"epoch": 0.6287530907806429,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004967074439169092,
|
|
"loss": 5.9902,
|
|
"mean_token_accuracy": 0.1502525493502617,
|
|
"num_tokens": 13306094.0,
|
|
"step": 7120
|
|
},
|
|
{
|
|
"entropy": 6.051422643661499,
|
|
"epoch": 0.6291946308724832,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004967020740346827,
|
|
"loss": 5.9086,
|
|
"mean_token_accuracy": 0.16666829138994216,
|
|
"num_tokens": 13315181.0,
|
|
"step": 7125
|
|
},
|
|
{
|
|
"entropy": 5.989902448654175,
|
|
"epoch": 0.6296361709643236,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004966966998094387,
|
|
"loss": 5.8307,
|
|
"mean_token_accuracy": 0.17334404289722444,
|
|
"num_tokens": 13324108.0,
|
|
"step": 7130
|
|
},
|
|
{
|
|
"entropy": 6.002046394348144,
|
|
"epoch": 0.630077711056164,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004966913212412825,
|
|
"loss": 5.8927,
|
|
"mean_token_accuracy": 0.17191413789987564,
|
|
"num_tokens": 13333101.0,
|
|
"step": 7135
|
|
},
|
|
{
|
|
"entropy": 5.960778665542603,
|
|
"epoch": 0.6305192511480042,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004966859383303194,
|
|
"loss": 5.8222,
|
|
"mean_token_accuracy": 0.16842588186264038,
|
|
"num_tokens": 13341804.0,
|
|
"step": 7140
|
|
},
|
|
{
|
|
"entropy": 6.031595945358276,
|
|
"epoch": 0.6309607912398446,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004966805510766549,
|
|
"loss": 5.8624,
|
|
"mean_token_accuracy": 0.16456822901964188,
|
|
"num_tokens": 13350911.0,
|
|
"step": 7145
|
|
},
|
|
{
|
|
"entropy": 6.114427614212036,
|
|
"epoch": 0.6314023313316849,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004966751594803944,
|
|
"loss": 5.8576,
|
|
"mean_token_accuracy": 0.16239054948091508,
|
|
"num_tokens": 13360657.0,
|
|
"step": 7150
|
|
},
|
|
{
|
|
"entropy": 5.901859426498413,
|
|
"epoch": 0.6318438714235253,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004966697635416437,
|
|
"loss": 5.807,
|
|
"mean_token_accuracy": 0.1658569395542145,
|
|
"num_tokens": 13370462.0,
|
|
"step": 7155
|
|
},
|
|
{
|
|
"entropy": 6.063317108154297,
|
|
"epoch": 0.6322854115153655,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004966643632605082,
|
|
"loss": 5.922,
|
|
"mean_token_accuracy": 0.15854725539684295,
|
|
"num_tokens": 13379446.0,
|
|
"step": 7160
|
|
},
|
|
{
|
|
"entropy": 6.1069611549377445,
|
|
"epoch": 0.6327269516072059,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000496658958637094,
|
|
"loss": 5.9265,
|
|
"mean_token_accuracy": 0.16306125223636628,
|
|
"num_tokens": 13389238.0,
|
|
"step": 7165
|
|
},
|
|
{
|
|
"entropy": 6.04985294342041,
|
|
"epoch": 0.6331684916990463,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000496653549671507,
|
|
"loss": 5.926,
|
|
"mean_token_accuracy": 0.16360241323709487,
|
|
"num_tokens": 13398613.0,
|
|
"step": 7170
|
|
},
|
|
{
|
|
"entropy": 6.058578968048096,
|
|
"epoch": 0.6336100317908866,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004966481363638529,
|
|
"loss": 5.9238,
|
|
"mean_token_accuracy": 0.16620850712060928,
|
|
"num_tokens": 13407600.0,
|
|
"step": 7175
|
|
},
|
|
{
|
|
"entropy": 5.953970432281494,
|
|
"epoch": 0.634051571882727,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004966427187142378,
|
|
"loss": 5.8066,
|
|
"mean_token_accuracy": 0.17403708547353744,
|
|
"num_tokens": 13415974.0,
|
|
"step": 7180
|
|
},
|
|
{
|
|
"entropy": 6.028492593765259,
|
|
"epoch": 0.6344931119745673,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004966372967227681,
|
|
"loss": 5.8296,
|
|
"mean_token_accuracy": 0.1693430557847023,
|
|
"num_tokens": 13425628.0,
|
|
"step": 7185
|
|
},
|
|
{
|
|
"entropy": 5.933087348937988,
|
|
"epoch": 0.6349346520664076,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004966318703895496,
|
|
"loss": 5.8403,
|
|
"mean_token_accuracy": 0.17187251299619674,
|
|
"num_tokens": 13435573.0,
|
|
"step": 7190
|
|
},
|
|
{
|
|
"entropy": 6.019541263580322,
|
|
"epoch": 0.6353761921582479,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004966264397146888,
|
|
"loss": 5.8092,
|
|
"mean_token_accuracy": 0.16915347278118134,
|
|
"num_tokens": 13443969.0,
|
|
"step": 7195
|
|
},
|
|
{
|
|
"entropy": 6.018271732330322,
|
|
"epoch": 0.6358177322500883,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004966210046982921,
|
|
"loss": 5.9069,
|
|
"mean_token_accuracy": 0.1629061296582222,
|
|
"num_tokens": 13452823.0,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"entropy": 6.068755483627319,
|
|
"epoch": 0.6362592723419287,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004966155653404661,
|
|
"loss": 5.9196,
|
|
"mean_token_accuracy": 0.16197239011526107,
|
|
"num_tokens": 13461090.0,
|
|
"step": 7205
|
|
},
|
|
{
|
|
"entropy": 6.133015775680542,
|
|
"epoch": 0.636700812433769,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004966101216413172,
|
|
"loss": 5.9121,
|
|
"mean_token_accuracy": 0.1573682501912117,
|
|
"num_tokens": 13471425.0,
|
|
"step": 7210
|
|
},
|
|
{
|
|
"entropy": 6.068574714660644,
|
|
"epoch": 0.6371423525256094,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004966046736009519,
|
|
"loss": 5.9723,
|
|
"mean_token_accuracy": 0.1632949158549309,
|
|
"num_tokens": 13479755.0,
|
|
"step": 7215
|
|
},
|
|
{
|
|
"entropy": 5.951725959777832,
|
|
"epoch": 0.6375838926174496,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004965992212194772,
|
|
"loss": 5.8577,
|
|
"mean_token_accuracy": 0.17126143872737884,
|
|
"num_tokens": 13488854.0,
|
|
"step": 7220
|
|
},
|
|
{
|
|
"entropy": 6.062816572189331,
|
|
"epoch": 0.63802543270929,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004965937644969998,
|
|
"loss": 5.9191,
|
|
"mean_token_accuracy": 0.1607258602976799,
|
|
"num_tokens": 13497987.0,
|
|
"step": 7225
|
|
},
|
|
{
|
|
"entropy": 6.078987693786621,
|
|
"epoch": 0.6384669728011303,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004965883034336266,
|
|
"loss": 5.9421,
|
|
"mean_token_accuracy": 0.16266340464353563,
|
|
"num_tokens": 13506960.0,
|
|
"step": 7230
|
|
},
|
|
{
|
|
"entropy": 6.036382436752319,
|
|
"epoch": 0.6389085128929707,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004965828380294645,
|
|
"loss": 5.8821,
|
|
"mean_token_accuracy": 0.1660477951169014,
|
|
"num_tokens": 13515907.0,
|
|
"step": 7235
|
|
},
|
|
{
|
|
"entropy": 6.054521989822388,
|
|
"epoch": 0.6393500529848111,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004965773682846207,
|
|
"loss": 5.9641,
|
|
"mean_token_accuracy": 0.16726862788200378,
|
|
"num_tokens": 13524876.0,
|
|
"step": 7240
|
|
},
|
|
{
|
|
"entropy": 6.0579798221588135,
|
|
"epoch": 0.6397915930766513,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004965718941992021,
|
|
"loss": 5.8624,
|
|
"mean_token_accuracy": 0.1678209349513054,
|
|
"num_tokens": 13532891.0,
|
|
"step": 7245
|
|
},
|
|
{
|
|
"entropy": 6.108337450027466,
|
|
"epoch": 0.6402331331684917,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004965664157733162,
|
|
"loss": 5.9374,
|
|
"mean_token_accuracy": 0.16674156337976456,
|
|
"num_tokens": 13542409.0,
|
|
"step": 7250
|
|
},
|
|
{
|
|
"entropy": 5.97765622138977,
|
|
"epoch": 0.640674673260332,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004965609330070703,
|
|
"loss": 5.9069,
|
|
"mean_token_accuracy": 0.16326847672462463,
|
|
"num_tokens": 13551947.0,
|
|
"step": 7255
|
|
},
|
|
{
|
|
"entropy": 6.088966798782349,
|
|
"epoch": 0.6411162133521724,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004965554459005717,
|
|
"loss": 5.9212,
|
|
"mean_token_accuracy": 0.16018670350313186,
|
|
"num_tokens": 13560812.0,
|
|
"step": 7260
|
|
},
|
|
{
|
|
"entropy": 6.0528840065002445,
|
|
"epoch": 0.6415577534440127,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004965499544539278,
|
|
"loss": 5.9344,
|
|
"mean_token_accuracy": 0.162466648966074,
|
|
"num_tokens": 13570197.0,
|
|
"step": 7265
|
|
},
|
|
{
|
|
"entropy": 5.996675062179565,
|
|
"epoch": 0.641999293535853,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004965444586672463,
|
|
"loss": 5.8443,
|
|
"mean_token_accuracy": 0.16122349947690964,
|
|
"num_tokens": 13579614.0,
|
|
"step": 7270
|
|
},
|
|
{
|
|
"entropy": 6.066436815261841,
|
|
"epoch": 0.6424408336276934,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000496538958540635,
|
|
"loss": 5.8921,
|
|
"mean_token_accuracy": 0.16424771398305893,
|
|
"num_tokens": 13589511.0,
|
|
"step": 7275
|
|
},
|
|
{
|
|
"entropy": 6.036352348327637,
|
|
"epoch": 0.6428823737195337,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004965334540742015,
|
|
"loss": 5.8698,
|
|
"mean_token_accuracy": 0.1678945705294609,
|
|
"num_tokens": 13597718.0,
|
|
"step": 7280
|
|
},
|
|
{
|
|
"entropy": 6.0071446895599365,
|
|
"epoch": 0.6433239138113741,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004965279452680535,
|
|
"loss": 5.962,
|
|
"mean_token_accuracy": 0.15827914625406264,
|
|
"num_tokens": 13607599.0,
|
|
"step": 7285
|
|
},
|
|
{
|
|
"entropy": 6.15809268951416,
|
|
"epoch": 0.6437654539032144,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004965224321222991,
|
|
"loss": 6.0109,
|
|
"mean_token_accuracy": 0.1586283728480339,
|
|
"num_tokens": 13616513.0,
|
|
"step": 7290
|
|
},
|
|
{
|
|
"entropy": 6.08264536857605,
|
|
"epoch": 0.6442069939950548,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004965169146370462,
|
|
"loss": 5.8812,
|
|
"mean_token_accuracy": 0.16384359896183015,
|
|
"num_tokens": 13625505.0,
|
|
"step": 7295
|
|
},
|
|
{
|
|
"entropy": 6.07037844657898,
|
|
"epoch": 0.644648534086895,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004965113928124031,
|
|
"loss": 5.9287,
|
|
"mean_token_accuracy": 0.16326516717672349,
|
|
"num_tokens": 13635340.0,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"entropy": 5.908147239685059,
|
|
"epoch": 0.6450900741787354,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004965058666484775,
|
|
"loss": 5.8545,
|
|
"mean_token_accuracy": 0.16143952161073685,
|
|
"num_tokens": 13644886.0,
|
|
"step": 7305
|
|
},
|
|
{
|
|
"entropy": 5.967869806289673,
|
|
"epoch": 0.6455316142705758,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004965003361453782,
|
|
"loss": 5.8811,
|
|
"mean_token_accuracy": 0.16908676475286483,
|
|
"num_tokens": 13654093.0,
|
|
"step": 7310
|
|
},
|
|
{
|
|
"entropy": 6.116286134719848,
|
|
"epoch": 0.6459731543624161,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004964948013032132,
|
|
"loss": 5.9554,
|
|
"mean_token_accuracy": 0.1531238943338394,
|
|
"num_tokens": 13663095.0,
|
|
"step": 7315
|
|
},
|
|
{
|
|
"entropy": 6.0913684368133545,
|
|
"epoch": 0.6464146944542565,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000496489262122091,
|
|
"loss": 5.9779,
|
|
"mean_token_accuracy": 0.16273966878652574,
|
|
"num_tokens": 13673624.0,
|
|
"step": 7320
|
|
},
|
|
{
|
|
"entropy": 6.033328199386597,
|
|
"epoch": 0.6468562345460968,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004964837186021201,
|
|
"loss": 5.8581,
|
|
"mean_token_accuracy": 0.16317969858646392,
|
|
"num_tokens": 13683145.0,
|
|
"step": 7325
|
|
},
|
|
{
|
|
"entropy": 5.987786388397216,
|
|
"epoch": 0.6472977746379371,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004964781707434093,
|
|
"loss": 5.8507,
|
|
"mean_token_accuracy": 0.15873554050922395,
|
|
"num_tokens": 13691963.0,
|
|
"step": 7330
|
|
},
|
|
{
|
|
"entropy": 5.932110071182251,
|
|
"epoch": 0.6477393147297774,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004964726185460669,
|
|
"loss": 5.7666,
|
|
"mean_token_accuracy": 0.1703936919569969,
|
|
"num_tokens": 13700926.0,
|
|
"step": 7335
|
|
},
|
|
{
|
|
"entropy": 6.086005926132202,
|
|
"epoch": 0.6481808548216178,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004964670620102019,
|
|
"loss": 5.9927,
|
|
"mean_token_accuracy": 0.15364788472652435,
|
|
"num_tokens": 13709824.0,
|
|
"step": 7340
|
|
},
|
|
{
|
|
"entropy": 6.05243673324585,
|
|
"epoch": 0.6486223949134582,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004964615011359231,
|
|
"loss": 5.8899,
|
|
"mean_token_accuracy": 0.16376349031925203,
|
|
"num_tokens": 13718789.0,
|
|
"step": 7345
|
|
},
|
|
{
|
|
"entropy": 6.070804214477539,
|
|
"epoch": 0.6490639350052985,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004964559359233395,
|
|
"loss": 5.9055,
|
|
"mean_token_accuracy": 0.16012013852596282,
|
|
"num_tokens": 13728180.0,
|
|
"step": 7350
|
|
},
|
|
{
|
|
"entropy": 6.069525623321534,
|
|
"epoch": 0.6495054750971389,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004964503663725601,
|
|
"loss": 5.9942,
|
|
"mean_token_accuracy": 0.1554926410317421,
|
|
"num_tokens": 13737819.0,
|
|
"step": 7355
|
|
},
|
|
{
|
|
"entropy": 6.066304063796997,
|
|
"epoch": 0.6499470151889791,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004964447924836938,
|
|
"loss": 5.9771,
|
|
"mean_token_accuracy": 0.1574745424091816,
|
|
"num_tokens": 13747519.0,
|
|
"step": 7360
|
|
},
|
|
{
|
|
"entropy": 6.091936492919922,
|
|
"epoch": 0.6503885552808195,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004964392142568501,
|
|
"loss": 5.9718,
|
|
"mean_token_accuracy": 0.15019024461507796,
|
|
"num_tokens": 13757821.0,
|
|
"step": 7365
|
|
},
|
|
{
|
|
"entropy": 5.972079277038574,
|
|
"epoch": 0.6508300953726598,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004964336316921381,
|
|
"loss": 5.7573,
|
|
"mean_token_accuracy": 0.16695429980754853,
|
|
"num_tokens": 13766560.0,
|
|
"step": 7370
|
|
},
|
|
{
|
|
"entropy": 6.025388288497925,
|
|
"epoch": 0.6512716354645002,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.0004964280447896672,
|
|
"loss": 5.8072,
|
|
"mean_token_accuracy": 0.16919275969266892,
|
|
"num_tokens": 13775560.0,
|
|
"step": 7375
|
|
},
|
|
{
|
|
"entropy": 5.9751911640167235,
|
|
"epoch": 0.6517131755563406,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004964224535495468,
|
|
"loss": 5.8937,
|
|
"mean_token_accuracy": 0.16387373208999634,
|
|
"num_tokens": 13784391.0,
|
|
"step": 7380
|
|
},
|
|
{
|
|
"entropy": 5.957211112976074,
|
|
"epoch": 0.6521547156481808,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004964168579718864,
|
|
"loss": 5.869,
|
|
"mean_token_accuracy": 0.1669353127479553,
|
|
"num_tokens": 13794008.0,
|
|
"step": 7385
|
|
},
|
|
{
|
|
"entropy": 6.05711817741394,
|
|
"epoch": 0.6525962557400212,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004964112580567959,
|
|
"loss": 5.9144,
|
|
"mean_token_accuracy": 0.159195776283741,
|
|
"num_tokens": 13804321.0,
|
|
"step": 7390
|
|
},
|
|
{
|
|
"entropy": 6.070773077011109,
|
|
"epoch": 0.6530377958318615,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004964056538043845,
|
|
"loss": 5.9157,
|
|
"mean_token_accuracy": 0.15635670274496077,
|
|
"num_tokens": 13813492.0,
|
|
"step": 7395
|
|
},
|
|
{
|
|
"entropy": 6.008130598068237,
|
|
"epoch": 0.6534793359237019,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004964000452147624,
|
|
"loss": 5.9007,
|
|
"mean_token_accuracy": 0.169648377597332,
|
|
"num_tokens": 13822471.0,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"entropy": 5.992099714279175,
|
|
"epoch": 0.6539208760155422,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004963944322880394,
|
|
"loss": 5.96,
|
|
"mean_token_accuracy": 0.15306852012872696,
|
|
"num_tokens": 13832789.0,
|
|
"step": 7405
|
|
},
|
|
{
|
|
"entropy": 6.087279605865478,
|
|
"epoch": 0.6543624161073825,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004963888150243253,
|
|
"loss": 5.9298,
|
|
"mean_token_accuracy": 0.1591775357723236,
|
|
"num_tokens": 13842230.0,
|
|
"step": 7410
|
|
},
|
|
{
|
|
"entropy": 6.0844536304473875,
|
|
"epoch": 0.6548039561992229,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004963831934237302,
|
|
"loss": 5.8461,
|
|
"mean_token_accuracy": 0.16865865141153336,
|
|
"num_tokens": 13851542.0,
|
|
"step": 7415
|
|
},
|
|
{
|
|
"entropy": 6.002407360076904,
|
|
"epoch": 0.6552454962910632,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004963775674863643,
|
|
"loss": 5.8748,
|
|
"mean_token_accuracy": 0.16350056529045104,
|
|
"num_tokens": 13860753.0,
|
|
"step": 7420
|
|
},
|
|
{
|
|
"entropy": 6.004277229309082,
|
|
"epoch": 0.6556870363829036,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004963719372123378,
|
|
"loss": 5.862,
|
|
"mean_token_accuracy": 0.1674501046538353,
|
|
"num_tokens": 13870133.0,
|
|
"step": 7425
|
|
},
|
|
{
|
|
"entropy": 6.038403463363648,
|
|
"epoch": 0.6561285764747439,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004963663026017609,
|
|
"loss": 5.941,
|
|
"mean_token_accuracy": 0.16310749650001527,
|
|
"num_tokens": 13879086.0,
|
|
"step": 7430
|
|
},
|
|
{
|
|
"entropy": 6.047044372558593,
|
|
"epoch": 0.6565701165665843,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004963606636547441,
|
|
"loss": 5.8789,
|
|
"mean_token_accuracy": 0.16871441453695296,
|
|
"num_tokens": 13888706.0,
|
|
"step": 7435
|
|
},
|
|
{
|
|
"entropy": 6.038377714157105,
|
|
"epoch": 0.6570116566584245,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004963550203713976,
|
|
"loss": 5.9301,
|
|
"mean_token_accuracy": 0.15731319785118103,
|
|
"num_tokens": 13897455.0,
|
|
"step": 7440
|
|
},
|
|
{
|
|
"entropy": 6.029115009307861,
|
|
"epoch": 0.6574531967502649,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004963493727518323,
|
|
"loss": 5.9129,
|
|
"mean_token_accuracy": 0.16222378611564636,
|
|
"num_tokens": 13907697.0,
|
|
"step": 7445
|
|
},
|
|
{
|
|
"entropy": 5.996784257888794,
|
|
"epoch": 0.6578947368421053,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004963437207961585,
|
|
"loss": 5.9135,
|
|
"mean_token_accuracy": 0.16317444592714309,
|
|
"num_tokens": 13917124.0,
|
|
"step": 7450
|
|
},
|
|
{
|
|
"entropy": 6.048440885543823,
|
|
"epoch": 0.6583362769339456,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0004963380645044874,
|
|
"loss": 5.8641,
|
|
"mean_token_accuracy": 0.16341730505228041,
|
|
"num_tokens": 13926434.0,
|
|
"step": 7455
|
|
},
|
|
{
|
|
"entropy": 5.950565671920776,
|
|
"epoch": 0.658777817025786,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004963324038769292,
|
|
"loss": 5.8284,
|
|
"mean_token_accuracy": 0.16859227418899536,
|
|
"num_tokens": 13934883.0,
|
|
"step": 7460
|
|
},
|
|
{
|
|
"entropy": 6.026736736297607,
|
|
"epoch": 0.6592193571176262,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004963267389135953,
|
|
"loss": 5.9366,
|
|
"mean_token_accuracy": 0.16433704197406768,
|
|
"num_tokens": 13944514.0,
|
|
"step": 7465
|
|
},
|
|
{
|
|
"entropy": 6.020615196228027,
|
|
"epoch": 0.6596608972094666,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004963210696145964,
|
|
"loss": 5.8521,
|
|
"mean_token_accuracy": 0.16204071938991546,
|
|
"num_tokens": 13953434.0,
|
|
"step": 7470
|
|
},
|
|
{
|
|
"entropy": 6.0246869087219235,
|
|
"epoch": 0.6601024373013069,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004963153959800436,
|
|
"loss": 5.8636,
|
|
"mean_token_accuracy": 0.16973265260457993,
|
|
"num_tokens": 13962493.0,
|
|
"step": 7475
|
|
},
|
|
{
|
|
"entropy": 6.054371166229248,
|
|
"epoch": 0.6605439773931473,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.000496309718010048,
|
|
"loss": 5.9044,
|
|
"mean_token_accuracy": 0.1610116109251976,
|
|
"num_tokens": 13971193.0,
|
|
"step": 7480
|
|
},
|
|
{
|
|
"entropy": 5.995823526382447,
|
|
"epoch": 0.6609855174849877,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004963040357047209,
|
|
"loss": 5.9127,
|
|
"mean_token_accuracy": 0.16409173905849456,
|
|
"num_tokens": 13981829.0,
|
|
"step": 7485
|
|
},
|
|
{
|
|
"entropy": 5.979941177368164,
|
|
"epoch": 0.661427057576828,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004962983490641737,
|
|
"loss": 5.8615,
|
|
"mean_token_accuracy": 0.1629578024148941,
|
|
"num_tokens": 13990262.0,
|
|
"step": 7490
|
|
},
|
|
{
|
|
"entropy": 5.9936957359313965,
|
|
"epoch": 0.6618685976686683,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004962926580885177,
|
|
"loss": 5.9007,
|
|
"mean_token_accuracy": 0.16571170836687088,
|
|
"num_tokens": 14000100.0,
|
|
"step": 7495
|
|
},
|
|
{
|
|
"entropy": 6.05236439704895,
|
|
"epoch": 0.6623101377605086,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004962869627778642,
|
|
"loss": 5.9336,
|
|
"mean_token_accuracy": 0.16470786482095717,
|
|
"num_tokens": 14009330.0,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"entropy": 5.993536043167114,
|
|
"epoch": 0.662751677852349,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004962812631323252,
|
|
"loss": 5.8976,
|
|
"mean_token_accuracy": 0.15885231047868728,
|
|
"num_tokens": 14018109.0,
|
|
"step": 7505
|
|
},
|
|
{
|
|
"entropy": 6.01054573059082,
|
|
"epoch": 0.6631932179441893,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000496275559152012,
|
|
"loss": 5.7961,
|
|
"mean_token_accuracy": 0.1680995926260948,
|
|
"num_tokens": 14027318.0,
|
|
"step": 7510
|
|
},
|
|
{
|
|
"entropy": 6.015023469924927,
|
|
"epoch": 0.6636347580360297,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004962698508370365,
|
|
"loss": 5.9339,
|
|
"mean_token_accuracy": 0.15832750350236893,
|
|
"num_tokens": 14037061.0,
|
|
"step": 7515
|
|
},
|
|
{
|
|
"entropy": 5.948602676391602,
|
|
"epoch": 0.6640762981278701,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004962641381875103,
|
|
"loss": 5.8786,
|
|
"mean_token_accuracy": 0.15682309120893478,
|
|
"num_tokens": 14046008.0,
|
|
"step": 7520
|
|
},
|
|
{
|
|
"entropy": 6.026252365112304,
|
|
"epoch": 0.6645178382197103,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004962584212035456,
|
|
"loss": 5.8877,
|
|
"mean_token_accuracy": 0.16163944005966185,
|
|
"num_tokens": 14054535.0,
|
|
"step": 7525
|
|
},
|
|
{
|
|
"entropy": 6.092637443542481,
|
|
"epoch": 0.6649593783115507,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004962526998852543,
|
|
"loss": 5.8989,
|
|
"mean_token_accuracy": 0.16449156999588013,
|
|
"num_tokens": 14063680.0,
|
|
"step": 7530
|
|
},
|
|
{
|
|
"entropy": 6.065542888641358,
|
|
"epoch": 0.665400918403391,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004962469742327483,
|
|
"loss": 5.9386,
|
|
"mean_token_accuracy": 0.16228194236755372,
|
|
"num_tokens": 14072206.0,
|
|
"step": 7535
|
|
},
|
|
{
|
|
"entropy": 6.043942975997925,
|
|
"epoch": 0.6658424584952314,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00049624124424614,
|
|
"loss": 5.9271,
|
|
"mean_token_accuracy": 0.16253331303596497,
|
|
"num_tokens": 14081943.0,
|
|
"step": 7540
|
|
},
|
|
{
|
|
"entropy": 6.0551399230957035,
|
|
"epoch": 0.6662839985870717,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004962355099255415,
|
|
"loss": 5.8902,
|
|
"mean_token_accuracy": 0.16782213896512985,
|
|
"num_tokens": 14091695.0,
|
|
"step": 7545
|
|
},
|
|
{
|
|
"entropy": 5.9793314933776855,
|
|
"epoch": 0.666725538678912,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004962297712710653,
|
|
"loss": 5.815,
|
|
"mean_token_accuracy": 0.17347454726696016,
|
|
"num_tokens": 14100811.0,
|
|
"step": 7550
|
|
},
|
|
{
|
|
"entropy": 6.09973931312561,
|
|
"epoch": 0.6671670787707524,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004962240282828236,
|
|
"loss": 5.9826,
|
|
"mean_token_accuracy": 0.14768042415380478,
|
|
"num_tokens": 14109818.0,
|
|
"step": 7555
|
|
},
|
|
{
|
|
"entropy": 6.081058645248413,
|
|
"epoch": 0.6676086188625927,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.000496218280960929,
|
|
"loss": 5.9184,
|
|
"mean_token_accuracy": 0.15994459986686707,
|
|
"num_tokens": 14119811.0,
|
|
"step": 7560
|
|
},
|
|
{
|
|
"entropy": 6.110054540634155,
|
|
"epoch": 0.6680501589544331,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004962125293054941,
|
|
"loss": 5.8684,
|
|
"mean_token_accuracy": 0.16850226670503615,
|
|
"num_tokens": 14128963.0,
|
|
"step": 7565
|
|
},
|
|
{
|
|
"entropy": 5.9815596580505375,
|
|
"epoch": 0.6684916990462734,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004962067733166317,
|
|
"loss": 5.9451,
|
|
"mean_token_accuracy": 0.15114885419607163,
|
|
"num_tokens": 14138532.0,
|
|
"step": 7570
|
|
},
|
|
{
|
|
"entropy": 5.973617649078369,
|
|
"epoch": 0.6689332391381138,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004962010129944542,
|
|
"loss": 5.8884,
|
|
"mean_token_accuracy": 0.1637090712785721,
|
|
"num_tokens": 14148107.0,
|
|
"step": 7575
|
|
},
|
|
{
|
|
"entropy": 6.079353713989258,
|
|
"epoch": 0.669374779229954,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004961952483390747,
|
|
"loss": 5.9456,
|
|
"mean_token_accuracy": 0.15459064096212388,
|
|
"num_tokens": 14158823.0,
|
|
"step": 7580
|
|
},
|
|
{
|
|
"entropy": 6.022123670578003,
|
|
"epoch": 0.6698163193217944,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004961894793506062,
|
|
"loss": 5.8854,
|
|
"mean_token_accuracy": 0.1703451931476593,
|
|
"num_tokens": 14168147.0,
|
|
"step": 7585
|
|
},
|
|
{
|
|
"entropy": 5.947274494171142,
|
|
"epoch": 0.6702578594136348,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004961837060291615,
|
|
"loss": 5.9241,
|
|
"mean_token_accuracy": 0.1599956676363945,
|
|
"num_tokens": 14177123.0,
|
|
"step": 7590
|
|
},
|
|
{
|
|
"entropy": 6.023081350326538,
|
|
"epoch": 0.6706993995054751,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004961779283748538,
|
|
"loss": 5.908,
|
|
"mean_token_accuracy": 0.1616277053952217,
|
|
"num_tokens": 14186614.0,
|
|
"step": 7595
|
|
},
|
|
{
|
|
"entropy": 6.018943977355957,
|
|
"epoch": 0.6711409395973155,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004961721463877963,
|
|
"loss": 5.9096,
|
|
"mean_token_accuracy": 0.16158800423145295,
|
|
"num_tokens": 14195310.0,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"entropy": 6.002673530578614,
|
|
"epoch": 0.6715824796891557,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004961663600681023,
|
|
"loss": 5.8765,
|
|
"mean_token_accuracy": 0.16197859048843383,
|
|
"num_tokens": 14204321.0,
|
|
"step": 7605
|
|
},
|
|
{
|
|
"entropy": 5.991839742660522,
|
|
"epoch": 0.6720240197809961,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000496160569415885,
|
|
"loss": 5.9103,
|
|
"mean_token_accuracy": 0.16719352900981904,
|
|
"num_tokens": 14213615.0,
|
|
"step": 7610
|
|
},
|
|
{
|
|
"entropy": 6.088936424255371,
|
|
"epoch": 0.6724655598728364,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004961547744312579,
|
|
"loss": 5.785,
|
|
"mean_token_accuracy": 0.17092676907777787,
|
|
"num_tokens": 14222715.0,
|
|
"step": 7615
|
|
},
|
|
{
|
|
"entropy": 6.0617317199707035,
|
|
"epoch": 0.6729070999646768,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004961489751143346,
|
|
"loss": 5.9181,
|
|
"mean_token_accuracy": 0.16617656946182252,
|
|
"num_tokens": 14232117.0,
|
|
"step": 7620
|
|
},
|
|
{
|
|
"entropy": 5.968132638931275,
|
|
"epoch": 0.6733486400565172,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004961431714652287,
|
|
"loss": 5.7999,
|
|
"mean_token_accuracy": 0.1727928102016449,
|
|
"num_tokens": 14240606.0,
|
|
"step": 7625
|
|
},
|
|
{
|
|
"entropy": 6.0396177768707275,
|
|
"epoch": 0.6737901801483575,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004961373634840538,
|
|
"loss": 5.9223,
|
|
"mean_token_accuracy": 0.15404981672763823,
|
|
"num_tokens": 14249844.0,
|
|
"step": 7630
|
|
},
|
|
{
|
|
"entropy": 6.007670164108276,
|
|
"epoch": 0.6742317202401978,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004961315511709237,
|
|
"loss": 5.9178,
|
|
"mean_token_accuracy": 0.1644563376903534,
|
|
"num_tokens": 14259231.0,
|
|
"step": 7635
|
|
},
|
|
{
|
|
"entropy": 6.081117105484009,
|
|
"epoch": 0.6746732603320381,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004961257345259524,
|
|
"loss": 5.9229,
|
|
"mean_token_accuracy": 0.16527654528617858,
|
|
"num_tokens": 14268508.0,
|
|
"step": 7640
|
|
},
|
|
{
|
|
"entropy": 6.0169459819793705,
|
|
"epoch": 0.6751148004238785,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004961199135492535,
|
|
"loss": 5.839,
|
|
"mean_token_accuracy": 0.17000525146722795,
|
|
"num_tokens": 14278057.0,
|
|
"step": 7645
|
|
},
|
|
{
|
|
"entropy": 5.990185642242432,
|
|
"epoch": 0.6755563405157188,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004961140882409414,
|
|
"loss": 5.9137,
|
|
"mean_token_accuracy": 0.16065546423196791,
|
|
"num_tokens": 14287225.0,
|
|
"step": 7650
|
|
},
|
|
{
|
|
"entropy": 6.092997264862061,
|
|
"epoch": 0.6759978806075592,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00049610825860113,
|
|
"loss": 5.8698,
|
|
"mean_token_accuracy": 0.16446682810783386,
|
|
"num_tokens": 14296529.0,
|
|
"step": 7655
|
|
},
|
|
{
|
|
"entropy": 5.976733684539795,
|
|
"epoch": 0.6764394206993996,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004961024246299336,
|
|
"loss": 5.8333,
|
|
"mean_token_accuracy": 0.16461792439222336,
|
|
"num_tokens": 14305675.0,
|
|
"step": 7660
|
|
},
|
|
{
|
|
"entropy": 5.9833447456359865,
|
|
"epoch": 0.6768809607912398,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004960965863274665,
|
|
"loss": 5.7966,
|
|
"mean_token_accuracy": 0.16425360292196273,
|
|
"num_tokens": 14313632.0,
|
|
"step": 7665
|
|
},
|
|
{
|
|
"entropy": 5.937602996826172,
|
|
"epoch": 0.6773225008830802,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.000496090743693843,
|
|
"loss": 5.9136,
|
|
"mean_token_accuracy": 0.16260362714529036,
|
|
"num_tokens": 14323353.0,
|
|
"step": 7670
|
|
},
|
|
{
|
|
"entropy": 6.1345940113067625,
|
|
"epoch": 0.6777640409749205,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004960848967291775,
|
|
"loss": 5.9051,
|
|
"mean_token_accuracy": 0.1604089170694351,
|
|
"num_tokens": 14331953.0,
|
|
"step": 7675
|
|
},
|
|
{
|
|
"entropy": 6.091330718994141,
|
|
"epoch": 0.6782055810667609,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004960790454335846,
|
|
"loss": 5.9563,
|
|
"mean_token_accuracy": 0.15599895119667054,
|
|
"num_tokens": 14342456.0,
|
|
"step": 7680
|
|
},
|
|
{
|
|
"entropy": 5.974487972259522,
|
|
"epoch": 0.6786471211586012,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.000496073189807179,
|
|
"loss": 5.7375,
|
|
"mean_token_accuracy": 0.17251136004924775,
|
|
"num_tokens": 14351013.0,
|
|
"step": 7685
|
|
},
|
|
{
|
|
"entropy": 6.053620958328247,
|
|
"epoch": 0.6790886612504415,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004960673298500753,
|
|
"loss": 6.0012,
|
|
"mean_token_accuracy": 0.1551806792616844,
|
|
"num_tokens": 14360887.0,
|
|
"step": 7690
|
|
},
|
|
{
|
|
"entropy": 6.065280342102051,
|
|
"epoch": 0.6795302013422819,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004960614655623883,
|
|
"loss": 5.9223,
|
|
"mean_token_accuracy": 0.1608433708548546,
|
|
"num_tokens": 14370510.0,
|
|
"step": 7695
|
|
},
|
|
{
|
|
"entropy": 5.98971471786499,
|
|
"epoch": 0.6799717414341222,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.000496055596944233,
|
|
"loss": 5.9103,
|
|
"mean_token_accuracy": 0.16423211097717286,
|
|
"num_tokens": 14379315.0,
|
|
"step": 7700
|
|
},
|
|
{
|
|
"entropy": 6.040924167633056,
|
|
"epoch": 0.6804132815259626,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004960497239957243,
|
|
"loss": 5.8872,
|
|
"mean_token_accuracy": 0.159690023958683,
|
|
"num_tokens": 14388537.0,
|
|
"step": 7705
|
|
},
|
|
{
|
|
"entropy": 6.125636291503906,
|
|
"epoch": 0.6808548216178029,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004960438467169772,
|
|
"loss": 5.929,
|
|
"mean_token_accuracy": 0.16286856234073638,
|
|
"num_tokens": 14398347.0,
|
|
"step": 7710
|
|
},
|
|
{
|
|
"entropy": 6.0843593120574955,
|
|
"epoch": 0.6812963617096433,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004960379651081068,
|
|
"loss": 5.855,
|
|
"mean_token_accuracy": 0.16956682354211808,
|
|
"num_tokens": 14407304.0,
|
|
"step": 7715
|
|
},
|
|
{
|
|
"entropy": 5.984658145904541,
|
|
"epoch": 0.6817379018014835,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004960320791692283,
|
|
"loss": 5.875,
|
|
"mean_token_accuracy": 0.162070694565773,
|
|
"num_tokens": 14416641.0,
|
|
"step": 7720
|
|
},
|
|
{
|
|
"entropy": 6.045535898208618,
|
|
"epoch": 0.6821794418933239,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004960261889004572,
|
|
"loss": 5.9795,
|
|
"mean_token_accuracy": 0.15513098388910293,
|
|
"num_tokens": 14426095.0,
|
|
"step": 7725
|
|
},
|
|
{
|
|
"entropy": 6.097327518463135,
|
|
"epoch": 0.6826209819851643,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004960202943019088,
|
|
"loss": 5.8766,
|
|
"mean_token_accuracy": 0.16527304351329802,
|
|
"num_tokens": 14434563.0,
|
|
"step": 7730
|
|
},
|
|
{
|
|
"entropy": 5.9518779754638675,
|
|
"epoch": 0.6830625220770046,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004960143953736984,
|
|
"loss": 5.7974,
|
|
"mean_token_accuracy": 0.17335865646600723,
|
|
"num_tokens": 14444319.0,
|
|
"step": 7735
|
|
},
|
|
{
|
|
"entropy": 5.9988618850708,
|
|
"epoch": 0.683504062168845,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004960084921159419,
|
|
"loss": 5.8591,
|
|
"mean_token_accuracy": 0.167277492582798,
|
|
"num_tokens": 14453381.0,
|
|
"step": 7740
|
|
},
|
|
{
|
|
"entropy": 5.874141550064087,
|
|
"epoch": 0.6839456022606852,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004960025845287546,
|
|
"loss": 5.8492,
|
|
"mean_token_accuracy": 0.1701892837882042,
|
|
"num_tokens": 14462660.0,
|
|
"step": 7745
|
|
},
|
|
{
|
|
"entropy": 5.956099653244019,
|
|
"epoch": 0.6843871423525256,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004959966726122523,
|
|
"loss": 5.7854,
|
|
"mean_token_accuracy": 0.17432601302862166,
|
|
"num_tokens": 14471820.0,
|
|
"step": 7750
|
|
},
|
|
{
|
|
"entropy": 6.101538801193238,
|
|
"epoch": 0.6848286824443659,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.000495990756366551,
|
|
"loss": 5.9344,
|
|
"mean_token_accuracy": 0.1623198062181473,
|
|
"num_tokens": 14481663.0,
|
|
"step": 7755
|
|
},
|
|
{
|
|
"entropy": 6.123762798309326,
|
|
"epoch": 0.6852702225362063,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004959848357917664,
|
|
"loss": 5.9611,
|
|
"mean_token_accuracy": 0.1574322536587715,
|
|
"num_tokens": 14491515.0,
|
|
"step": 7760
|
|
},
|
|
{
|
|
"entropy": 6.004442834854126,
|
|
"epoch": 0.6857117626280467,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004959789108880145,
|
|
"loss": 5.7873,
|
|
"mean_token_accuracy": 0.164418064057827,
|
|
"num_tokens": 14500852.0,
|
|
"step": 7765
|
|
},
|
|
{
|
|
"entropy": 6.017218780517578,
|
|
"epoch": 0.686153302719887,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004959729816554115,
|
|
"loss": 5.8648,
|
|
"mean_token_accuracy": 0.16977741122245787,
|
|
"num_tokens": 14509771.0,
|
|
"step": 7770
|
|
},
|
|
{
|
|
"entropy": 5.983751487731934,
|
|
"epoch": 0.6865948428117273,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004959670480940734,
|
|
"loss": 5.8261,
|
|
"mean_token_accuracy": 0.16761362850666045,
|
|
"num_tokens": 14519377.0,
|
|
"step": 7775
|
|
},
|
|
{
|
|
"entropy": 5.9449629306793215,
|
|
"epoch": 0.6870363829035676,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004959611102041165,
|
|
"loss": 5.8723,
|
|
"mean_token_accuracy": 0.16630145758390427,
|
|
"num_tokens": 14528998.0,
|
|
"step": 7780
|
|
},
|
|
{
|
|
"entropy": 6.026974439620972,
|
|
"epoch": 0.687477922995408,
|
|
"grad_norm": 2.390625,
|
|
"learning_rate": 0.0004959551679856571,
|
|
"loss": 5.9144,
|
|
"mean_token_accuracy": 0.1634315237402916,
|
|
"num_tokens": 14538153.0,
|
|
"step": 7785
|
|
},
|
|
{
|
|
"entropy": 6.042395067214966,
|
|
"epoch": 0.6879194630872483,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004959492214388117,
|
|
"loss": 5.8503,
|
|
"mean_token_accuracy": 0.16629589945077897,
|
|
"num_tokens": 14547631.0,
|
|
"step": 7790
|
|
},
|
|
{
|
|
"entropy": 5.997350263595581,
|
|
"epoch": 0.6883610031790887,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004959432705636968,
|
|
"loss": 5.7851,
|
|
"mean_token_accuracy": 0.171341934800148,
|
|
"num_tokens": 14557685.0,
|
|
"step": 7795
|
|
},
|
|
{
|
|
"entropy": 6.041592073440552,
|
|
"epoch": 0.688802543270929,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004959373153604287,
|
|
"loss": 5.9302,
|
|
"mean_token_accuracy": 0.16370871067047119,
|
|
"num_tokens": 14567360.0,
|
|
"step": 7800
|
|
},
|
|
{
|
|
"entropy": 6.000632572174072,
|
|
"epoch": 0.6892440833627693,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004959313558291243,
|
|
"loss": 5.8529,
|
|
"mean_token_accuracy": 0.1705251455307007,
|
|
"num_tokens": 14576916.0,
|
|
"step": 7805
|
|
},
|
|
{
|
|
"entropy": 6.014446067810058,
|
|
"epoch": 0.6896856234546097,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004959253919699004,
|
|
"loss": 5.7764,
|
|
"mean_token_accuracy": 0.17379749268293382,
|
|
"num_tokens": 14586716.0,
|
|
"step": 7810
|
|
},
|
|
{
|
|
"entropy": 5.978111124038696,
|
|
"epoch": 0.69012716354645,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004959194237828735,
|
|
"loss": 5.8253,
|
|
"mean_token_accuracy": 0.1650472640991211,
|
|
"num_tokens": 14595690.0,
|
|
"step": 7815
|
|
},
|
|
{
|
|
"entropy": 5.980210590362549,
|
|
"epoch": 0.6905687036382904,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004959134512681609,
|
|
"loss": 5.8995,
|
|
"mean_token_accuracy": 0.1592855393886566,
|
|
"num_tokens": 14605481.0,
|
|
"step": 7820
|
|
},
|
|
{
|
|
"entropy": 6.018238973617554,
|
|
"epoch": 0.6910102437301306,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004959074744258794,
|
|
"loss": 5.9594,
|
|
"mean_token_accuracy": 0.15670410841703414,
|
|
"num_tokens": 14615242.0,
|
|
"step": 7825
|
|
},
|
|
{
|
|
"entropy": 6.039451837539673,
|
|
"epoch": 0.691451783821971,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004959014932561461,
|
|
"loss": 5.7923,
|
|
"mean_token_accuracy": 0.16917330920696258,
|
|
"num_tokens": 14624343.0,
|
|
"step": 7830
|
|
},
|
|
{
|
|
"entropy": 5.9386570930480955,
|
|
"epoch": 0.6918933239138114,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004958955077590782,
|
|
"loss": 5.7665,
|
|
"mean_token_accuracy": 0.17301661819219588,
|
|
"num_tokens": 14633014.0,
|
|
"step": 7835
|
|
},
|
|
{
|
|
"entropy": 5.955418539047241,
|
|
"epoch": 0.6923348640056517,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004958895179347929,
|
|
"loss": 5.8864,
|
|
"mean_token_accuracy": 0.1625771328806877,
|
|
"num_tokens": 14642796.0,
|
|
"step": 7840
|
|
},
|
|
{
|
|
"entropy": 6.032704639434814,
|
|
"epoch": 0.6927764040974921,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004958835237834075,
|
|
"loss": 5.9372,
|
|
"mean_token_accuracy": 0.15462349355220795,
|
|
"num_tokens": 14652388.0,
|
|
"step": 7845
|
|
},
|
|
{
|
|
"entropy": 6.132510662078857,
|
|
"epoch": 0.6932179441893324,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004958775253050396,
|
|
"loss": 5.8631,
|
|
"mean_token_accuracy": 0.16805050820112227,
|
|
"num_tokens": 14661219.0,
|
|
"step": 7850
|
|
},
|
|
{
|
|
"entropy": 6.07625560760498,
|
|
"epoch": 0.6936594842811727,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004958715224998066,
|
|
"loss": 5.885,
|
|
"mean_token_accuracy": 0.16949716359376907,
|
|
"num_tokens": 14670878.0,
|
|
"step": 7855
|
|
},
|
|
{
|
|
"entropy": 5.988736534118653,
|
|
"epoch": 0.694101024373013,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004958655153678261,
|
|
"loss": 5.9343,
|
|
"mean_token_accuracy": 0.1602768912911415,
|
|
"num_tokens": 14680893.0,
|
|
"step": 7860
|
|
},
|
|
{
|
|
"entropy": 6.038381433486938,
|
|
"epoch": 0.6945425644648534,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004958595039092156,
|
|
"loss": 5.8472,
|
|
"mean_token_accuracy": 0.16352402567863464,
|
|
"num_tokens": 14689249.0,
|
|
"step": 7865
|
|
},
|
|
{
|
|
"entropy": 6.069445562362671,
|
|
"epoch": 0.6949841045566938,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004958534881240932,
|
|
"loss": 5.8468,
|
|
"mean_token_accuracy": 0.16557328253984452,
|
|
"num_tokens": 14698724.0,
|
|
"step": 7870
|
|
},
|
|
{
|
|
"entropy": 6.03548789024353,
|
|
"epoch": 0.6954256446485341,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004958474680125766,
|
|
"loss": 5.7999,
|
|
"mean_token_accuracy": 0.1614638313651085,
|
|
"num_tokens": 14707756.0,
|
|
"step": 7875
|
|
},
|
|
{
|
|
"entropy": 5.9449488639831545,
|
|
"epoch": 0.6958671847403745,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004958414435747837,
|
|
"loss": 5.8556,
|
|
"mean_token_accuracy": 0.16810656636953353,
|
|
"num_tokens": 14717416.0,
|
|
"step": 7880
|
|
},
|
|
{
|
|
"entropy": 5.9693653106689455,
|
|
"epoch": 0.6963087248322147,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004958354148108324,
|
|
"loss": 5.8907,
|
|
"mean_token_accuracy": 0.16461798548698425,
|
|
"num_tokens": 14727321.0,
|
|
"step": 7885
|
|
},
|
|
{
|
|
"entropy": 5.969771814346314,
|
|
"epoch": 0.6967502649240551,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 0.000495829381720841,
|
|
"loss": 5.7638,
|
|
"mean_token_accuracy": 0.17590563893318176,
|
|
"num_tokens": 14736298.0,
|
|
"step": 7890
|
|
},
|
|
{
|
|
"entropy": 6.0774918556213375,
|
|
"epoch": 0.6971918050158954,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004958233443049275,
|
|
"loss": 5.8168,
|
|
"mean_token_accuracy": 0.1612874150276184,
|
|
"num_tokens": 14745893.0,
|
|
"step": 7895
|
|
},
|
|
{
|
|
"entropy": 5.994456005096436,
|
|
"epoch": 0.6976333451077358,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004958173025632103,
|
|
"loss": 5.8764,
|
|
"mean_token_accuracy": 0.1590081751346588,
|
|
"num_tokens": 14755277.0,
|
|
"step": 7900
|
|
},
|
|
{
|
|
"entropy": 6.05733437538147,
|
|
"epoch": 0.6980748851995762,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004958112564958079,
|
|
"loss": 5.99,
|
|
"mean_token_accuracy": 0.15567466169595717,
|
|
"num_tokens": 14764369.0,
|
|
"step": 7905
|
|
},
|
|
{
|
|
"entropy": 6.000176811218262,
|
|
"epoch": 0.6985164252914164,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004958052061028385,
|
|
"loss": 5.8249,
|
|
"mean_token_accuracy": 0.17366339713335038,
|
|
"num_tokens": 14772970.0,
|
|
"step": 7910
|
|
},
|
|
{
|
|
"entropy": 6.0693926334381105,
|
|
"epoch": 0.6989579653832568,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004957991513844205,
|
|
"loss": 5.9008,
|
|
"mean_token_accuracy": 0.1680343359708786,
|
|
"num_tokens": 14782862.0,
|
|
"step": 7915
|
|
},
|
|
{
|
|
"entropy": 6.051274967193604,
|
|
"epoch": 0.6993995054750971,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004957930923406729,
|
|
"loss": 5.8969,
|
|
"mean_token_accuracy": 0.16636960506439208,
|
|
"num_tokens": 14792042.0,
|
|
"step": 7920
|
|
},
|
|
{
|
|
"entropy": 5.996286153793335,
|
|
"epoch": 0.6998410455669375,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004957870289717142,
|
|
"loss": 5.8408,
|
|
"mean_token_accuracy": 0.16046467274427414,
|
|
"num_tokens": 14801032.0,
|
|
"step": 7925
|
|
},
|
|
{
|
|
"entropy": 6.02137451171875,
|
|
"epoch": 0.7002825856587778,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004957809612776631,
|
|
"loss": 5.8596,
|
|
"mean_token_accuracy": 0.16641456931829451,
|
|
"num_tokens": 14810594.0,
|
|
"step": 7930
|
|
},
|
|
{
|
|
"entropy": 6.00620470046997,
|
|
"epoch": 0.7007241257506182,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004957748892586384,
|
|
"loss": 5.9501,
|
|
"mean_token_accuracy": 0.165498748421669,
|
|
"num_tokens": 14820570.0,
|
|
"step": 7935
|
|
},
|
|
{
|
|
"entropy": 5.895208406448364,
|
|
"epoch": 0.7011656658424585,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004957688129147594,
|
|
"loss": 5.7706,
|
|
"mean_token_accuracy": 0.1790166676044464,
|
|
"num_tokens": 14830078.0,
|
|
"step": 7940
|
|
},
|
|
{
|
|
"entropy": 6.026910591125488,
|
|
"epoch": 0.7016072059342988,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004957627322461448,
|
|
"loss": 5.8544,
|
|
"mean_token_accuracy": 0.16862896382808684,
|
|
"num_tokens": 14839974.0,
|
|
"step": 7945
|
|
},
|
|
{
|
|
"entropy": 6.061175012588501,
|
|
"epoch": 0.7020487460261392,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004957566472529139,
|
|
"loss": 5.954,
|
|
"mean_token_accuracy": 0.15304158478975297,
|
|
"num_tokens": 14849551.0,
|
|
"step": 7950
|
|
},
|
|
{
|
|
"entropy": 6.011351013183594,
|
|
"epoch": 0.7024902861179795,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004957505579351858,
|
|
"loss": 5.862,
|
|
"mean_token_accuracy": 0.16445884555578233,
|
|
"num_tokens": 14859011.0,
|
|
"step": 7955
|
|
},
|
|
{
|
|
"entropy": 5.97165355682373,
|
|
"epoch": 0.7029318262098199,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004957444642930798,
|
|
"loss": 5.8183,
|
|
"mean_token_accuracy": 0.16486039459705354,
|
|
"num_tokens": 14867948.0,
|
|
"step": 7960
|
|
},
|
|
{
|
|
"entropy": 6.02838978767395,
|
|
"epoch": 0.7033733663016601,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004957383663267152,
|
|
"loss": 5.8825,
|
|
"mean_token_accuracy": 0.16174710541963577,
|
|
"num_tokens": 14877233.0,
|
|
"step": 7965
|
|
},
|
|
{
|
|
"entropy": 6.0542638301849365,
|
|
"epoch": 0.7038149063935005,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004957322640362118,
|
|
"loss": 5.8992,
|
|
"mean_token_accuracy": 0.16317424327135086,
|
|
"num_tokens": 14887439.0,
|
|
"step": 7970
|
|
},
|
|
{
|
|
"entropy": 6.137719011306762,
|
|
"epoch": 0.7042564464853409,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004957261574216887,
|
|
"loss": 5.9772,
|
|
"mean_token_accuracy": 0.16205019503831863,
|
|
"num_tokens": 14897064.0,
|
|
"step": 7975
|
|
},
|
|
{
|
|
"entropy": 6.055337810516358,
|
|
"epoch": 0.7046979865771812,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004957200464832656,
|
|
"loss": 5.9176,
|
|
"mean_token_accuracy": 0.15968191921710967,
|
|
"num_tokens": 14907054.0,
|
|
"step": 7980
|
|
},
|
|
{
|
|
"entropy": 5.990302705764771,
|
|
"epoch": 0.7051395266690216,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004957139312210626,
|
|
"loss": 5.8819,
|
|
"mean_token_accuracy": 0.1649075925350189,
|
|
"num_tokens": 14916969.0,
|
|
"step": 7985
|
|
},
|
|
{
|
|
"entropy": 6.02733588218689,
|
|
"epoch": 0.7055810667608619,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.000495707811635199,
|
|
"loss": 5.907,
|
|
"mean_token_accuracy": 0.16429728716611863,
|
|
"num_tokens": 14926404.0,
|
|
"step": 7990
|
|
},
|
|
{
|
|
"entropy": 6.033973550796508,
|
|
"epoch": 0.7060226068527022,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004957016877257949,
|
|
"loss": 5.8792,
|
|
"mean_token_accuracy": 0.1650012642145157,
|
|
"num_tokens": 14935693.0,
|
|
"step": 7995
|
|
},
|
|
{
|
|
"entropy": 5.978080940246582,
|
|
"epoch": 0.7064641469445425,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004956955594929704,
|
|
"loss": 5.8245,
|
|
"mean_token_accuracy": 0.16376616731286048,
|
|
"num_tokens": 14945147.0,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"entropy": 5.948422574996949,
|
|
"epoch": 0.7069056870363829,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004956894269368454,
|
|
"loss": 5.8657,
|
|
"mean_token_accuracy": 0.1638239175081253,
|
|
"num_tokens": 14954022.0,
|
|
"step": 8005
|
|
},
|
|
{
|
|
"entropy": 5.952633428573608,
|
|
"epoch": 0.7073472271282233,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004956832900575399,
|
|
"loss": 5.8716,
|
|
"mean_token_accuracy": 0.16031511574983598,
|
|
"num_tokens": 14962977.0,
|
|
"step": 8010
|
|
},
|
|
{
|
|
"entropy": 6.035693979263305,
|
|
"epoch": 0.7077887672200636,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004956771488551742,
|
|
"loss": 5.8928,
|
|
"mean_token_accuracy": 0.15938186645507812,
|
|
"num_tokens": 14973420.0,
|
|
"step": 8015
|
|
},
|
|
{
|
|
"entropy": 6.132956552505493,
|
|
"epoch": 0.708230307311904,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004956710033298689,
|
|
"loss": 5.8627,
|
|
"mean_token_accuracy": 0.16708310693502426,
|
|
"num_tokens": 14981762.0,
|
|
"step": 8020
|
|
},
|
|
{
|
|
"entropy": 6.101843976974488,
|
|
"epoch": 0.7086718474037442,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.000495664853481744,
|
|
"loss": 5.9503,
|
|
"mean_token_accuracy": 0.16041337698698044,
|
|
"num_tokens": 14991449.0,
|
|
"step": 8025
|
|
},
|
|
{
|
|
"entropy": 6.028578805923462,
|
|
"epoch": 0.7091133874955846,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004956586993109201,
|
|
"loss": 5.8799,
|
|
"mean_token_accuracy": 0.1627985119819641,
|
|
"num_tokens": 15000802.0,
|
|
"step": 8030
|
|
},
|
|
{
|
|
"entropy": 6.014289045333863,
|
|
"epoch": 0.7095549275874249,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004956525408175179,
|
|
"loss": 5.8875,
|
|
"mean_token_accuracy": 0.16235641092061998,
|
|
"num_tokens": 15010580.0,
|
|
"step": 8035
|
|
},
|
|
{
|
|
"entropy": 5.969857788085937,
|
|
"epoch": 0.7099964676792653,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004956463780016578,
|
|
"loss": 5.9324,
|
|
"mean_token_accuracy": 0.15980477035045623,
|
|
"num_tokens": 15020059.0,
|
|
"step": 8040
|
|
},
|
|
{
|
|
"entropy": 6.070868968963623,
|
|
"epoch": 0.7104380077711057,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004956402108634607,
|
|
"loss": 5.8696,
|
|
"mean_token_accuracy": 0.16538775116205215,
|
|
"num_tokens": 15030268.0,
|
|
"step": 8045
|
|
},
|
|
{
|
|
"entropy": 5.997285652160644,
|
|
"epoch": 0.7108795478629459,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004956340394030473,
|
|
"loss": 5.8496,
|
|
"mean_token_accuracy": 0.16604703515768052,
|
|
"num_tokens": 15038865.0,
|
|
"step": 8050
|
|
},
|
|
{
|
|
"entropy": 6.069686794281006,
|
|
"epoch": 0.7113210879547863,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004956278636205386,
|
|
"loss": 5.991,
|
|
"mean_token_accuracy": 0.1484125152230263,
|
|
"num_tokens": 15048427.0,
|
|
"step": 8055
|
|
},
|
|
{
|
|
"entropy": 5.9873466968536375,
|
|
"epoch": 0.7117626280466266,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004956216835160556,
|
|
"loss": 5.8458,
|
|
"mean_token_accuracy": 0.17582572996616364,
|
|
"num_tokens": 15057366.0,
|
|
"step": 8060
|
|
},
|
|
{
|
|
"entropy": 6.044218683242798,
|
|
"epoch": 0.712204168138467,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004956154990897191,
|
|
"loss": 5.8795,
|
|
"mean_token_accuracy": 0.15993637442588807,
|
|
"num_tokens": 15067098.0,
|
|
"step": 8065
|
|
},
|
|
{
|
|
"entropy": 6.051040840148926,
|
|
"epoch": 0.7126457082303073,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004956093103416505,
|
|
"loss": 5.8797,
|
|
"mean_token_accuracy": 0.16278290301561354,
|
|
"num_tokens": 15075734.0,
|
|
"step": 8070
|
|
},
|
|
{
|
|
"entropy": 6.057865238189697,
|
|
"epoch": 0.7130872483221476,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004956031172719711,
|
|
"loss": 5.9255,
|
|
"mean_token_accuracy": 0.16511817574501036,
|
|
"num_tokens": 15084625.0,
|
|
"step": 8075
|
|
},
|
|
{
|
|
"entropy": 5.939253950119019,
|
|
"epoch": 0.713528788413988,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004955969198808021,
|
|
"loss": 5.7378,
|
|
"mean_token_accuracy": 0.18222922831773758,
|
|
"num_tokens": 15093256.0,
|
|
"step": 8080
|
|
},
|
|
{
|
|
"entropy": 5.952800703048706,
|
|
"epoch": 0.7139703285058283,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004955907181682648,
|
|
"loss": 5.8272,
|
|
"mean_token_accuracy": 0.17085774391889572,
|
|
"num_tokens": 15103175.0,
|
|
"step": 8085
|
|
},
|
|
{
|
|
"entropy": 6.015474128723144,
|
|
"epoch": 0.7144118685976687,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004955845121344809,
|
|
"loss": 5.9234,
|
|
"mean_token_accuracy": 0.16088522225618362,
|
|
"num_tokens": 15112789.0,
|
|
"step": 8090
|
|
},
|
|
{
|
|
"entropy": 6.063271713256836,
|
|
"epoch": 0.714853408689509,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004955783017795717,
|
|
"loss": 5.936,
|
|
"mean_token_accuracy": 0.1662163808941841,
|
|
"num_tokens": 15121869.0,
|
|
"step": 8095
|
|
},
|
|
{
|
|
"entropy": 6.000880479812622,
|
|
"epoch": 0.7152949487813494,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000495572087103659,
|
|
"loss": 5.8404,
|
|
"mean_token_accuracy": 0.16656978577375411,
|
|
"num_tokens": 15131441.0,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"entropy": 5.968098402023315,
|
|
"epoch": 0.7157364888731896,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.0004955658681068647,
|
|
"loss": 5.8773,
|
|
"mean_token_accuracy": 0.16107980757951737,
|
|
"num_tokens": 15140641.0,
|
|
"step": 8105
|
|
},
|
|
{
|
|
"entropy": 6.045904302597046,
|
|
"epoch": 0.71617802896503,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004955596447893105,
|
|
"loss": 5.8728,
|
|
"mean_token_accuracy": 0.16488367021083833,
|
|
"num_tokens": 15150073.0,
|
|
"step": 8110
|
|
},
|
|
{
|
|
"entropy": 5.916897344589233,
|
|
"epoch": 0.7166195690568704,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004955534171511181,
|
|
"loss": 5.6291,
|
|
"mean_token_accuracy": 0.18246244937181472,
|
|
"num_tokens": 15159879.0,
|
|
"step": 8115
|
|
},
|
|
{
|
|
"entropy": 5.921168994903565,
|
|
"epoch": 0.7170611091487107,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.00049554718519241,
|
|
"loss": 5.8187,
|
|
"mean_token_accuracy": 0.17096499651670455,
|
|
"num_tokens": 15168316.0,
|
|
"step": 8120
|
|
},
|
|
{
|
|
"entropy": 5.946758127212524,
|
|
"epoch": 0.7175026492405511,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004955409489133078,
|
|
"loss": 5.8865,
|
|
"mean_token_accuracy": 0.1636739045381546,
|
|
"num_tokens": 15176700.0,
|
|
"step": 8125
|
|
},
|
|
{
|
|
"entropy": 6.089166688919067,
|
|
"epoch": 0.7179441893323913,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004955347083139338,
|
|
"loss": 5.8745,
|
|
"mean_token_accuracy": 0.16882388591766356,
|
|
"num_tokens": 15185997.0,
|
|
"step": 8130
|
|
},
|
|
{
|
|
"entropy": 5.963169384002685,
|
|
"epoch": 0.7183857294242317,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004955284633944104,
|
|
"loss": 5.8886,
|
|
"mean_token_accuracy": 0.16364263594150544,
|
|
"num_tokens": 15195341.0,
|
|
"step": 8135
|
|
},
|
|
{
|
|
"entropy": 5.988012361526489,
|
|
"epoch": 0.718827269516072,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004955222141548597,
|
|
"loss": 5.9061,
|
|
"mean_token_accuracy": 0.1646143302321434,
|
|
"num_tokens": 15205501.0,
|
|
"step": 8140
|
|
},
|
|
{
|
|
"entropy": 6.043216848373413,
|
|
"epoch": 0.7192688096079124,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004955159605954043,
|
|
"loss": 5.8431,
|
|
"mean_token_accuracy": 0.1658138707280159,
|
|
"num_tokens": 15214778.0,
|
|
"step": 8145
|
|
},
|
|
{
|
|
"entropy": 6.04915189743042,
|
|
"epoch": 0.7197103496997528,
|
|
"grad_norm": 2.671875,
|
|
"learning_rate": 0.0004955097027161666,
|
|
"loss": 5.8873,
|
|
"mean_token_accuracy": 0.16919119507074357,
|
|
"num_tokens": 15224201.0,
|
|
"step": 8150
|
|
},
|
|
{
|
|
"entropy": 5.954038476943969,
|
|
"epoch": 0.7201518897915931,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004955034405172693,
|
|
"loss": 5.8114,
|
|
"mean_token_accuracy": 0.17064497172832488,
|
|
"num_tokens": 15233472.0,
|
|
"step": 8155
|
|
},
|
|
{
|
|
"entropy": 6.0233845710754395,
|
|
"epoch": 0.7205934298834334,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.000495497173998835,
|
|
"loss": 5.8847,
|
|
"mean_token_accuracy": 0.16386122703552247,
|
|
"num_tokens": 15242323.0,
|
|
"step": 8160
|
|
},
|
|
{
|
|
"entropy": 6.042276906967163,
|
|
"epoch": 0.7210349699752737,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004954909031609864,
|
|
"loss": 5.9032,
|
|
"mean_token_accuracy": 0.1618262857198715,
|
|
"num_tokens": 15252952.0,
|
|
"step": 8165
|
|
},
|
|
{
|
|
"entropy": 6.041112995147705,
|
|
"epoch": 0.7214765100671141,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004954846280038465,
|
|
"loss": 5.8937,
|
|
"mean_token_accuracy": 0.16644167602062226,
|
|
"num_tokens": 15261815.0,
|
|
"step": 8170
|
|
},
|
|
{
|
|
"entropy": 6.043339776992798,
|
|
"epoch": 0.7219180501589544,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004954783485275381,
|
|
"loss": 5.929,
|
|
"mean_token_accuracy": 0.15943233221769332,
|
|
"num_tokens": 15271026.0,
|
|
"step": 8175
|
|
},
|
|
{
|
|
"entropy": 5.986793136596679,
|
|
"epoch": 0.7223595902507948,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004954720647321843,
|
|
"loss": 5.7721,
|
|
"mean_token_accuracy": 0.1739621266722679,
|
|
"num_tokens": 15279795.0,
|
|
"step": 8180
|
|
},
|
|
{
|
|
"entropy": 6.092932939529419,
|
|
"epoch": 0.7228011303426352,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004954657766179081,
|
|
"loss": 5.9041,
|
|
"mean_token_accuracy": 0.1600492849946022,
|
|
"num_tokens": 15289113.0,
|
|
"step": 8185
|
|
},
|
|
{
|
|
"entropy": 5.976881504058838,
|
|
"epoch": 0.7232426704344754,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004954594841848327,
|
|
"loss": 5.767,
|
|
"mean_token_accuracy": 0.17719005048274994,
|
|
"num_tokens": 15298271.0,
|
|
"step": 8190
|
|
},
|
|
{
|
|
"entropy": 6.073885917663574,
|
|
"epoch": 0.7236842105263158,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004954531874330814,
|
|
"loss": 5.8562,
|
|
"mean_token_accuracy": 0.16308169662952424,
|
|
"num_tokens": 15307677.0,
|
|
"step": 8195
|
|
},
|
|
{
|
|
"entropy": 6.005601215362549,
|
|
"epoch": 0.7241257506181561,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0004954468863627774,
|
|
"loss": 5.8728,
|
|
"mean_token_accuracy": 0.16475293338298796,
|
|
"num_tokens": 15316637.0,
|
|
"step": 8200
|
|
},
|
|
{
|
|
"entropy": 5.967337942123413,
|
|
"epoch": 0.7245672907099965,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004954405809740445,
|
|
"loss": 5.8599,
|
|
"mean_token_accuracy": 0.16259262561798096,
|
|
"num_tokens": 15325221.0,
|
|
"step": 8205
|
|
},
|
|
{
|
|
"entropy": 6.023098516464233,
|
|
"epoch": 0.7250088308018368,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004954342712670058,
|
|
"loss": 5.8724,
|
|
"mean_token_accuracy": 0.16482951641082763,
|
|
"num_tokens": 15335877.0,
|
|
"step": 8210
|
|
},
|
|
{
|
|
"entropy": 6.053239488601685,
|
|
"epoch": 0.7254503708936771,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004954279572417852,
|
|
"loss": 5.8843,
|
|
"mean_token_accuracy": 0.16335032880306244,
|
|
"num_tokens": 15345096.0,
|
|
"step": 8215
|
|
},
|
|
{
|
|
"entropy": 5.979609155654908,
|
|
"epoch": 0.7258919109855175,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004954216388985063,
|
|
"loss": 5.7252,
|
|
"mean_token_accuracy": 0.17507985085248948,
|
|
"num_tokens": 15353785.0,
|
|
"step": 8220
|
|
},
|
|
{
|
|
"entropy": 5.9983946800231935,
|
|
"epoch": 0.7263334510773578,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004954153162372928,
|
|
"loss": 5.8308,
|
|
"mean_token_accuracy": 0.1683450922369957,
|
|
"num_tokens": 15362761.0,
|
|
"step": 8225
|
|
},
|
|
{
|
|
"entropy": 6.0386522769927975,
|
|
"epoch": 0.7267749911691982,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004954089892582686,
|
|
"loss": 5.8215,
|
|
"mean_token_accuracy": 0.16770410239696504,
|
|
"num_tokens": 15371951.0,
|
|
"step": 8230
|
|
},
|
|
{
|
|
"entropy": 6.033671808242798,
|
|
"epoch": 0.7272165312610385,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004954026579615577,
|
|
"loss": 5.8935,
|
|
"mean_token_accuracy": 0.1658299371600151,
|
|
"num_tokens": 15380277.0,
|
|
"step": 8235
|
|
},
|
|
{
|
|
"entropy": 5.973099327087402,
|
|
"epoch": 0.7276580713528789,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004953963223472841,
|
|
"loss": 5.8997,
|
|
"mean_token_accuracy": 0.16341323554515838,
|
|
"num_tokens": 15390951.0,
|
|
"step": 8240
|
|
},
|
|
{
|
|
"entropy": 6.110719299316406,
|
|
"epoch": 0.7280996114447191,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004953899824155718,
|
|
"loss": 5.8614,
|
|
"mean_token_accuracy": 0.16662085205316543,
|
|
"num_tokens": 15399394.0,
|
|
"step": 8245
|
|
},
|
|
{
|
|
"entropy": 5.979340410232544,
|
|
"epoch": 0.7285411515365595,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004953836381665452,
|
|
"loss": 5.8164,
|
|
"mean_token_accuracy": 0.16606194376945496,
|
|
"num_tokens": 15408644.0,
|
|
"step": 8250
|
|
},
|
|
{
|
|
"entropy": 6.033501100540161,
|
|
"epoch": 0.7289826916283999,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004953772896003284,
|
|
"loss": 5.8773,
|
|
"mean_token_accuracy": 0.16229377537965775,
|
|
"num_tokens": 15417398.0,
|
|
"step": 8255
|
|
},
|
|
{
|
|
"entropy": 6.058249139785767,
|
|
"epoch": 0.7294242317202402,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004953709367170458,
|
|
"loss": 5.9495,
|
|
"mean_token_accuracy": 0.1523979589343071,
|
|
"num_tokens": 15426532.0,
|
|
"step": 8260
|
|
},
|
|
{
|
|
"entropy": 6.039269495010376,
|
|
"epoch": 0.7298657718120806,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.000495364579516822,
|
|
"loss": 5.9197,
|
|
"mean_token_accuracy": 0.16389914900064467,
|
|
"num_tokens": 15436371.0,
|
|
"step": 8265
|
|
},
|
|
{
|
|
"entropy": 6.098191547393799,
|
|
"epoch": 0.7303073119039208,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004953582179997813,
|
|
"loss": 5.9687,
|
|
"mean_token_accuracy": 0.15681514143943787,
|
|
"num_tokens": 15446600.0,
|
|
"step": 8270
|
|
},
|
|
{
|
|
"entropy": 6.082651376724243,
|
|
"epoch": 0.7307488519957612,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004953518521660484,
|
|
"loss": 5.8208,
|
|
"mean_token_accuracy": 0.16526079773902894,
|
|
"num_tokens": 15455547.0,
|
|
"step": 8275
|
|
},
|
|
{
|
|
"entropy": 6.003137111663818,
|
|
"epoch": 0.7311903920876015,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004953454820157481,
|
|
"loss": 5.8614,
|
|
"mean_token_accuracy": 0.16507752984762192,
|
|
"num_tokens": 15464543.0,
|
|
"step": 8280
|
|
},
|
|
{
|
|
"entropy": 6.047074937820435,
|
|
"epoch": 0.7316319321794419,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004953391075490051,
|
|
"loss": 5.9886,
|
|
"mean_token_accuracy": 0.15476133823394775,
|
|
"num_tokens": 15473626.0,
|
|
"step": 8285
|
|
},
|
|
{
|
|
"entropy": 6.047168159484864,
|
|
"epoch": 0.7320734722712823,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0004953327287659444,
|
|
"loss": 5.9098,
|
|
"mean_token_accuracy": 0.15806199312210084,
|
|
"num_tokens": 15484164.0,
|
|
"step": 8290
|
|
},
|
|
{
|
|
"entropy": 6.163697624206543,
|
|
"epoch": 0.7325150123631226,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0004953263456666907,
|
|
"loss": 5.9146,
|
|
"mean_token_accuracy": 0.15813378542661666,
|
|
"num_tokens": 15493718.0,
|
|
"step": 8295
|
|
},
|
|
{
|
|
"entropy": 6.036547660827637,
|
|
"epoch": 0.7329565524549629,
|
|
"grad_norm": 3.203125,
|
|
"learning_rate": 0.0004953199582513693,
|
|
"loss": 5.8568,
|
|
"mean_token_accuracy": 0.1601982071995735,
|
|
"num_tokens": 15503339.0,
|
|
"step": 8300
|
|
},
|
|
{
|
|
"entropy": 6.029874324798584,
|
|
"epoch": 0.7333980925468032,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004953135665201052,
|
|
"loss": 5.8401,
|
|
"mean_token_accuracy": 0.17256525456905364,
|
|
"num_tokens": 15513314.0,
|
|
"step": 8305
|
|
},
|
|
{
|
|
"entropy": 6.069580602645874,
|
|
"epoch": 0.7338396326386436,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004953071704730237,
|
|
"loss": 5.9203,
|
|
"mean_token_accuracy": 0.15911492258310317,
|
|
"num_tokens": 15522943.0,
|
|
"step": 8310
|
|
},
|
|
{
|
|
"entropy": 6.0191961288452145,
|
|
"epoch": 0.7342811727304839,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004953007701102499,
|
|
"loss": 5.8915,
|
|
"mean_token_accuracy": 0.16012327522039413,
|
|
"num_tokens": 15531975.0,
|
|
"step": 8315
|
|
},
|
|
{
|
|
"entropy": 5.931882619857788,
|
|
"epoch": 0.7347227128223243,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004952943654319094,
|
|
"loss": 5.8235,
|
|
"mean_token_accuracy": 0.17004168182611465,
|
|
"num_tokens": 15541139.0,
|
|
"step": 8320
|
|
},
|
|
{
|
|
"entropy": 5.996765184402466,
|
|
"epoch": 0.7351642529141647,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004952879564381276,
|
|
"loss": 5.9014,
|
|
"mean_token_accuracy": 0.16608232408761978,
|
|
"num_tokens": 15550604.0,
|
|
"step": 8325
|
|
},
|
|
{
|
|
"entropy": 6.023161125183106,
|
|
"epoch": 0.7356057930060049,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004952815431290298,
|
|
"loss": 5.8069,
|
|
"mean_token_accuracy": 0.17140596508979797,
|
|
"num_tokens": 15558995.0,
|
|
"step": 8330
|
|
},
|
|
{
|
|
"entropy": 6.1181238174438475,
|
|
"epoch": 0.7360473330978453,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.000495275125504742,
|
|
"loss": 5.9002,
|
|
"mean_token_accuracy": 0.16549504548311234,
|
|
"num_tokens": 15568229.0,
|
|
"step": 8335
|
|
},
|
|
{
|
|
"entropy": 5.97317328453064,
|
|
"epoch": 0.7364888731896856,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004952687035653899,
|
|
"loss": 5.8214,
|
|
"mean_token_accuracy": 0.167140843719244,
|
|
"num_tokens": 15577042.0,
|
|
"step": 8340
|
|
},
|
|
{
|
|
"entropy": 5.926223802566528,
|
|
"epoch": 0.736930413281526,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004952622773110989,
|
|
"loss": 5.7381,
|
|
"mean_token_accuracy": 0.17849351614713668,
|
|
"num_tokens": 15585825.0,
|
|
"step": 8345
|
|
},
|
|
{
|
|
"entropy": 5.940134286880493,
|
|
"epoch": 0.7373719533733663,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004952558467419953,
|
|
"loss": 5.7658,
|
|
"mean_token_accuracy": 0.17126924693584442,
|
|
"num_tokens": 15595486.0,
|
|
"step": 8350
|
|
},
|
|
{
|
|
"entropy": 5.950185823440552,
|
|
"epoch": 0.7378134934652066,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.000495249411858205,
|
|
"loss": 5.813,
|
|
"mean_token_accuracy": 0.15817922800779344,
|
|
"num_tokens": 15605312.0,
|
|
"step": 8355
|
|
},
|
|
{
|
|
"entropy": 5.981673097610473,
|
|
"epoch": 0.738255033557047,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004952429726598538,
|
|
"loss": 5.8119,
|
|
"mean_token_accuracy": 0.17498987764120102,
|
|
"num_tokens": 15615185.0,
|
|
"step": 8360
|
|
},
|
|
{
|
|
"entropy": 6.078198766708374,
|
|
"epoch": 0.7386965736488873,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004952365291470682,
|
|
"loss": 5.9209,
|
|
"mean_token_accuracy": 0.16283219456672668,
|
|
"num_tokens": 15625259.0,
|
|
"step": 8365
|
|
},
|
|
{
|
|
"entropy": 6.13894739151001,
|
|
"epoch": 0.7391381137407277,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004952300813199742,
|
|
"loss": 5.8934,
|
|
"mean_token_accuracy": 0.16778266727924346,
|
|
"num_tokens": 15635040.0,
|
|
"step": 8370
|
|
},
|
|
{
|
|
"entropy": 6.093637466430664,
|
|
"epoch": 0.739579653832568,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004952236291786981,
|
|
"loss": 5.9225,
|
|
"mean_token_accuracy": 0.15750904381275177,
|
|
"num_tokens": 15644996.0,
|
|
"step": 8375
|
|
},
|
|
{
|
|
"entropy": 5.882752561569214,
|
|
"epoch": 0.7400211939244083,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004952171727233665,
|
|
"loss": 5.8031,
|
|
"mean_token_accuracy": 0.16644213497638702,
|
|
"num_tokens": 15654598.0,
|
|
"step": 8380
|
|
},
|
|
{
|
|
"entropy": 5.981708002090454,
|
|
"epoch": 0.7404627340162486,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004952107119541055,
|
|
"loss": 5.8277,
|
|
"mean_token_accuracy": 0.1631438508629799,
|
|
"num_tokens": 15664213.0,
|
|
"step": 8385
|
|
},
|
|
{
|
|
"entropy": 6.015055465698242,
|
|
"epoch": 0.740904274108089,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004952042468710421,
|
|
"loss": 5.898,
|
|
"mean_token_accuracy": 0.16343553215265275,
|
|
"num_tokens": 15673595.0,
|
|
"step": 8390
|
|
},
|
|
{
|
|
"entropy": 6.0489397048950195,
|
|
"epoch": 0.7413458141999294,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0004951977774743027,
|
|
"loss": 5.9483,
|
|
"mean_token_accuracy": 0.15604591816663743,
|
|
"num_tokens": 15682753.0,
|
|
"step": 8395
|
|
},
|
|
{
|
|
"entropy": 6.007998657226563,
|
|
"epoch": 0.7417873542917697,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004951913037640139,
|
|
"loss": 5.7714,
|
|
"mean_token_accuracy": 0.17713254690170288,
|
|
"num_tokens": 15691508.0,
|
|
"step": 8400
|
|
},
|
|
{
|
|
"entropy": 5.9607525825500485,
|
|
"epoch": 0.7422288943836101,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004951848257403029,
|
|
"loss": 5.7621,
|
|
"mean_token_accuracy": 0.16910529732704163,
|
|
"num_tokens": 15701110.0,
|
|
"step": 8405
|
|
},
|
|
{
|
|
"entropy": 5.835663938522339,
|
|
"epoch": 0.7426704344754503,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004951783434032963,
|
|
"loss": 5.7001,
|
|
"mean_token_accuracy": 0.17653533816337585,
|
|
"num_tokens": 15710565.0,
|
|
"step": 8410
|
|
},
|
|
{
|
|
"entropy": 5.945553207397461,
|
|
"epoch": 0.7431119745672907,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004951718567531211,
|
|
"loss": 5.8933,
|
|
"mean_token_accuracy": 0.16108815222978592,
|
|
"num_tokens": 15718364.0,
|
|
"step": 8415
|
|
},
|
|
{
|
|
"entropy": 6.040232849121094,
|
|
"epoch": 0.743553514659131,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004951653657899045,
|
|
"loss": 5.9572,
|
|
"mean_token_accuracy": 0.15713704228401185,
|
|
"num_tokens": 15729127.0,
|
|
"step": 8420
|
|
},
|
|
{
|
|
"entropy": 6.014114904403686,
|
|
"epoch": 0.7439950547509714,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004951588705137737,
|
|
"loss": 5.9172,
|
|
"mean_token_accuracy": 0.160747928917408,
|
|
"num_tokens": 15738491.0,
|
|
"step": 8425
|
|
},
|
|
{
|
|
"entropy": 6.051564168930054,
|
|
"epoch": 0.7444365948428118,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004951523709248557,
|
|
"loss": 5.812,
|
|
"mean_token_accuracy": 0.17417716681957246,
|
|
"num_tokens": 15748346.0,
|
|
"step": 8430
|
|
},
|
|
{
|
|
"entropy": 5.936352062225342,
|
|
"epoch": 0.744878134934652,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.000495145867023278,
|
|
"loss": 5.8178,
|
|
"mean_token_accuracy": 0.17726168632507325,
|
|
"num_tokens": 15757242.0,
|
|
"step": 8435
|
|
},
|
|
{
|
|
"entropy": 6.062502765655518,
|
|
"epoch": 0.7453196750264924,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.000495139358809168,
|
|
"loss": 5.8817,
|
|
"mean_token_accuracy": 0.16312602609395982,
|
|
"num_tokens": 15767236.0,
|
|
"step": 8440
|
|
},
|
|
{
|
|
"entropy": 6.0330291271209715,
|
|
"epoch": 0.7457612151183327,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004951328462826532,
|
|
"loss": 5.917,
|
|
"mean_token_accuracy": 0.15931661278009415,
|
|
"num_tokens": 15776977.0,
|
|
"step": 8445
|
|
},
|
|
{
|
|
"entropy": 5.967469167709351,
|
|
"epoch": 0.7462027552101731,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004951263294438611,
|
|
"loss": 5.8146,
|
|
"mean_token_accuracy": 0.16552847474813462,
|
|
"num_tokens": 15787333.0,
|
|
"step": 8450
|
|
},
|
|
{
|
|
"entropy": 5.8979497909545895,
|
|
"epoch": 0.7466442953020134,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004951198082929194,
|
|
"loss": 5.6991,
|
|
"mean_token_accuracy": 0.182376991212368,
|
|
"num_tokens": 15794773.0,
|
|
"step": 8455
|
|
},
|
|
{
|
|
"entropy": 5.887519073486328,
|
|
"epoch": 0.7470858353938538,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004951132828299558,
|
|
"loss": 5.8119,
|
|
"mean_token_accuracy": 0.17694963216781617,
|
|
"num_tokens": 15803803.0,
|
|
"step": 8460
|
|
},
|
|
{
|
|
"entropy": 6.003292751312256,
|
|
"epoch": 0.7475273754856941,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004951067530550982,
|
|
"loss": 5.8686,
|
|
"mean_token_accuracy": 0.1630527436733246,
|
|
"num_tokens": 15814021.0,
|
|
"step": 8465
|
|
},
|
|
{
|
|
"entropy": 6.018431043624878,
|
|
"epoch": 0.7479689155775344,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004951002189684745,
|
|
"loss": 5.847,
|
|
"mean_token_accuracy": 0.1591825008392334,
|
|
"num_tokens": 15822974.0,
|
|
"step": 8470
|
|
},
|
|
{
|
|
"entropy": 5.89467420578003,
|
|
"epoch": 0.7484104556693748,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004950936805702127,
|
|
"loss": 5.7536,
|
|
"mean_token_accuracy": 0.17445838451385498,
|
|
"num_tokens": 15832202.0,
|
|
"step": 8475
|
|
},
|
|
{
|
|
"entropy": 6.020446825027466,
|
|
"epoch": 0.7488519957612151,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.0004950871378604409,
|
|
"loss": 5.8714,
|
|
"mean_token_accuracy": 0.16651229858398436,
|
|
"num_tokens": 15841223.0,
|
|
"step": 8480
|
|
},
|
|
{
|
|
"entropy": 6.053658437728882,
|
|
"epoch": 0.7492935358530555,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004950805908392872,
|
|
"loss": 5.8103,
|
|
"mean_token_accuracy": 0.1726827636361122,
|
|
"num_tokens": 15850260.0,
|
|
"step": 8485
|
|
},
|
|
{
|
|
"entropy": 5.962708044052124,
|
|
"epoch": 0.7497350759448957,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004950740395068799,
|
|
"loss": 5.9007,
|
|
"mean_token_accuracy": 0.16698147505521774,
|
|
"num_tokens": 15859780.0,
|
|
"step": 8490
|
|
},
|
|
{
|
|
"entropy": 5.943686819076538,
|
|
"epoch": 0.7501766160367361,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004950674838633473,
|
|
"loss": 5.8261,
|
|
"mean_token_accuracy": 0.16707435846328736,
|
|
"num_tokens": 15868331.0,
|
|
"step": 8495
|
|
},
|
|
{
|
|
"entropy": 6.012507152557373,
|
|
"epoch": 0.7506181561285765,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004950609239088178,
|
|
"loss": 5.8834,
|
|
"mean_token_accuracy": 0.16294038891792298,
|
|
"num_tokens": 15877359.0,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"entropy": 6.04980845451355,
|
|
"epoch": 0.7510596962204168,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004950543596434201,
|
|
"loss": 5.7905,
|
|
"mean_token_accuracy": 0.1808181643486023,
|
|
"num_tokens": 15886976.0,
|
|
"step": 8505
|
|
},
|
|
{
|
|
"entropy": 5.95252537727356,
|
|
"epoch": 0.7515012363122572,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004950477910672825,
|
|
"loss": 5.8655,
|
|
"mean_token_accuracy": 0.1658195436000824,
|
|
"num_tokens": 15896139.0,
|
|
"step": 8510
|
|
},
|
|
{
|
|
"entropy": 5.946612977981568,
|
|
"epoch": 0.7519427764040975,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.000495041218180534,
|
|
"loss": 5.8504,
|
|
"mean_token_accuracy": 0.17413663417100905,
|
|
"num_tokens": 15904672.0,
|
|
"step": 8515
|
|
},
|
|
{
|
|
"entropy": 5.9041801452636715,
|
|
"epoch": 0.7523843164959378,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.000495034640983303,
|
|
"loss": 5.7024,
|
|
"mean_token_accuracy": 0.176799039542675,
|
|
"num_tokens": 15912716.0,
|
|
"step": 8520
|
|
},
|
|
{
|
|
"entropy": 6.003086757659912,
|
|
"epoch": 0.7528258565877781,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004950280594757186,
|
|
"loss": 5.8215,
|
|
"mean_token_accuracy": 0.16853666454553604,
|
|
"num_tokens": 15921974.0,
|
|
"step": 8525
|
|
},
|
|
{
|
|
"entropy": 6.049868392944336,
|
|
"epoch": 0.7532673966796185,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004950214736579097,
|
|
"loss": 5.8433,
|
|
"mean_token_accuracy": 0.16325998157262803,
|
|
"num_tokens": 15930859.0,
|
|
"step": 8530
|
|
},
|
|
{
|
|
"entropy": 5.95236120223999,
|
|
"epoch": 0.7537089367714589,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004950148835300051,
|
|
"loss": 5.7595,
|
|
"mean_token_accuracy": 0.1746041879057884,
|
|
"num_tokens": 15940052.0,
|
|
"step": 8535
|
|
},
|
|
{
|
|
"entropy": 5.976876878738404,
|
|
"epoch": 0.7541504768632992,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004950082890921341,
|
|
"loss": 5.833,
|
|
"mean_token_accuracy": 0.16658651530742646,
|
|
"num_tokens": 15948422.0,
|
|
"step": 8540
|
|
},
|
|
{
|
|
"entropy": 5.991146755218506,
|
|
"epoch": 0.7545920169551396,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004950016903444261,
|
|
"loss": 5.9212,
|
|
"mean_token_accuracy": 0.16847308576107026,
|
|
"num_tokens": 15957433.0,
|
|
"step": 8545
|
|
},
|
|
{
|
|
"entropy": 6.095296812057495,
|
|
"epoch": 0.7550335570469798,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004949950872870099,
|
|
"loss": 5.8565,
|
|
"mean_token_accuracy": 0.16426337659358978,
|
|
"num_tokens": 15966542.0,
|
|
"step": 8550
|
|
},
|
|
{
|
|
"entropy": 6.052794790267944,
|
|
"epoch": 0.7554750971388202,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.000494988479920015,
|
|
"loss": 5.9687,
|
|
"mean_token_accuracy": 0.15761574506759643,
|
|
"num_tokens": 15977067.0,
|
|
"step": 8555
|
|
},
|
|
{
|
|
"entropy": 6.062600803375244,
|
|
"epoch": 0.7559166372306605,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.000494981868243571,
|
|
"loss": 5.8892,
|
|
"mean_token_accuracy": 0.16307247430086136,
|
|
"num_tokens": 15986502.0,
|
|
"step": 8560
|
|
},
|
|
{
|
|
"entropy": 6.134724473953247,
|
|
"epoch": 0.7563581773225009,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004949752522578073,
|
|
"loss": 5.9426,
|
|
"mean_token_accuracy": 0.15969116240739822,
|
|
"num_tokens": 15996788.0,
|
|
"step": 8565
|
|
},
|
|
{
|
|
"entropy": 6.0735067367553714,
|
|
"epoch": 0.7567997174143413,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004949686319628535,
|
|
"loss": 5.846,
|
|
"mean_token_accuracy": 0.16205350160598755,
|
|
"num_tokens": 16007212.0,
|
|
"step": 8570
|
|
},
|
|
{
|
|
"entropy": 5.9686089038848875,
|
|
"epoch": 0.7572412575061815,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004949620073588394,
|
|
"loss": 5.8694,
|
|
"mean_token_accuracy": 0.16372007131576538,
|
|
"num_tokens": 16017000.0,
|
|
"step": 8575
|
|
},
|
|
{
|
|
"entropy": 6.056089782714844,
|
|
"epoch": 0.7576827975980219,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004949553784458945,
|
|
"loss": 5.9647,
|
|
"mean_token_accuracy": 0.15608513206243516,
|
|
"num_tokens": 16026647.0,
|
|
"step": 8580
|
|
},
|
|
{
|
|
"entropy": 5.986236476898194,
|
|
"epoch": 0.7581243376898622,
|
|
"grad_norm": 2.78125,
|
|
"learning_rate": 0.0004949487452241489,
|
|
"loss": 5.6913,
|
|
"mean_token_accuracy": 0.1788015127182007,
|
|
"num_tokens": 16034773.0,
|
|
"step": 8585
|
|
},
|
|
{
|
|
"entropy": 6.041558456420899,
|
|
"epoch": 0.7585658777817026,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004949421076937325,
|
|
"loss": 5.8507,
|
|
"mean_token_accuracy": 0.1660602033138275,
|
|
"num_tokens": 16043787.0,
|
|
"step": 8590
|
|
},
|
|
{
|
|
"entropy": 5.984687471389771,
|
|
"epoch": 0.7590074178735429,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004949354658547753,
|
|
"loss": 5.8063,
|
|
"mean_token_accuracy": 0.17274508327245713,
|
|
"num_tokens": 16053130.0,
|
|
"step": 8595
|
|
},
|
|
{
|
|
"entropy": 5.994377183914184,
|
|
"epoch": 0.7594489579653833,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004949288197074074,
|
|
"loss": 5.9551,
|
|
"mean_token_accuracy": 0.15927992463111879,
|
|
"num_tokens": 16062529.0,
|
|
"step": 8600
|
|
},
|
|
{
|
|
"entropy": 5.955754947662354,
|
|
"epoch": 0.7598904980572236,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004949221692517589,
|
|
"loss": 5.8028,
|
|
"mean_token_accuracy": 0.16142662912607192,
|
|
"num_tokens": 16071932.0,
|
|
"step": 8605
|
|
},
|
|
{
|
|
"entropy": 6.0386292934417725,
|
|
"epoch": 0.7603320381490639,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004949155144879602,
|
|
"loss": 5.7827,
|
|
"mean_token_accuracy": 0.17397554367780685,
|
|
"num_tokens": 16081077.0,
|
|
"step": 8610
|
|
},
|
|
{
|
|
"entropy": 5.993456697463989,
|
|
"epoch": 0.7607735782409043,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004949088554161417,
|
|
"loss": 5.7304,
|
|
"mean_token_accuracy": 0.17781465798616408,
|
|
"num_tokens": 16089725.0,
|
|
"step": 8615
|
|
},
|
|
{
|
|
"entropy": 6.007733106613159,
|
|
"epoch": 0.7612151183327446,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004949021920364339,
|
|
"loss": 5.8982,
|
|
"mean_token_accuracy": 0.1661226361989975,
|
|
"num_tokens": 16099362.0,
|
|
"step": 8620
|
|
},
|
|
{
|
|
"entropy": 6.0902595043182375,
|
|
"epoch": 0.761656658424585,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004948955243489671,
|
|
"loss": 5.917,
|
|
"mean_token_accuracy": 0.15821752846240997,
|
|
"num_tokens": 16108746.0,
|
|
"step": 8625
|
|
},
|
|
{
|
|
"entropy": 6.114125061035156,
|
|
"epoch": 0.7620981985164252,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004948888523538721,
|
|
"loss": 6.0579,
|
|
"mean_token_accuracy": 0.15333025604486467,
|
|
"num_tokens": 16118902.0,
|
|
"step": 8630
|
|
},
|
|
{
|
|
"entropy": 5.9824565887451175,
|
|
"epoch": 0.7625397386082656,
|
|
"grad_norm": 2.46875,
|
|
"learning_rate": 0.0004948821760512795,
|
|
"loss": 5.7935,
|
|
"mean_token_accuracy": 0.1688033312559128,
|
|
"num_tokens": 16127781.0,
|
|
"step": 8635
|
|
},
|
|
{
|
|
"entropy": 5.991695070266724,
|
|
"epoch": 0.762981278700106,
|
|
"grad_norm": 2.8125,
|
|
"learning_rate": 0.0004948754954413201,
|
|
"loss": 5.7665,
|
|
"mean_token_accuracy": 0.1775361105799675,
|
|
"num_tokens": 16136004.0,
|
|
"step": 8640
|
|
},
|
|
{
|
|
"entropy": 5.9446573734283445,
|
|
"epoch": 0.7634228187919463,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004948688105241248,
|
|
"loss": 5.8371,
|
|
"mean_token_accuracy": 0.1675607979297638,
|
|
"num_tokens": 16145023.0,
|
|
"step": 8645
|
|
},
|
|
{
|
|
"entropy": 5.97957935333252,
|
|
"epoch": 0.7638643588837867,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004948621212998246,
|
|
"loss": 5.8866,
|
|
"mean_token_accuracy": 0.1627459466457367,
|
|
"num_tokens": 16154242.0,
|
|
"step": 8650
|
|
},
|
|
{
|
|
"entropy": 6.039345216751099,
|
|
"epoch": 0.764305898975627,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004948554277685505,
|
|
"loss": 5.8596,
|
|
"mean_token_accuracy": 0.1614910364151001,
|
|
"num_tokens": 16163252.0,
|
|
"step": 8655
|
|
},
|
|
{
|
|
"entropy": 6.080992984771728,
|
|
"epoch": 0.7647474390674673,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004948487299304335,
|
|
"loss": 5.7838,
|
|
"mean_token_accuracy": 0.1723628520965576,
|
|
"num_tokens": 16171363.0,
|
|
"step": 8660
|
|
},
|
|
{
|
|
"entropy": 6.038664770126343,
|
|
"epoch": 0.7651889791593076,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.000494842027785605,
|
|
"loss": 5.9707,
|
|
"mean_token_accuracy": 0.16340688914060592,
|
|
"num_tokens": 16181234.0,
|
|
"step": 8665
|
|
},
|
|
{
|
|
"entropy": 6.008198308944702,
|
|
"epoch": 0.765630519251148,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004948353213341961,
|
|
"loss": 5.8662,
|
|
"mean_token_accuracy": 0.16622024774551392,
|
|
"num_tokens": 16189420.0,
|
|
"step": 8670
|
|
},
|
|
{
|
|
"entropy": 6.095108985900879,
|
|
"epoch": 0.7660720593429884,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004948286105763385,
|
|
"loss": 6.0349,
|
|
"mean_token_accuracy": 0.14946414679288864,
|
|
"num_tokens": 16199202.0,
|
|
"step": 8675
|
|
},
|
|
{
|
|
"entropy": 6.088163900375366,
|
|
"epoch": 0.7665135994348287,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004948218955121632,
|
|
"loss": 5.8595,
|
|
"mean_token_accuracy": 0.16536297798156738,
|
|
"num_tokens": 16208481.0,
|
|
"step": 8680
|
|
},
|
|
{
|
|
"entropy": 6.020900392532349,
|
|
"epoch": 0.766955139526669,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.0004948151761418021,
|
|
"loss": 5.928,
|
|
"mean_token_accuracy": 0.16450031846761703,
|
|
"num_tokens": 16217282.0,
|
|
"step": 8685
|
|
},
|
|
{
|
|
"entropy": 6.010745429992676,
|
|
"epoch": 0.7673966796185093,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004948084524653868,
|
|
"loss": 5.786,
|
|
"mean_token_accuracy": 0.17689766138792037,
|
|
"num_tokens": 16226567.0,
|
|
"step": 8690
|
|
},
|
|
{
|
|
"entropy": 6.038272476196289,
|
|
"epoch": 0.7678382197103497,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0004948017244830489,
|
|
"loss": 5.9912,
|
|
"mean_token_accuracy": 0.1531811460852623,
|
|
"num_tokens": 16236255.0,
|
|
"step": 8695
|
|
},
|
|
{
|
|
"entropy": 6.012173700332641,
|
|
"epoch": 0.76827975980219,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0004947949921949201,
|
|
"loss": 5.7611,
|
|
"mean_token_accuracy": 0.18096890598535537,
|
|
"num_tokens": 16244838.0,
|
|
"step": 8700
|
|
},
|
|
{
|
|
"entropy": 5.877737808227539,
|
|
"epoch": 0.7687212998940304,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004947882556011325,
|
|
"loss": 5.6674,
|
|
"mean_token_accuracy": 0.17592710703611375,
|
|
"num_tokens": 16254101.0,
|
|
"step": 8705
|
|
},
|
|
{
|
|
"entropy": 5.790978288650512,
|
|
"epoch": 0.7691628399858708,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.000494781514701818,
|
|
"loss": 5.7549,
|
|
"mean_token_accuracy": 0.17472999691963195,
|
|
"num_tokens": 16263496.0,
|
|
"step": 8710
|
|
},
|
|
{
|
|
"entropy": 6.015494537353516,
|
|
"epoch": 0.769604380077711,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004947747694971085,
|
|
"loss": 5.8577,
|
|
"mean_token_accuracy": 0.17157624363899232,
|
|
"num_tokens": 16271884.0,
|
|
"step": 8715
|
|
},
|
|
{
|
|
"entropy": 6.020488214492798,
|
|
"epoch": 0.7700459201695514,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004947680199871363,
|
|
"loss": 5.7849,
|
|
"mean_token_accuracy": 0.17175298631191255,
|
|
"num_tokens": 16281197.0,
|
|
"step": 8720
|
|
},
|
|
{
|
|
"entropy": 5.941747856140137,
|
|
"epoch": 0.7704874602613917,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004947612661720336,
|
|
"loss": 5.8585,
|
|
"mean_token_accuracy": 0.16881806254386902,
|
|
"num_tokens": 16289908.0,
|
|
"step": 8725
|
|
},
|
|
{
|
|
"entropy": 5.970186948776245,
|
|
"epoch": 0.7709290003532321,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004947545080519327,
|
|
"loss": 5.8431,
|
|
"mean_token_accuracy": 0.16303220093250276,
|
|
"num_tokens": 16299237.0,
|
|
"step": 8730
|
|
},
|
|
{
|
|
"entropy": 6.07957649230957,
|
|
"epoch": 0.7713705404450724,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.000494747745626966,
|
|
"loss": 5.8625,
|
|
"mean_token_accuracy": 0.17015912532806396,
|
|
"num_tokens": 16308175.0,
|
|
"step": 8735
|
|
},
|
|
{
|
|
"entropy": 5.939646482467651,
|
|
"epoch": 0.7718120805369127,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0004947409788972659,
|
|
"loss": 5.8134,
|
|
"mean_token_accuracy": 0.17128169685602188,
|
|
"num_tokens": 16317121.0,
|
|
"step": 8740
|
|
},
|
|
{
|
|
"entropy": 5.888267755508423,
|
|
"epoch": 0.7722536206287531,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004947342078629649,
|
|
"loss": 5.7836,
|
|
"mean_token_accuracy": 0.1751389503479004,
|
|
"num_tokens": 16325764.0,
|
|
"step": 8745
|
|
},
|
|
{
|
|
"entropy": 6.0162074089050295,
|
|
"epoch": 0.7726951607205934,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004947274325241959,
|
|
"loss": 5.7971,
|
|
"mean_token_accuracy": 0.1663631409406662,
|
|
"num_tokens": 16333823.0,
|
|
"step": 8750
|
|
},
|
|
{
|
|
"entropy": 6.007051038742065,
|
|
"epoch": 0.7731367008124338,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004947206528810913,
|
|
"loss": 5.7953,
|
|
"mean_token_accuracy": 0.1734223335981369,
|
|
"num_tokens": 16341918.0,
|
|
"step": 8755
|
|
},
|
|
{
|
|
"entropy": 5.964048862457275,
|
|
"epoch": 0.7735782409042741,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004947138689337843,
|
|
"loss": 5.8454,
|
|
"mean_token_accuracy": 0.1691407859325409,
|
|
"num_tokens": 16351904.0,
|
|
"step": 8760
|
|
},
|
|
{
|
|
"entropy": 6.049382305145263,
|
|
"epoch": 0.7740197809961145,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 0.0004947070806824074,
|
|
"loss": 5.863,
|
|
"mean_token_accuracy": 0.16452886164188385,
|
|
"num_tokens": 16360921.0,
|
|
"step": 8765
|
|
},
|
|
{
|
|
"entropy": 6.076323413848877,
|
|
"epoch": 0.7744613210879547,
|
|
"grad_norm": 3.171875,
|
|
"learning_rate": 0.0004947002881270937,
|
|
"loss": 5.882,
|
|
"mean_token_accuracy": 0.16276332437992097,
|
|
"num_tokens": 16370929.0,
|
|
"step": 8770
|
|
},
|
|
{
|
|
"entropy": 6.061858701705932,
|
|
"epoch": 0.7749028611797951,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004946934912679764,
|
|
"loss": 5.8834,
|
|
"mean_token_accuracy": 0.1623873770236969,
|
|
"num_tokens": 16380099.0,
|
|
"step": 8775
|
|
},
|
|
{
|
|
"entropy": 6.058119440078736,
|
|
"epoch": 0.7753444012716355,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004946866901051885,
|
|
"loss": 5.8984,
|
|
"mean_token_accuracy": 0.166536608338356,
|
|
"num_tokens": 16389109.0,
|
|
"step": 8780
|
|
},
|
|
{
|
|
"entropy": 5.995142030715942,
|
|
"epoch": 0.7757859413634758,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004946798846388634,
|
|
"loss": 5.9407,
|
|
"mean_token_accuracy": 0.1601017102599144,
|
|
"num_tokens": 16398601.0,
|
|
"step": 8785
|
|
},
|
|
{
|
|
"entropy": 5.908467817306518,
|
|
"epoch": 0.7762274814553162,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0004946730748691342,
|
|
"loss": 5.804,
|
|
"mean_token_accuracy": 0.17755288928747176,
|
|
"num_tokens": 16406950.0,
|
|
"step": 8790
|
|
},
|
|
{
|
|
"entropy": 6.086664867401123,
|
|
"epoch": 0.7766690215471564,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004946662607961344,
|
|
"loss": 5.812,
|
|
"mean_token_accuracy": 0.16931003481149673,
|
|
"num_tokens": 16414680.0,
|
|
"step": 8795
|
|
},
|
|
{
|
|
"entropy": 6.029450368881226,
|
|
"epoch": 0.7771105616389968,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004946594424199976,
|
|
"loss": 5.8757,
|
|
"mean_token_accuracy": 0.16709612607955932,
|
|
"num_tokens": 16424060.0,
|
|
"step": 8800
|
|
},
|
|
{
|
|
"entropy": 6.009132289886475,
|
|
"epoch": 0.7775521017308371,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004946526197408571,
|
|
"loss": 5.8871,
|
|
"mean_token_accuracy": 0.16631227284669875,
|
|
"num_tokens": 16433691.0,
|
|
"step": 8805
|
|
},
|
|
{
|
|
"entropy": 6.034335660934448,
|
|
"epoch": 0.7779936418226775,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004946457927588467,
|
|
"loss": 5.9038,
|
|
"mean_token_accuracy": 0.16435332745313644,
|
|
"num_tokens": 16443234.0,
|
|
"step": 8810
|
|
},
|
|
{
|
|
"entropy": 5.976377153396607,
|
|
"epoch": 0.7784351819145179,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004946389614741002,
|
|
"loss": 5.828,
|
|
"mean_token_accuracy": 0.17055855989456176,
|
|
"num_tokens": 16452084.0,
|
|
"step": 8815
|
|
},
|
|
{
|
|
"entropy": 6.041907787322998,
|
|
"epoch": 0.7788767220063582,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004946321258867513,
|
|
"loss": 5.8151,
|
|
"mean_token_accuracy": 0.16775723546743393,
|
|
"num_tokens": 16460135.0,
|
|
"step": 8820
|
|
},
|
|
{
|
|
"entropy": 6.004779911041259,
|
|
"epoch": 0.7793182620981985,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.000494625285996934,
|
|
"loss": 5.9481,
|
|
"mean_token_accuracy": 0.16491457521915437,
|
|
"num_tokens": 16469919.0,
|
|
"step": 8825
|
|
},
|
|
{
|
|
"entropy": 5.999904203414917,
|
|
"epoch": 0.7797598021900388,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004946184418047823,
|
|
"loss": 5.8509,
|
|
"mean_token_accuracy": 0.16316285580396653,
|
|
"num_tokens": 16478473.0,
|
|
"step": 8830
|
|
},
|
|
{
|
|
"entropy": 6.058293390274048,
|
|
"epoch": 0.7802013422818792,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004946115933104301,
|
|
"loss": 5.8679,
|
|
"mean_token_accuracy": 0.15920920595526694,
|
|
"num_tokens": 16486878.0,
|
|
"step": 8835
|
|
},
|
|
{
|
|
"entropy": 6.025899887084961,
|
|
"epoch": 0.7806428823737195,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004946047405140119,
|
|
"loss": 5.901,
|
|
"mean_token_accuracy": 0.16094491481781006,
|
|
"num_tokens": 16496798.0,
|
|
"step": 8840
|
|
},
|
|
{
|
|
"entropy": 6.0621325969696045,
|
|
"epoch": 0.7810844224655599,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004945978834156616,
|
|
"loss": 5.8335,
|
|
"mean_token_accuracy": 0.16948919147253036,
|
|
"num_tokens": 16506508.0,
|
|
"step": 8845
|
|
},
|
|
{
|
|
"entropy": 5.898591375350952,
|
|
"epoch": 0.7815259625574003,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004945910220155136,
|
|
"loss": 5.8036,
|
|
"mean_token_accuracy": 0.1735559344291687,
|
|
"num_tokens": 16514979.0,
|
|
"step": 8850
|
|
},
|
|
{
|
|
"entropy": 6.0083986759185795,
|
|
"epoch": 0.7819675026492405,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004945841563137025,
|
|
"loss": 5.8737,
|
|
"mean_token_accuracy": 0.1645631179213524,
|
|
"num_tokens": 16524654.0,
|
|
"step": 8855
|
|
},
|
|
{
|
|
"entropy": 6.1006495475769045,
|
|
"epoch": 0.7824090427410809,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004945772863103626,
|
|
"loss": 5.9451,
|
|
"mean_token_accuracy": 0.15826726704835892,
|
|
"num_tokens": 16534486.0,
|
|
"step": 8860
|
|
},
|
|
{
|
|
"entropy": 6.079514646530152,
|
|
"epoch": 0.7828505828329212,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004945704120056286,
|
|
"loss": 5.8814,
|
|
"mean_token_accuracy": 0.17055715173482894,
|
|
"num_tokens": 16543718.0,
|
|
"step": 8865
|
|
},
|
|
{
|
|
"entropy": 5.943536615371704,
|
|
"epoch": 0.7832921229247616,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.000494563533399635,
|
|
"loss": 5.8278,
|
|
"mean_token_accuracy": 0.16279819309711457,
|
|
"num_tokens": 16553159.0,
|
|
"step": 8870
|
|
},
|
|
{
|
|
"entropy": 6.071165704727173,
|
|
"epoch": 0.7837336630166019,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004945566504925167,
|
|
"loss": 5.8979,
|
|
"mean_token_accuracy": 0.16369519382715225,
|
|
"num_tokens": 16562398.0,
|
|
"step": 8875
|
|
},
|
|
{
|
|
"entropy": 6.046672582626343,
|
|
"epoch": 0.7841752031084422,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004945497632844084,
|
|
"loss": 5.8434,
|
|
"mean_token_accuracy": 0.1709023430943489,
|
|
"num_tokens": 16571530.0,
|
|
"step": 8880
|
|
},
|
|
{
|
|
"entropy": 5.891947078704834,
|
|
"epoch": 0.7846167432002826,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004945428717754453,
|
|
"loss": 5.8451,
|
|
"mean_token_accuracy": 0.16528365015983582,
|
|
"num_tokens": 16581074.0,
|
|
"step": 8885
|
|
},
|
|
{
|
|
"entropy": 5.903192663192749,
|
|
"epoch": 0.7850582832921229,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000494535975965762,
|
|
"loss": 5.7092,
|
|
"mean_token_accuracy": 0.17383420020341872,
|
|
"num_tokens": 16589574.0,
|
|
"step": 8890
|
|
},
|
|
{
|
|
"entropy": 5.908491230010986,
|
|
"epoch": 0.7854998233839633,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004945290758554939,
|
|
"loss": 5.7966,
|
|
"mean_token_accuracy": 0.17097795158624648,
|
|
"num_tokens": 16599623.0,
|
|
"step": 8895
|
|
},
|
|
{
|
|
"entropy": 5.951535558700561,
|
|
"epoch": 0.7859413634758036,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.000494522171444776,
|
|
"loss": 5.8771,
|
|
"mean_token_accuracy": 0.1554115429520607,
|
|
"num_tokens": 16608559.0,
|
|
"step": 8900
|
|
},
|
|
{
|
|
"entropy": 6.007721900939941,
|
|
"epoch": 0.786382903567644,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004945152627337437,
|
|
"loss": 5.7946,
|
|
"mean_token_accuracy": 0.17543937712907792,
|
|
"num_tokens": 16617148.0,
|
|
"step": 8905
|
|
},
|
|
{
|
|
"entropy": 5.975952243804931,
|
|
"epoch": 0.7868244436594842,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004945083497225321,
|
|
"loss": 5.8109,
|
|
"mean_token_accuracy": 0.16906408369541168,
|
|
"num_tokens": 16626405.0,
|
|
"step": 8910
|
|
},
|
|
{
|
|
"entropy": 6.001080751419067,
|
|
"epoch": 0.7872659837513246,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.0004945014324112769,
|
|
"loss": 5.833,
|
|
"mean_token_accuracy": 0.17065125554800034,
|
|
"num_tokens": 16635116.0,
|
|
"step": 8915
|
|
},
|
|
{
|
|
"entropy": 5.936459684371949,
|
|
"epoch": 0.787707523843165,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004944945108001134,
|
|
"loss": 5.8052,
|
|
"mean_token_accuracy": 0.1653392180800438,
|
|
"num_tokens": 16644381.0,
|
|
"step": 8920
|
|
},
|
|
{
|
|
"entropy": 5.9898097038269045,
|
|
"epoch": 0.7881490639350053,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.0004944875848891773,
|
|
"loss": 5.8612,
|
|
"mean_token_accuracy": 0.1636720508337021,
|
|
"num_tokens": 16654275.0,
|
|
"step": 8925
|
|
},
|
|
{
|
|
"entropy": 6.067295503616333,
|
|
"epoch": 0.7885906040268457,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004944806546786042,
|
|
"loss": 5.9303,
|
|
"mean_token_accuracy": 0.1632734939455986,
|
|
"num_tokens": 16663484.0,
|
|
"step": 8930
|
|
},
|
|
{
|
|
"entropy": 6.037387943267822,
|
|
"epoch": 0.7890321441186859,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.00049447372016853,
|
|
"loss": 5.8665,
|
|
"mean_token_accuracy": 0.17046364545822143,
|
|
"num_tokens": 16672490.0,
|
|
"step": 8935
|
|
},
|
|
{
|
|
"entropy": 6.083343410491944,
|
|
"epoch": 0.7894736842105263,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004944667813590904,
|
|
"loss": 5.8409,
|
|
"mean_token_accuracy": 0.165067557990551,
|
|
"num_tokens": 16681242.0,
|
|
"step": 8940
|
|
},
|
|
{
|
|
"entropy": 5.971105098724365,
|
|
"epoch": 0.7899152243023666,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004944598382504214,
|
|
"loss": 5.7775,
|
|
"mean_token_accuracy": 0.16900418400764466,
|
|
"num_tokens": 16690359.0,
|
|
"step": 8945
|
|
},
|
|
{
|
|
"entropy": 5.975606966018677,
|
|
"epoch": 0.790356764394207,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004944528908426589,
|
|
"loss": 5.8643,
|
|
"mean_token_accuracy": 0.1650958314538002,
|
|
"num_tokens": 16700774.0,
|
|
"step": 8950
|
|
},
|
|
{
|
|
"entropy": 6.042025375366211,
|
|
"epoch": 0.7907983044860474,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004944459391359391,
|
|
"loss": 5.9374,
|
|
"mean_token_accuracy": 0.16598828583955766,
|
|
"num_tokens": 16710513.0,
|
|
"step": 8955
|
|
},
|
|
{
|
|
"entropy": 5.985045576095581,
|
|
"epoch": 0.7912398445778877,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004944389831303982,
|
|
"loss": 5.8374,
|
|
"mean_token_accuracy": 0.16597743779420854,
|
|
"num_tokens": 16720785.0,
|
|
"step": 8960
|
|
},
|
|
{
|
|
"entropy": 6.096461009979248,
|
|
"epoch": 0.791681384669728,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004944320228261723,
|
|
"loss": 5.8489,
|
|
"mean_token_accuracy": 0.16366840451955794,
|
|
"num_tokens": 16729083.0,
|
|
"step": 8965
|
|
},
|
|
{
|
|
"entropy": 5.978036975860595,
|
|
"epoch": 0.7921229247615683,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.000494425058223398,
|
|
"loss": 5.8412,
|
|
"mean_token_accuracy": 0.16302479505538942,
|
|
"num_tokens": 16738313.0,
|
|
"step": 8970
|
|
},
|
|
{
|
|
"entropy": 5.952572774887085,
|
|
"epoch": 0.7925644648534087,
|
|
"grad_norm": 3.421875,
|
|
"learning_rate": 0.0004944180893222117,
|
|
"loss": 5.7871,
|
|
"mean_token_accuracy": 0.17404089868068695,
|
|
"num_tokens": 16747191.0,
|
|
"step": 8975
|
|
},
|
|
{
|
|
"entropy": 5.940504741668701,
|
|
"epoch": 0.793006004945249,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004944111161227496,
|
|
"loss": 5.8295,
|
|
"mean_token_accuracy": 0.16756651401519776,
|
|
"num_tokens": 16755921.0,
|
|
"step": 8980
|
|
},
|
|
{
|
|
"entropy": 6.005930614471436,
|
|
"epoch": 0.7934475450370894,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0004944041386251486,
|
|
"loss": 5.9722,
|
|
"mean_token_accuracy": 0.162301617115736,
|
|
"num_tokens": 16765394.0,
|
|
"step": 8985
|
|
},
|
|
{
|
|
"entropy": 6.021315050125122,
|
|
"epoch": 0.7938890851289298,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004943971568295454,
|
|
"loss": 5.851,
|
|
"mean_token_accuracy": 0.17589876651763917,
|
|
"num_tokens": 16774824.0,
|
|
"step": 8990
|
|
},
|
|
{
|
|
"entropy": 6.0411216735839846,
|
|
"epoch": 0.79433062522077,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004943901707360765,
|
|
"loss": 5.9653,
|
|
"mean_token_accuracy": 0.15621765553951264,
|
|
"num_tokens": 16783945.0,
|
|
"step": 8995
|
|
},
|
|
{
|
|
"entropy": 5.95935206413269,
|
|
"epoch": 0.7947721653126104,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.000494383180344879,
|
|
"loss": 5.7439,
|
|
"mean_token_accuracy": 0.17004499435424805,
|
|
"num_tokens": 16792618.0,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"epoch": 0.7947721653126104,
|
|
"eval_entropy": 5.774197348921917,
|
|
"eval_loss": 5.872763156890869,
|
|
"eval_mean_token_accuracy": 0.17264041615408,
|
|
"eval_num_tokens": 16792618.0,
|
|
"eval_runtime": 26.2257,
|
|
"eval_samples_per_second": 1346.581,
|
|
"eval_steps_per_second": 168.346,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"entropy": 5.941266775131226,
|
|
"epoch": 0.7952137054044507,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004943761856560898,
|
|
"loss": 5.8412,
|
|
"mean_token_accuracy": 0.1739298015832901,
|
|
"num_tokens": 16801210.0,
|
|
"step": 9005
|
|
},
|
|
{
|
|
"entropy": 6.00039587020874,
|
|
"epoch": 0.7956552454962911,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.000494369186669846,
|
|
"loss": 5.8748,
|
|
"mean_token_accuracy": 0.16477510184049607,
|
|
"num_tokens": 16810371.0,
|
|
"step": 9010
|
|
},
|
|
{
|
|
"entropy": 6.072319173812867,
|
|
"epoch": 0.7960967855881314,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004943621833862844,
|
|
"loss": 5.7852,
|
|
"mean_token_accuracy": 0.16916722655296326,
|
|
"num_tokens": 16818886.0,
|
|
"step": 9015
|
|
},
|
|
{
|
|
"entropy": 6.060644626617432,
|
|
"epoch": 0.7965383256799717,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004943551758055425,
|
|
"loss": 5.8459,
|
|
"mean_token_accuracy": 0.1706278830766678,
|
|
"num_tokens": 16827705.0,
|
|
"step": 9020
|
|
},
|
|
{
|
|
"entropy": 6.0029925346374515,
|
|
"epoch": 0.7969798657718121,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004943481639277574,
|
|
"loss": 5.8133,
|
|
"mean_token_accuracy": 0.16445554345846175,
|
|
"num_tokens": 16836313.0,
|
|
"step": 9025
|
|
},
|
|
{
|
|
"entropy": 5.99051866531372,
|
|
"epoch": 0.7974214058636524,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004943411477530665,
|
|
"loss": 5.9,
|
|
"mean_token_accuracy": 0.15801046937704086,
|
|
"num_tokens": 16845016.0,
|
|
"step": 9030
|
|
},
|
|
{
|
|
"entropy": 6.039083003997803,
|
|
"epoch": 0.7978629459554928,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004943341272816073,
|
|
"loss": 5.8104,
|
|
"mean_token_accuracy": 0.16541790813207627,
|
|
"num_tokens": 16854025.0,
|
|
"step": 9035
|
|
},
|
|
{
|
|
"entropy": 6.015139532089234,
|
|
"epoch": 0.7983044860473331,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004943271025135173,
|
|
"loss": 5.7965,
|
|
"mean_token_accuracy": 0.16925208270549774,
|
|
"num_tokens": 16862859.0,
|
|
"step": 9040
|
|
},
|
|
{
|
|
"entropy": 5.987388277053833,
|
|
"epoch": 0.7987460261391734,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004943200734489341,
|
|
"loss": 5.9064,
|
|
"mean_token_accuracy": 0.16620191037654877,
|
|
"num_tokens": 16872226.0,
|
|
"step": 9045
|
|
},
|
|
{
|
|
"entropy": 5.935065841674804,
|
|
"epoch": 0.7991875662310137,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004943130400879953,
|
|
"loss": 5.7871,
|
|
"mean_token_accuracy": 0.17395144551992417,
|
|
"num_tokens": 16881977.0,
|
|
"step": 9050
|
|
},
|
|
{
|
|
"entropy": 6.003419685363769,
|
|
"epoch": 0.7996291063228541,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004943060024308389,
|
|
"loss": 5.879,
|
|
"mean_token_accuracy": 0.16055256724357606,
|
|
"num_tokens": 16891656.0,
|
|
"step": 9055
|
|
},
|
|
{
|
|
"entropy": 6.008896398544311,
|
|
"epoch": 0.8000706464146945,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004942989604776026,
|
|
"loss": 5.7544,
|
|
"mean_token_accuracy": 0.1753992348909378,
|
|
"num_tokens": 16900944.0,
|
|
"step": 9060
|
|
},
|
|
{
|
|
"entropy": 5.880461978912353,
|
|
"epoch": 0.8005121865065348,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004942919142284244,
|
|
"loss": 5.7738,
|
|
"mean_token_accuracy": 0.17210271209478378,
|
|
"num_tokens": 16910154.0,
|
|
"step": 9065
|
|
},
|
|
{
|
|
"entropy": 6.027874279022217,
|
|
"epoch": 0.8009537265983752,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004942848636834423,
|
|
"loss": 5.9076,
|
|
"mean_token_accuracy": 0.16426340490579605,
|
|
"num_tokens": 16919694.0,
|
|
"step": 9070
|
|
},
|
|
{
|
|
"entropy": 6.09167332649231,
|
|
"epoch": 0.8013952666902154,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004942778088427944,
|
|
"loss": 5.8551,
|
|
"mean_token_accuracy": 0.16622861176729203,
|
|
"num_tokens": 16928530.0,
|
|
"step": 9075
|
|
},
|
|
{
|
|
"entropy": 6.067292928695679,
|
|
"epoch": 0.8018368067820558,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000494270749706619,
|
|
"loss": 5.8796,
|
|
"mean_token_accuracy": 0.16309232711791993,
|
|
"num_tokens": 16938614.0,
|
|
"step": 9080
|
|
},
|
|
{
|
|
"entropy": 6.004611015319824,
|
|
"epoch": 0.8022783468738961,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004942636862750544,
|
|
"loss": 5.8232,
|
|
"mean_token_accuracy": 0.1672021970152855,
|
|
"num_tokens": 16947655.0,
|
|
"step": 9085
|
|
},
|
|
{
|
|
"entropy": 6.013993453979492,
|
|
"epoch": 0.8027198869657365,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0004942566185482387,
|
|
"loss": 5.9213,
|
|
"mean_token_accuracy": 0.1601979151368141,
|
|
"num_tokens": 16957101.0,
|
|
"step": 9090
|
|
},
|
|
{
|
|
"entropy": 6.012250661849976,
|
|
"epoch": 0.8031614270575769,
|
|
"grad_norm": 10.5,
|
|
"learning_rate": 0.0004942495465263106,
|
|
"loss": 5.9408,
|
|
"mean_token_accuracy": 0.15858723670244218,
|
|
"num_tokens": 16966421.0,
|
|
"step": 9095
|
|
},
|
|
{
|
|
"entropy": 6.113315486907959,
|
|
"epoch": 0.8036029671494171,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004942424702094085,
|
|
"loss": 5.841,
|
|
"mean_token_accuracy": 0.1748947262763977,
|
|
"num_tokens": 16975656.0,
|
|
"step": 9100
|
|
},
|
|
{
|
|
"entropy": 6.038866281509399,
|
|
"epoch": 0.8040445072412575,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004942353895976713,
|
|
"loss": 5.8954,
|
|
"mean_token_accuracy": 0.16503171473741532,
|
|
"num_tokens": 16984409.0,
|
|
"step": 9105
|
|
},
|
|
{
|
|
"entropy": 5.946926879882812,
|
|
"epoch": 0.8044860473330978,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004942283046912374,
|
|
"loss": 5.7914,
|
|
"mean_token_accuracy": 0.1615387961268425,
|
|
"num_tokens": 16992728.0,
|
|
"step": 9110
|
|
},
|
|
{
|
|
"entropy": 5.988664960861206,
|
|
"epoch": 0.8049275874249382,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004942212154902456,
|
|
"loss": 5.7094,
|
|
"mean_token_accuracy": 0.17891569435596466,
|
|
"num_tokens": 17001616.0,
|
|
"step": 9115
|
|
},
|
|
{
|
|
"entropy": 6.016485023498535,
|
|
"epoch": 0.8053691275167785,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.000494214121994835,
|
|
"loss": 5.8815,
|
|
"mean_token_accuracy": 0.1644422948360443,
|
|
"num_tokens": 17010655.0,
|
|
"step": 9120
|
|
},
|
|
{
|
|
"entropy": 5.981185865402222,
|
|
"epoch": 0.8058106676086189,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004942070242051444,
|
|
"loss": 5.8206,
|
|
"mean_token_accuracy": 0.16579540222883224,
|
|
"num_tokens": 17019819.0,
|
|
"step": 9125
|
|
},
|
|
{
|
|
"entropy": 6.017875623703003,
|
|
"epoch": 0.8062522077004592,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004941999221213129,
|
|
"loss": 5.869,
|
|
"mean_token_accuracy": 0.16917242109775543,
|
|
"num_tokens": 17029218.0,
|
|
"step": 9130
|
|
},
|
|
{
|
|
"entropy": 6.040101957321167,
|
|
"epoch": 0.8066937477922995,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004941928157434796,
|
|
"loss": 5.7759,
|
|
"mean_token_accuracy": 0.1664850026369095,
|
|
"num_tokens": 17037880.0,
|
|
"step": 9135
|
|
},
|
|
{
|
|
"entropy": 5.899328088760376,
|
|
"epoch": 0.8071352878841399,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004941857050717836,
|
|
"loss": 5.8518,
|
|
"mean_token_accuracy": 0.16521087139844895,
|
|
"num_tokens": 17047934.0,
|
|
"step": 9140
|
|
},
|
|
{
|
|
"entropy": 6.061554431915283,
|
|
"epoch": 0.8075768279759802,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004941785901063642,
|
|
"loss": 5.9161,
|
|
"mean_token_accuracy": 0.16436673253774642,
|
|
"num_tokens": 17057258.0,
|
|
"step": 9145
|
|
},
|
|
{
|
|
"entropy": 6.060186815261841,
|
|
"epoch": 0.8080183680678206,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0004941714708473611,
|
|
"loss": 5.8742,
|
|
"mean_token_accuracy": 0.16407700031995773,
|
|
"num_tokens": 17066387.0,
|
|
"step": 9150
|
|
},
|
|
{
|
|
"entropy": 5.970388793945313,
|
|
"epoch": 0.8084599081596608,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004941643472949134,
|
|
"loss": 5.8069,
|
|
"mean_token_accuracy": 0.16943138539791108,
|
|
"num_tokens": 17075952.0,
|
|
"step": 9155
|
|
},
|
|
{
|
|
"entropy": 5.959314489364624,
|
|
"epoch": 0.8089014482515012,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 0.0004941572194491608,
|
|
"loss": 5.6841,
|
|
"mean_token_accuracy": 0.17572013437747955,
|
|
"num_tokens": 17084209.0,
|
|
"step": 9160
|
|
},
|
|
{
|
|
"entropy": 5.960428619384766,
|
|
"epoch": 0.8093429883433416,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.000494150087310243,
|
|
"loss": 5.903,
|
|
"mean_token_accuracy": 0.16461869776248933,
|
|
"num_tokens": 17093395.0,
|
|
"step": 9165
|
|
},
|
|
{
|
|
"entropy": 6.011094474792481,
|
|
"epoch": 0.8097845284351819,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004941429508782995,
|
|
"loss": 5.8087,
|
|
"mean_token_accuracy": 0.16933547407388688,
|
|
"num_tokens": 17102646.0,
|
|
"step": 9170
|
|
},
|
|
{
|
|
"entropy": 6.0214849472045895,
|
|
"epoch": 0.8102260685270223,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 0.0004941358101534703,
|
|
"loss": 5.8163,
|
|
"mean_token_accuracy": 0.1657823845744133,
|
|
"num_tokens": 17111530.0,
|
|
"step": 9175
|
|
},
|
|
{
|
|
"entropy": 5.9327880859375,
|
|
"epoch": 0.8106676086188626,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004941286651358952,
|
|
"loss": 5.7754,
|
|
"mean_token_accuracy": 0.17723568230867387,
|
|
"num_tokens": 17119659.0,
|
|
"step": 9180
|
|
},
|
|
{
|
|
"entropy": 5.851162385940552,
|
|
"epoch": 0.8111091487107029,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.0004941215158257142,
|
|
"loss": 5.7937,
|
|
"mean_token_accuracy": 0.16790302991867065,
|
|
"num_tokens": 17127931.0,
|
|
"step": 9185
|
|
},
|
|
{
|
|
"entropy": 5.973969125747681,
|
|
"epoch": 0.8115506888025432,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 0.0004941143622230672,
|
|
"loss": 5.8164,
|
|
"mean_token_accuracy": 0.17351385802030564,
|
|
"num_tokens": 17137374.0,
|
|
"step": 9190
|
|
},
|
|
{
|
|
"entropy": 6.0743568420410154,
|
|
"epoch": 0.8119922288943836,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004941072043280946,
|
|
"loss": 5.885,
|
|
"mean_token_accuracy": 0.16001378148794174,
|
|
"num_tokens": 17147257.0,
|
|
"step": 9195
|
|
},
|
|
{
|
|
"entropy": 6.005939292907715,
|
|
"epoch": 0.812433768986224,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004941000421409365,
|
|
"loss": 5.7516,
|
|
"mean_token_accuracy": 0.17450112253427505,
|
|
"num_tokens": 17156233.0,
|
|
"step": 9200
|
|
},
|
|
{
|
|
"entropy": 6.042790842056275,
|
|
"epoch": 0.8128753090780643,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004940928756617331,
|
|
"loss": 5.9396,
|
|
"mean_token_accuracy": 0.15833428949117662,
|
|
"num_tokens": 17166364.0,
|
|
"step": 9205
|
|
},
|
|
{
|
|
"entropy": 5.958063983917237,
|
|
"epoch": 0.8133168491699047,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004940857048906248,
|
|
"loss": 5.8384,
|
|
"mean_token_accuracy": 0.1704106017947197,
|
|
"num_tokens": 17175853.0,
|
|
"step": 9210
|
|
},
|
|
{
|
|
"entropy": 6.029957246780396,
|
|
"epoch": 0.8137583892617449,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004940785298277523,
|
|
"loss": 5.9398,
|
|
"mean_token_accuracy": 0.1621903046965599,
|
|
"num_tokens": 17187849.0,
|
|
"step": 9215
|
|
},
|
|
{
|
|
"entropy": 6.07569317817688,
|
|
"epoch": 0.8141999293535853,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.000494071350473256,
|
|
"loss": 5.8259,
|
|
"mean_token_accuracy": 0.1650031879544258,
|
|
"num_tokens": 17196646.0,
|
|
"step": 9220
|
|
},
|
|
{
|
|
"entropy": 6.061435317993164,
|
|
"epoch": 0.8146414694454256,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0004940641668272765,
|
|
"loss": 5.9578,
|
|
"mean_token_accuracy": 0.16093774735927582,
|
|
"num_tokens": 17205919.0,
|
|
"step": 9225
|
|
},
|
|
{
|
|
"entropy": 6.044738626480102,
|
|
"epoch": 0.815083009537266,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004940569788899544,
|
|
"loss": 5.9068,
|
|
"mean_token_accuracy": 0.16127809137105942,
|
|
"num_tokens": 17215085.0,
|
|
"step": 9230
|
|
},
|
|
{
|
|
"entropy": 5.978901338577271,
|
|
"epoch": 0.8155245496291064,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004940497866614309,
|
|
"loss": 5.886,
|
|
"mean_token_accuracy": 0.15986076593399048,
|
|
"num_tokens": 17223633.0,
|
|
"step": 9235
|
|
},
|
|
{
|
|
"entropy": 6.01971402168274,
|
|
"epoch": 0.8159660897209466,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004940425901418465,
|
|
"loss": 5.7953,
|
|
"mean_token_accuracy": 0.1701249048113823,
|
|
"num_tokens": 17232915.0,
|
|
"step": 9240
|
|
},
|
|
{
|
|
"entropy": 6.049149703979492,
|
|
"epoch": 0.816407629812787,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004940353893313423,
|
|
"loss": 5.9239,
|
|
"mean_token_accuracy": 0.1602263942360878,
|
|
"num_tokens": 17242392.0,
|
|
"step": 9245
|
|
},
|
|
{
|
|
"entropy": 6.114197826385498,
|
|
"epoch": 0.8168491699046273,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004940281842300596,
|
|
"loss": 5.9946,
|
|
"mean_token_accuracy": 0.15723732411861419,
|
|
"num_tokens": 17252259.0,
|
|
"step": 9250
|
|
},
|
|
{
|
|
"entropy": 6.034798717498779,
|
|
"epoch": 0.8172907099964677,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004940209748381392,
|
|
"loss": 5.8621,
|
|
"mean_token_accuracy": 0.16262195855379105,
|
|
"num_tokens": 17261584.0,
|
|
"step": 9255
|
|
},
|
|
{
|
|
"entropy": 6.006899690628051,
|
|
"epoch": 0.817732250088308,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004940137611557225,
|
|
"loss": 5.8059,
|
|
"mean_token_accuracy": 0.16907447427511216,
|
|
"num_tokens": 17270901.0,
|
|
"step": 9260
|
|
},
|
|
{
|
|
"entropy": 6.030629062652588,
|
|
"epoch": 0.8181737901801484,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004940065431829508,
|
|
"loss": 5.8586,
|
|
"mean_token_accuracy": 0.16188233494758605,
|
|
"num_tokens": 17280024.0,
|
|
"step": 9265
|
|
},
|
|
{
|
|
"entropy": 6.03720874786377,
|
|
"epoch": 0.8186153302719887,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004939993209199654,
|
|
"loss": 5.9058,
|
|
"mean_token_accuracy": 0.16143932938575745,
|
|
"num_tokens": 17290131.0,
|
|
"step": 9270
|
|
},
|
|
{
|
|
"entropy": 6.105310440063477,
|
|
"epoch": 0.819056870363829,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004939920943669079,
|
|
"loss": 5.9044,
|
|
"mean_token_accuracy": 0.1629559114575386,
|
|
"num_tokens": 17299453.0,
|
|
"step": 9275
|
|
},
|
|
{
|
|
"entropy": 6.005107450485229,
|
|
"epoch": 0.8194984104556694,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004939848635239198,
|
|
"loss": 5.8754,
|
|
"mean_token_accuracy": 0.163715136051178,
|
|
"num_tokens": 17309344.0,
|
|
"step": 9280
|
|
},
|
|
{
|
|
"entropy": 6.007379484176636,
|
|
"epoch": 0.8199399505475097,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004939776283911429,
|
|
"loss": 5.7864,
|
|
"mean_token_accuracy": 0.17647817730903625,
|
|
"num_tokens": 17318636.0,
|
|
"step": 9285
|
|
},
|
|
{
|
|
"entropy": 5.983161783218383,
|
|
"epoch": 0.8203814906393501,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004939703889687186,
|
|
"loss": 5.8451,
|
|
"mean_token_accuracy": 0.16662515699863434,
|
|
"num_tokens": 17328146.0,
|
|
"step": 9290
|
|
},
|
|
{
|
|
"entropy": 5.976247024536133,
|
|
"epoch": 0.8208230307311903,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004939631452567891,
|
|
"loss": 5.9131,
|
|
"mean_token_accuracy": 0.16368812918663025,
|
|
"num_tokens": 17338542.0,
|
|
"step": 9295
|
|
},
|
|
{
|
|
"entropy": 6.122719049453735,
|
|
"epoch": 0.8212645708230307,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.000493955897255496,
|
|
"loss": 5.8842,
|
|
"mean_token_accuracy": 0.16635640561580659,
|
|
"num_tokens": 17347577.0,
|
|
"step": 9300
|
|
},
|
|
{
|
|
"entropy": 6.022024011611938,
|
|
"epoch": 0.8217061109148711,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004939486449649814,
|
|
"loss": 5.8624,
|
|
"mean_token_accuracy": 0.16558304727077483,
|
|
"num_tokens": 17356554.0,
|
|
"step": 9305
|
|
},
|
|
{
|
|
"entropy": 5.960217761993408,
|
|
"epoch": 0.8221476510067114,
|
|
"grad_norm": 2.828125,
|
|
"learning_rate": 0.0004939413883853873,
|
|
"loss": 5.8814,
|
|
"mean_token_accuracy": 0.1689998045563698,
|
|
"num_tokens": 17366432.0,
|
|
"step": 9310
|
|
},
|
|
{
|
|
"entropy": 6.087877368927002,
|
|
"epoch": 0.8225891910985518,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.000493934127516856,
|
|
"loss": 5.8458,
|
|
"mean_token_accuracy": 0.1692266508936882,
|
|
"num_tokens": 17376556.0,
|
|
"step": 9315
|
|
},
|
|
{
|
|
"entropy": 6.018303537368775,
|
|
"epoch": 0.823030731190392,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004939268623595297,
|
|
"loss": 5.8919,
|
|
"mean_token_accuracy": 0.15746195390820503,
|
|
"num_tokens": 17385382.0,
|
|
"step": 9320
|
|
},
|
|
{
|
|
"entropy": 5.992043781280517,
|
|
"epoch": 0.8234722712822324,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004939195929135506,
|
|
"loss": 5.7691,
|
|
"mean_token_accuracy": 0.1613371714949608,
|
|
"num_tokens": 17393229.0,
|
|
"step": 9325
|
|
},
|
|
{
|
|
"entropy": 5.988037776947022,
|
|
"epoch": 0.8239138113740727,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004939123191790612,
|
|
"loss": 5.7709,
|
|
"mean_token_accuracy": 0.16014604717493058,
|
|
"num_tokens": 17402439.0,
|
|
"step": 9330
|
|
},
|
|
{
|
|
"entropy": 5.987378358840942,
|
|
"epoch": 0.8243553514659131,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004939050411562041,
|
|
"loss": 5.9294,
|
|
"mean_token_accuracy": 0.15827521085739135,
|
|
"num_tokens": 17412349.0,
|
|
"step": 9335
|
|
},
|
|
{
|
|
"entropy": 5.954100561141968,
|
|
"epoch": 0.8247968915577535,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004938977588451216,
|
|
"loss": 5.6967,
|
|
"mean_token_accuracy": 0.17938645631074907,
|
|
"num_tokens": 17420842.0,
|
|
"step": 9340
|
|
},
|
|
{
|
|
"entropy": 6.057862091064453,
|
|
"epoch": 0.8252384316495938,
|
|
"grad_norm": 2.75,
|
|
"learning_rate": 0.0004938904722459565,
|
|
"loss": 5.7685,
|
|
"mean_token_accuracy": 0.1707105204463005,
|
|
"num_tokens": 17429877.0,
|
|
"step": 9345
|
|
},
|
|
{
|
|
"entropy": 6.008994388580322,
|
|
"epoch": 0.8256799717414341,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004938831813588515,
|
|
"loss": 5.8971,
|
|
"mean_token_accuracy": 0.1633854404091835,
|
|
"num_tokens": 17439508.0,
|
|
"step": 9350
|
|
},
|
|
{
|
|
"entropy": 5.926471376419068,
|
|
"epoch": 0.8261215118332744,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004938758861839495,
|
|
"loss": 5.8169,
|
|
"mean_token_accuracy": 0.16402943134307862,
|
|
"num_tokens": 17449004.0,
|
|
"step": 9355
|
|
},
|
|
{
|
|
"entropy": 5.991986894607544,
|
|
"epoch": 0.8265630519251148,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004938685867213934,
|
|
"loss": 5.8272,
|
|
"mean_token_accuracy": 0.1690111994743347,
|
|
"num_tokens": 17458462.0,
|
|
"step": 9360
|
|
},
|
|
{
|
|
"entropy": 6.053527164459228,
|
|
"epoch": 0.8270045920169551,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004938612829713262,
|
|
"loss": 5.8479,
|
|
"mean_token_accuracy": 0.16346956491470338,
|
|
"num_tokens": 17466808.0,
|
|
"step": 9365
|
|
},
|
|
{
|
|
"entropy": 6.0850182056427,
|
|
"epoch": 0.8274461321087955,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004938539749338909,
|
|
"loss": 5.8984,
|
|
"mean_token_accuracy": 0.1645541086792946,
|
|
"num_tokens": 17476441.0,
|
|
"step": 9370
|
|
},
|
|
{
|
|
"entropy": 6.065642690658569,
|
|
"epoch": 0.8278876722006359,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0004938466626092308,
|
|
"loss": 5.8183,
|
|
"mean_token_accuracy": 0.1734413832426071,
|
|
"num_tokens": 17485393.0,
|
|
"step": 9375
|
|
},
|
|
{
|
|
"entropy": 5.940054988861084,
|
|
"epoch": 0.8283292122924761,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000493839345997489,
|
|
"loss": 5.8528,
|
|
"mean_token_accuracy": 0.16922394186258316,
|
|
"num_tokens": 17494904.0,
|
|
"step": 9380
|
|
},
|
|
{
|
|
"entropy": 6.036320209503174,
|
|
"epoch": 0.8287707523843165,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004938320250988087,
|
|
"loss": 5.8517,
|
|
"mean_token_accuracy": 0.1592379555106163,
|
|
"num_tokens": 17505137.0,
|
|
"step": 9385
|
|
},
|
|
{
|
|
"entropy": 6.0248651027679445,
|
|
"epoch": 0.8292122924761568,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004938246999133337,
|
|
"loss": 5.9001,
|
|
"mean_token_accuracy": 0.1642383113503456,
|
|
"num_tokens": 17514039.0,
|
|
"step": 9390
|
|
},
|
|
{
|
|
"entropy": 5.958837652206421,
|
|
"epoch": 0.8296538325679972,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0004938173704412071,
|
|
"loss": 5.7857,
|
|
"mean_token_accuracy": 0.1705437883734703,
|
|
"num_tokens": 17522423.0,
|
|
"step": 9395
|
|
},
|
|
{
|
|
"entropy": 5.987477111816406,
|
|
"epoch": 0.8300953726598375,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004938100366825728,
|
|
"loss": 5.7645,
|
|
"mean_token_accuracy": 0.17269083708524705,
|
|
"num_tokens": 17531958.0,
|
|
"step": 9400
|
|
},
|
|
{
|
|
"entropy": 6.023804759979248,
|
|
"epoch": 0.8305369127516778,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004938026986375742,
|
|
"loss": 5.8286,
|
|
"mean_token_accuracy": 0.1699989214539528,
|
|
"num_tokens": 17540905.0,
|
|
"step": 9405
|
|
},
|
|
{
|
|
"entropy": 5.9978249073028564,
|
|
"epoch": 0.8309784528435182,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004937953563063553,
|
|
"loss": 5.8802,
|
|
"mean_token_accuracy": 0.1642656847834587,
|
|
"num_tokens": 17551007.0,
|
|
"step": 9410
|
|
},
|
|
{
|
|
"entropy": 5.944204664230346,
|
|
"epoch": 0.8314199929353585,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004937880096890598,
|
|
"loss": 5.7058,
|
|
"mean_token_accuracy": 0.17777763903141022,
|
|
"num_tokens": 17559403.0,
|
|
"step": 9415
|
|
},
|
|
{
|
|
"entropy": 5.989461374282837,
|
|
"epoch": 0.8318615330271989,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004937806587858315,
|
|
"loss": 5.8403,
|
|
"mean_token_accuracy": 0.17147947996854782,
|
|
"num_tokens": 17569191.0,
|
|
"step": 9420
|
|
},
|
|
{
|
|
"entropy": 6.078926372528076,
|
|
"epoch": 0.8323030731190392,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004937733035968147,
|
|
"loss": 5.9022,
|
|
"mean_token_accuracy": 0.15950139611959457,
|
|
"num_tokens": 17578538.0,
|
|
"step": 9425
|
|
},
|
|
{
|
|
"entropy": 5.990520238876343,
|
|
"epoch": 0.8327446132108796,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.0004937659441221531,
|
|
"loss": 5.794,
|
|
"mean_token_accuracy": 0.17096205651760102,
|
|
"num_tokens": 17587344.0,
|
|
"step": 9430
|
|
},
|
|
{
|
|
"entropy": 5.9082780361175535,
|
|
"epoch": 0.8331861533027198,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 0.0004937585803619912,
|
|
"loss": 5.7814,
|
|
"mean_token_accuracy": 0.17017468810081482,
|
|
"num_tokens": 17596033.0,
|
|
"step": 9435
|
|
},
|
|
{
|
|
"entropy": 6.001602983474731,
|
|
"epoch": 0.8336276933945602,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004937512123164731,
|
|
"loss": 5.7366,
|
|
"mean_token_accuracy": 0.1792237713932991,
|
|
"num_tokens": 17605860.0,
|
|
"step": 9440
|
|
},
|
|
{
|
|
"entropy": 6.037835311889649,
|
|
"epoch": 0.8340692334864006,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004937438399857431,
|
|
"loss": 5.8755,
|
|
"mean_token_accuracy": 0.16186862140893937,
|
|
"num_tokens": 17615819.0,
|
|
"step": 9445
|
|
},
|
|
{
|
|
"entropy": 6.026955842971802,
|
|
"epoch": 0.8345107735782409,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004937364633699459,
|
|
"loss": 5.8441,
|
|
"mean_token_accuracy": 0.16839220821857454,
|
|
"num_tokens": 17625478.0,
|
|
"step": 9450
|
|
},
|
|
{
|
|
"entropy": 6.022288179397583,
|
|
"epoch": 0.8349523136700813,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0004937290824692255,
|
|
"loss": 5.8191,
|
|
"mean_token_accuracy": 0.1682550862431526,
|
|
"num_tokens": 17635329.0,
|
|
"step": 9455
|
|
},
|
|
{
|
|
"entropy": 5.94835467338562,
|
|
"epoch": 0.8353938537619215,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004937216972837269,
|
|
"loss": 5.8152,
|
|
"mean_token_accuracy": 0.16099725067615508,
|
|
"num_tokens": 17644129.0,
|
|
"step": 9460
|
|
},
|
|
{
|
|
"entropy": 5.91757493019104,
|
|
"epoch": 0.8358353938537619,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0004937143078135946,
|
|
"loss": 5.7547,
|
|
"mean_token_accuracy": 0.17431456297636033,
|
|
"num_tokens": 17652836.0,
|
|
"step": 9465
|
|
},
|
|
{
|
|
"entropy": 6.034531450271606,
|
|
"epoch": 0.8362769339456022,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004937069140589735,
|
|
"loss": 5.7935,
|
|
"mean_token_accuracy": 0.16793447434902192,
|
|
"num_tokens": 17661222.0,
|
|
"step": 9470
|
|
},
|
|
{
|
|
"entropy": 5.951354742050171,
|
|
"epoch": 0.8367184740374426,
|
|
"grad_norm": 2.59375,
|
|
"learning_rate": 0.0004936995160200083,
|
|
"loss": 5.7551,
|
|
"mean_token_accuracy": 0.17580796182155609,
|
|
"num_tokens": 17669637.0,
|
|
"step": 9475
|
|
},
|
|
{
|
|
"entropy": 6.0513585090637205,
|
|
"epoch": 0.837160014129283,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.0004936921136968439,
|
|
"loss": 5.9381,
|
|
"mean_token_accuracy": 0.15244681164622306,
|
|
"num_tokens": 17678589.0,
|
|
"step": 9480
|
|
},
|
|
{
|
|
"entropy": 6.065783834457397,
|
|
"epoch": 0.8376015542211233,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.0004936847070896254,
|
|
"loss": 5.9067,
|
|
"mean_token_accuracy": 0.16115776300430298,
|
|
"num_tokens": 17689129.0,
|
|
"step": 9485
|
|
},
|
|
{
|
|
"entropy": 6.0849708080291744,
|
|
"epoch": 0.8380430943129636,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004936772961984979,
|
|
"loss": 5.9027,
|
|
"mean_token_accuracy": 0.1689728707075119,
|
|
"num_tokens": 17698864.0,
|
|
"step": 9490
|
|
},
|
|
{
|
|
"entropy": 6.058166265487671,
|
|
"epoch": 0.8384846344048039,
|
|
"grad_norm": 2.890625,
|
|
"learning_rate": 0.0004936698810236065,
|
|
"loss": 5.9882,
|
|
"mean_token_accuracy": 0.15745993703603745,
|
|
"num_tokens": 17707900.0,
|
|
"step": 9495
|
|
},
|
|
{
|
|
"entropy": 6.042195796966553,
|
|
"epoch": 0.8389261744966443,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004936624615650964,
|
|
"loss": 5.808,
|
|
"mean_token_accuracy": 0.17014045864343644,
|
|
"num_tokens": 17716983.0,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"entropy": 5.999377870559693,
|
|
"epoch": 0.8393677145884846,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004936550378231133,
|
|
"loss": 5.8048,
|
|
"mean_token_accuracy": 0.16574602127075194,
|
|
"num_tokens": 17726503.0,
|
|
"step": 9505
|
|
},
|
|
{
|
|
"entropy": 6.001359987258911,
|
|
"epoch": 0.839809254680325,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004936476097978023,
|
|
"loss": 5.8939,
|
|
"mean_token_accuracy": 0.1637062907218933,
|
|
"num_tokens": 17735708.0,
|
|
"step": 9510
|
|
},
|
|
{
|
|
"entropy": 5.994287729263306,
|
|
"epoch": 0.8402507947721654,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004936401774893088,
|
|
"loss": 5.8038,
|
|
"mean_token_accuracy": 0.16697103083133696,
|
|
"num_tokens": 17745169.0,
|
|
"step": 9515
|
|
},
|
|
{
|
|
"entropy": 6.0431797981262205,
|
|
"epoch": 0.8406923348640056,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004936327408977786,
|
|
"loss": 5.9432,
|
|
"mean_token_accuracy": 0.1595461219549179,
|
|
"num_tokens": 17755630.0,
|
|
"step": 9520
|
|
},
|
|
{
|
|
"entropy": 6.0142663478851315,
|
|
"epoch": 0.841133874955846,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004936253000233574,
|
|
"loss": 5.8283,
|
|
"mean_token_accuracy": 0.17293956726789475,
|
|
"num_tokens": 17764469.0,
|
|
"step": 9525
|
|
},
|
|
{
|
|
"entropy": 5.962830114364624,
|
|
"epoch": 0.8415754150476863,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004936178548661909,
|
|
"loss": 5.9207,
|
|
"mean_token_accuracy": 0.1591649293899536,
|
|
"num_tokens": 17774977.0,
|
|
"step": 9530
|
|
},
|
|
{
|
|
"entropy": 6.046345520019531,
|
|
"epoch": 0.8420169551395267,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.000493610405426425,
|
|
"loss": 5.8196,
|
|
"mean_token_accuracy": 0.16513993442058564,
|
|
"num_tokens": 17784250.0,
|
|
"step": 9535
|
|
},
|
|
{
|
|
"entropy": 6.038363361358643,
|
|
"epoch": 0.842458495231367,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004936029517042056,
|
|
"loss": 5.8518,
|
|
"mean_token_accuracy": 0.16764956265687941,
|
|
"num_tokens": 17793858.0,
|
|
"step": 9540
|
|
},
|
|
{
|
|
"entropy": 6.0372391700744625,
|
|
"epoch": 0.8429000353232073,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004935954936996787,
|
|
"loss": 5.9554,
|
|
"mean_token_accuracy": 0.16116243451833726,
|
|
"num_tokens": 17804330.0,
|
|
"step": 9545
|
|
},
|
|
{
|
|
"entropy": 6.033014535903931,
|
|
"epoch": 0.8433415754150477,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004935880314129903,
|
|
"loss": 5.8408,
|
|
"mean_token_accuracy": 0.17326375991106033,
|
|
"num_tokens": 17813750.0,
|
|
"step": 9550
|
|
},
|
|
{
|
|
"entropy": 5.969830417633057,
|
|
"epoch": 0.843783115506888,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004935805648442868,
|
|
"loss": 5.8038,
|
|
"mean_token_accuracy": 0.167977911233902,
|
|
"num_tokens": 17823537.0,
|
|
"step": 9555
|
|
},
|
|
{
|
|
"entropy": 5.982773494720459,
|
|
"epoch": 0.8442246555987284,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004935730939937143,
|
|
"loss": 5.8136,
|
|
"mean_token_accuracy": 0.17517689615488052,
|
|
"num_tokens": 17833298.0,
|
|
"step": 9560
|
|
},
|
|
{
|
|
"entropy": 5.976465034484863,
|
|
"epoch": 0.8446661956905687,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004935656188614192,
|
|
"loss": 5.737,
|
|
"mean_token_accuracy": 0.1733606703579426,
|
|
"num_tokens": 17842339.0,
|
|
"step": 9565
|
|
},
|
|
{
|
|
"entropy": 6.008795309066772,
|
|
"epoch": 0.845107735782409,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004935581394475479,
|
|
"loss": 5.9289,
|
|
"mean_token_accuracy": 0.16886893808841705,
|
|
"num_tokens": 17851827.0,
|
|
"step": 9570
|
|
},
|
|
{
|
|
"entropy": 5.95584626197815,
|
|
"epoch": 0.8455492758742493,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004935506557522469,
|
|
"loss": 5.7982,
|
|
"mean_token_accuracy": 0.17455206960439681,
|
|
"num_tokens": 17860788.0,
|
|
"step": 9575
|
|
},
|
|
{
|
|
"entropy": 6.097922229766846,
|
|
"epoch": 0.8459908159660897,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.000493543167775663,
|
|
"loss": 5.7847,
|
|
"mean_token_accuracy": 0.1724754050374031,
|
|
"num_tokens": 17869849.0,
|
|
"step": 9580
|
|
},
|
|
{
|
|
"entropy": 6.013970279693604,
|
|
"epoch": 0.8464323560579301,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004935356755179426,
|
|
"loss": 5.8191,
|
|
"mean_token_accuracy": 0.16670462042093276,
|
|
"num_tokens": 17879431.0,
|
|
"step": 9585
|
|
},
|
|
{
|
|
"entropy": 6.042088985443115,
|
|
"epoch": 0.8468738961497704,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004935281789792326,
|
|
"loss": 5.9476,
|
|
"mean_token_accuracy": 0.16087115854024886,
|
|
"num_tokens": 17888599.0,
|
|
"step": 9590
|
|
},
|
|
{
|
|
"entropy": 5.985098457336425,
|
|
"epoch": 0.8473154362416108,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004935206781596799,
|
|
"loss": 5.8266,
|
|
"mean_token_accuracy": 0.17410103529691695,
|
|
"num_tokens": 17898662.0,
|
|
"step": 9595
|
|
},
|
|
{
|
|
"entropy": 6.038266277313232,
|
|
"epoch": 0.847756976333451,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004935131730594315,
|
|
"loss": 5.8564,
|
|
"mean_token_accuracy": 0.16674958169460297,
|
|
"num_tokens": 17907810.0,
|
|
"step": 9600
|
|
},
|
|
{
|
|
"entropy": 5.981097459793091,
|
|
"epoch": 0.8481985164252914,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004935056636786342,
|
|
"loss": 5.8282,
|
|
"mean_token_accuracy": 0.16734828054904938,
|
|
"num_tokens": 17917611.0,
|
|
"step": 9605
|
|
},
|
|
{
|
|
"entropy": 5.966639995574951,
|
|
"epoch": 0.8486400565171317,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004934981500174353,
|
|
"loss": 5.8433,
|
|
"mean_token_accuracy": 0.16487517058849335,
|
|
"num_tokens": 17926076.0,
|
|
"step": 9610
|
|
},
|
|
{
|
|
"entropy": 6.038370132446289,
|
|
"epoch": 0.8490815966089721,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.0004934906320759818,
|
|
"loss": 5.8761,
|
|
"mean_token_accuracy": 0.16527412384748458,
|
|
"num_tokens": 17935521.0,
|
|
"step": 9615
|
|
},
|
|
{
|
|
"entropy": 6.072547054290771,
|
|
"epoch": 0.8495231367008125,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004934831098544211,
|
|
"loss": 5.8459,
|
|
"mean_token_accuracy": 0.17391809970140457,
|
|
"num_tokens": 17944849.0,
|
|
"step": 9620
|
|
},
|
|
{
|
|
"entropy": 6.06336669921875,
|
|
"epoch": 0.8499646767926528,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004934755833529007,
|
|
"loss": 5.8252,
|
|
"mean_token_accuracy": 0.16894840747117995,
|
|
"num_tokens": 17953988.0,
|
|
"step": 9625
|
|
},
|
|
{
|
|
"entropy": 5.969831466674805,
|
|
"epoch": 0.8504062168844931,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004934680525715677,
|
|
"loss": 5.8247,
|
|
"mean_token_accuracy": 0.1663273498415947,
|
|
"num_tokens": 17963319.0,
|
|
"step": 9630
|
|
},
|
|
{
|
|
"entropy": 6.004313802719116,
|
|
"epoch": 0.8508477569763334,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004934605175105698,
|
|
"loss": 5.7655,
|
|
"mean_token_accuracy": 0.1794113963842392,
|
|
"num_tokens": 17972054.0,
|
|
"step": 9635
|
|
},
|
|
{
|
|
"entropy": 6.0897088050842285,
|
|
"epoch": 0.8512892970681738,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004934529781700546,
|
|
"loss": 5.9241,
|
|
"mean_token_accuracy": 0.16022397577762604,
|
|
"num_tokens": 17981630.0,
|
|
"step": 9640
|
|
},
|
|
{
|
|
"entropy": 6.045695877075195,
|
|
"epoch": 0.8517308371600141,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004934454345501698,
|
|
"loss": 5.8614,
|
|
"mean_token_accuracy": 0.16092253774404525,
|
|
"num_tokens": 17990987.0,
|
|
"step": 9645
|
|
},
|
|
{
|
|
"entropy": 5.906431531906128,
|
|
"epoch": 0.8521723772518545,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004934378866510633,
|
|
"loss": 5.764,
|
|
"mean_token_accuracy": 0.17493847757577896,
|
|
"num_tokens": 17999284.0,
|
|
"step": 9650
|
|
},
|
|
{
|
|
"entropy": 5.998637628555298,
|
|
"epoch": 0.8526139173436948,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004934303344728828,
|
|
"loss": 5.7818,
|
|
"mean_token_accuracy": 0.16770309656858445,
|
|
"num_tokens": 18008670.0,
|
|
"step": 9655
|
|
},
|
|
{
|
|
"entropy": 5.967482423782348,
|
|
"epoch": 0.8530554574355351,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004934227780157761,
|
|
"loss": 5.7853,
|
|
"mean_token_accuracy": 0.17448310256004335,
|
|
"num_tokens": 18018374.0,
|
|
"step": 9660
|
|
},
|
|
{
|
|
"entropy": 6.040644407272339,
|
|
"epoch": 0.8534969975273755,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004934152172798916,
|
|
"loss": 5.8803,
|
|
"mean_token_accuracy": 0.1725458726286888,
|
|
"num_tokens": 18026943.0,
|
|
"step": 9665
|
|
},
|
|
{
|
|
"entropy": 5.996393537521362,
|
|
"epoch": 0.8539385376192158,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004934076522653772,
|
|
"loss": 5.8137,
|
|
"mean_token_accuracy": 0.17032251507043839,
|
|
"num_tokens": 18036270.0,
|
|
"step": 9670
|
|
},
|
|
{
|
|
"entropy": 5.9290753364562985,
|
|
"epoch": 0.8543800777110562,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.000493400082972381,
|
|
"loss": 5.8042,
|
|
"mean_token_accuracy": 0.16725092232227326,
|
|
"num_tokens": 18045611.0,
|
|
"step": 9675
|
|
},
|
|
{
|
|
"entropy": 5.9702893733978275,
|
|
"epoch": 0.8548216178028964,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004933925094010516,
|
|
"loss": 5.8499,
|
|
"mean_token_accuracy": 0.16466162502765655,
|
|
"num_tokens": 18055152.0,
|
|
"step": 9680
|
|
},
|
|
{
|
|
"entropy": 6.040339088439941,
|
|
"epoch": 0.8552631578947368,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004933849315515369,
|
|
"loss": 5.8251,
|
|
"mean_token_accuracy": 0.16667694002389907,
|
|
"num_tokens": 18064376.0,
|
|
"step": 9685
|
|
},
|
|
{
|
|
"entropy": 5.933894348144531,
|
|
"epoch": 0.8557046979865772,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004933773494239857,
|
|
"loss": 5.742,
|
|
"mean_token_accuracy": 0.17865219563245774,
|
|
"num_tokens": 18073569.0,
|
|
"step": 9690
|
|
},
|
|
{
|
|
"entropy": 5.9812424659729,
|
|
"epoch": 0.8561462380784175,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0004933697630185464,
|
|
"loss": 5.8365,
|
|
"mean_token_accuracy": 0.1723334863781929,
|
|
"num_tokens": 18083264.0,
|
|
"step": 9695
|
|
},
|
|
{
|
|
"entropy": 6.035292339324951,
|
|
"epoch": 0.8565877781702579,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004933621723353677,
|
|
"loss": 5.8311,
|
|
"mean_token_accuracy": 0.16717321500182153,
|
|
"num_tokens": 18093151.0,
|
|
"step": 9700
|
|
},
|
|
{
|
|
"entropy": 5.95260443687439,
|
|
"epoch": 0.8570293182620982,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.000493354577374598,
|
|
"loss": 5.7354,
|
|
"mean_token_accuracy": 0.1816457837820053,
|
|
"num_tokens": 18102087.0,
|
|
"step": 9705
|
|
},
|
|
{
|
|
"entropy": 5.994460964202881,
|
|
"epoch": 0.8574708583539385,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004933469781363865,
|
|
"loss": 5.8703,
|
|
"mean_token_accuracy": 0.16676534563302994,
|
|
"num_tokens": 18111854.0,
|
|
"step": 9710
|
|
},
|
|
{
|
|
"entropy": 6.052928924560547,
|
|
"epoch": 0.8579123984457788,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004933393746208819,
|
|
"loss": 5.8892,
|
|
"mean_token_accuracy": 0.16378687620162963,
|
|
"num_tokens": 18120520.0,
|
|
"step": 9715
|
|
},
|
|
{
|
|
"entropy": 6.051138830184937,
|
|
"epoch": 0.8583539385376192,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004933317668282331,
|
|
"loss": 5.9426,
|
|
"mean_token_accuracy": 0.16159225404262542,
|
|
"num_tokens": 18130557.0,
|
|
"step": 9720
|
|
},
|
|
{
|
|
"entropy": 6.075818586349487,
|
|
"epoch": 0.8587954786294596,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004933241547585891,
|
|
"loss": 5.7244,
|
|
"mean_token_accuracy": 0.17409752905368805,
|
|
"num_tokens": 18139199.0,
|
|
"step": 9725
|
|
},
|
|
{
|
|
"entropy": 5.959064102172851,
|
|
"epoch": 0.8592370187212999,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004933165384120991,
|
|
"loss": 5.7982,
|
|
"mean_token_accuracy": 0.16895681619644165,
|
|
"num_tokens": 18150039.0,
|
|
"step": 9730
|
|
},
|
|
{
|
|
"entropy": 5.912999486923217,
|
|
"epoch": 0.8596785588131403,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004933089177889124,
|
|
"loss": 5.7993,
|
|
"mean_token_accuracy": 0.16626023799180983,
|
|
"num_tokens": 18159451.0,
|
|
"step": 9735
|
|
},
|
|
{
|
|
"entropy": 6.015667581558228,
|
|
"epoch": 0.8601200989049805,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 0.0004933012928891781,
|
|
"loss": 5.8742,
|
|
"mean_token_accuracy": 0.16694561541080474,
|
|
"num_tokens": 18169896.0,
|
|
"step": 9740
|
|
},
|
|
{
|
|
"entropy": 6.040274524688721,
|
|
"epoch": 0.8605616389968209,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004932936637130456,
|
|
"loss": 5.7726,
|
|
"mean_token_accuracy": 0.17514863312244416,
|
|
"num_tokens": 18177764.0,
|
|
"step": 9745
|
|
},
|
|
{
|
|
"entropy": 6.000326204299927,
|
|
"epoch": 0.8610031790886612,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004932860302606644,
|
|
"loss": 5.8506,
|
|
"mean_token_accuracy": 0.1685821920633316,
|
|
"num_tokens": 18186940.0,
|
|
"step": 9750
|
|
},
|
|
{
|
|
"entropy": 5.984031820297242,
|
|
"epoch": 0.8614447191805016,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004932783925321838,
|
|
"loss": 5.7285,
|
|
"mean_token_accuracy": 0.17475786805152893,
|
|
"num_tokens": 18195374.0,
|
|
"step": 9755
|
|
},
|
|
{
|
|
"entropy": 5.933980989456177,
|
|
"epoch": 0.861886259272342,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.000493270750527754,
|
|
"loss": 5.8251,
|
|
"mean_token_accuracy": 0.1692585453391075,
|
|
"num_tokens": 18204779.0,
|
|
"step": 9760
|
|
},
|
|
{
|
|
"entropy": 5.94135684967041,
|
|
"epoch": 0.8623277993641822,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004932631042475241,
|
|
"loss": 5.7685,
|
|
"mean_token_accuracy": 0.1741400569677353,
|
|
"num_tokens": 18214099.0,
|
|
"step": 9765
|
|
},
|
|
{
|
|
"entropy": 5.960148763656616,
|
|
"epoch": 0.8627693394560226,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004932554536916441,
|
|
"loss": 5.8428,
|
|
"mean_token_accuracy": 0.17174284309148788,
|
|
"num_tokens": 18223190.0,
|
|
"step": 9770
|
|
},
|
|
{
|
|
"entropy": 6.038312864303589,
|
|
"epoch": 0.8632108795478629,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.000493247798860264,
|
|
"loss": 5.8336,
|
|
"mean_token_accuracy": 0.17261188477277756,
|
|
"num_tokens": 18232380.0,
|
|
"step": 9775
|
|
},
|
|
{
|
|
"entropy": 6.0753703117370605,
|
|
"epoch": 0.8636524196397033,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004932401397535337,
|
|
"loss": 5.9178,
|
|
"mean_token_accuracy": 0.1631627470254898,
|
|
"num_tokens": 18242621.0,
|
|
"step": 9780
|
|
},
|
|
{
|
|
"entropy": 6.08184962272644,
|
|
"epoch": 0.8640939597315436,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.000493232476371603,
|
|
"loss": 5.9481,
|
|
"mean_token_accuracy": 0.1636492282152176,
|
|
"num_tokens": 18253183.0,
|
|
"step": 9785
|
|
},
|
|
{
|
|
"entropy": 6.007740831375122,
|
|
"epoch": 0.864535499823384,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004932248087146223,
|
|
"loss": 5.7486,
|
|
"mean_token_accuracy": 0.17018086612224578,
|
|
"num_tokens": 18261967.0,
|
|
"step": 9790
|
|
},
|
|
{
|
|
"entropy": 5.943189096450806,
|
|
"epoch": 0.8649770399152243,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004932171367827417,
|
|
"loss": 5.8268,
|
|
"mean_token_accuracy": 0.16698989272117615,
|
|
"num_tokens": 18270902.0,
|
|
"step": 9795
|
|
},
|
|
{
|
|
"entropy": 5.992133235931396,
|
|
"epoch": 0.8654185800070646,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004932094605761115,
|
|
"loss": 5.8579,
|
|
"mean_token_accuracy": 0.17358374893665313,
|
|
"num_tokens": 18279387.0,
|
|
"step": 9800
|
|
},
|
|
{
|
|
"entropy": 5.937605714797973,
|
|
"epoch": 0.865860120098905,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004932017800948821,
|
|
"loss": 5.6603,
|
|
"mean_token_accuracy": 0.1796998143196106,
|
|
"num_tokens": 18288363.0,
|
|
"step": 9805
|
|
},
|
|
{
|
|
"entropy": 6.07945613861084,
|
|
"epoch": 0.8663016601907453,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004931940953392038,
|
|
"loss": 5.9728,
|
|
"mean_token_accuracy": 0.15800556987524034,
|
|
"num_tokens": 18297398.0,
|
|
"step": 9810
|
|
},
|
|
{
|
|
"entropy": 6.066021203994751,
|
|
"epoch": 0.8667432002825857,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004931864063092274,
|
|
"loss": 5.8072,
|
|
"mean_token_accuracy": 0.1684715837240219,
|
|
"num_tokens": 18306230.0,
|
|
"step": 9815
|
|
},
|
|
{
|
|
"entropy": 6.009411478042603,
|
|
"epoch": 0.8671847403744259,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 0.0004931787130051034,
|
|
"loss": 5.9412,
|
|
"mean_token_accuracy": 0.1657043479382992,
|
|
"num_tokens": 18316395.0,
|
|
"step": 9820
|
|
},
|
|
{
|
|
"entropy": 5.963434362411499,
|
|
"epoch": 0.8676262804662663,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004931710154269824,
|
|
"loss": 5.8386,
|
|
"mean_token_accuracy": 0.1774977833032608,
|
|
"num_tokens": 18326020.0,
|
|
"step": 9825
|
|
},
|
|
{
|
|
"entropy": 5.950678396224975,
|
|
"epoch": 0.8680678205581067,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004931633135750154,
|
|
"loss": 5.8482,
|
|
"mean_token_accuracy": 0.1690017983317375,
|
|
"num_tokens": 18335320.0,
|
|
"step": 9830
|
|
},
|
|
{
|
|
"entropy": 6.013969230651855,
|
|
"epoch": 0.868509360649947,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004931556074493531,
|
|
"loss": 5.8672,
|
|
"mean_token_accuracy": 0.1664042592048645,
|
|
"num_tokens": 18345137.0,
|
|
"step": 9835
|
|
},
|
|
{
|
|
"entropy": 6.006626081466675,
|
|
"epoch": 0.8689509007417874,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004931478970501465,
|
|
"loss": 5.8628,
|
|
"mean_token_accuracy": 0.1689472183585167,
|
|
"num_tokens": 18354289.0,
|
|
"step": 9840
|
|
},
|
|
{
|
|
"entropy": 6.098352861404419,
|
|
"epoch": 0.8693924408336277,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004931401823775466,
|
|
"loss": 5.9391,
|
|
"mean_token_accuracy": 0.15878856778144837,
|
|
"num_tokens": 18363701.0,
|
|
"step": 9845
|
|
},
|
|
{
|
|
"entropy": 6.120831251144409,
|
|
"epoch": 0.869833980925468,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004931324634317047,
|
|
"loss": 5.8467,
|
|
"mean_token_accuracy": 0.16003623902797698,
|
|
"num_tokens": 18373336.0,
|
|
"step": 9850
|
|
},
|
|
{
|
|
"entropy": 6.053362512588501,
|
|
"epoch": 0.8702755210173083,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004931247402127719,
|
|
"loss": 5.8407,
|
|
"mean_token_accuracy": 0.16822702288627625,
|
|
"num_tokens": 18383233.0,
|
|
"step": 9855
|
|
},
|
|
{
|
|
"entropy": 6.035002946853638,
|
|
"epoch": 0.8707170611091487,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004931170127208995,
|
|
"loss": 5.8865,
|
|
"mean_token_accuracy": 0.16325117498636246,
|
|
"num_tokens": 18393214.0,
|
|
"step": 9860
|
|
},
|
|
{
|
|
"entropy": 5.987590885162353,
|
|
"epoch": 0.8711586012009891,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0004931092809562388,
|
|
"loss": 5.7788,
|
|
"mean_token_accuracy": 0.1711319714784622,
|
|
"num_tokens": 18401773.0,
|
|
"step": 9865
|
|
},
|
|
{
|
|
"entropy": 5.970164728164673,
|
|
"epoch": 0.8716001412928294,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004931015449189414,
|
|
"loss": 5.828,
|
|
"mean_token_accuracy": 0.16339541971683502,
|
|
"num_tokens": 18411748.0,
|
|
"step": 9870
|
|
},
|
|
{
|
|
"entropy": 6.00556640625,
|
|
"epoch": 0.8720416813846698,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0004930938046091587,
|
|
"loss": 5.8756,
|
|
"mean_token_accuracy": 0.16804593652486802,
|
|
"num_tokens": 18421399.0,
|
|
"step": 9875
|
|
},
|
|
{
|
|
"entropy": 6.005237674713134,
|
|
"epoch": 0.87248322147651,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.0004930860600270425,
|
|
"loss": 5.8802,
|
|
"mean_token_accuracy": 0.16329103857278823,
|
|
"num_tokens": 18430052.0,
|
|
"step": 9880
|
|
},
|
|
{
|
|
"entropy": 6.0280194759368895,
|
|
"epoch": 0.8729247615683504,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 0.0004930783111727443,
|
|
"loss": 5.7106,
|
|
"mean_token_accuracy": 0.1797136604785919,
|
|
"num_tokens": 18438268.0,
|
|
"step": 9885
|
|
},
|
|
{
|
|
"entropy": 6.017422676086426,
|
|
"epoch": 0.8733663016601907,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.000493070558046416,
|
|
"loss": 5.8469,
|
|
"mean_token_accuracy": 0.16759492307901383,
|
|
"num_tokens": 18446964.0,
|
|
"step": 9890
|
|
},
|
|
{
|
|
"entropy": 6.02519154548645,
|
|
"epoch": 0.8738078417520311,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 0.0004930628006482097,
|
|
"loss": 5.842,
|
|
"mean_token_accuracy": 0.1586953066289425,
|
|
"num_tokens": 18455701.0,
|
|
"step": 9895
|
|
},
|
|
{
|
|
"entropy": 5.978132152557373,
|
|
"epoch": 0.8742493818438715,
|
|
"grad_norm": 2.765625,
|
|
"learning_rate": 0.0004930550389782769,
|
|
"loss": 5.8038,
|
|
"mean_token_accuracy": 0.16593412458896636,
|
|
"num_tokens": 18465635.0,
|
|
"step": 9900
|
|
},
|
|
{
|
|
"entropy": 5.958258533477784,
|
|
"epoch": 0.8746909219357117,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004930472730367701,
|
|
"loss": 5.7267,
|
|
"mean_token_accuracy": 0.16820934116840364,
|
|
"num_tokens": 18474833.0,
|
|
"step": 9905
|
|
},
|
|
{
|
|
"entropy": 6.048409032821655,
|
|
"epoch": 0.8751324620275521,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004930395028238412,
|
|
"loss": 5.8346,
|
|
"mean_token_accuracy": 0.16550887376070023,
|
|
"num_tokens": 18482987.0,
|
|
"step": 9910
|
|
},
|
|
{
|
|
"entropy": 5.955231666564941,
|
|
"epoch": 0.8755740021193924,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004930317283396423,
|
|
"loss": 5.7227,
|
|
"mean_token_accuracy": 0.1746671810746193,
|
|
"num_tokens": 18491274.0,
|
|
"step": 9915
|
|
},
|
|
{
|
|
"entropy": 5.987809944152832,
|
|
"epoch": 0.8760155422112328,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.000493023949584326,
|
|
"loss": 5.8937,
|
|
"mean_token_accuracy": 0.16021096110343933,
|
|
"num_tokens": 18500811.0,
|
|
"step": 9920
|
|
},
|
|
{
|
|
"entropy": 5.892692232131958,
|
|
"epoch": 0.8764570823030731,
|
|
"grad_norm": 3.015625,
|
|
"learning_rate": 0.0004930161665580445,
|
|
"loss": 5.6721,
|
|
"mean_token_accuracy": 0.18408720344305038,
|
|
"num_tokens": 18509475.0,
|
|
"step": 9925
|
|
},
|
|
{
|
|
"entropy": 6.078322219848633,
|
|
"epoch": 0.8768986223949135,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.0004930083792609502,
|
|
"loss": 5.8238,
|
|
"mean_token_accuracy": 0.16401641070842743,
|
|
"num_tokens": 18519410.0,
|
|
"step": 9930
|
|
},
|
|
{
|
|
"entropy": 5.8636486530303955,
|
|
"epoch": 0.8773401624867538,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 0.0004930005876931958,
|
|
"loss": 5.6611,
|
|
"mean_token_accuracy": 0.18206960707902908,
|
|
"num_tokens": 18529194.0,
|
|
"step": 9935
|
|
},
|
|
{
|
|
"entropy": 6.004857063293457,
|
|
"epoch": 0.8777817025785941,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004929927918549339,
|
|
"loss": 5.8735,
|
|
"mean_token_accuracy": 0.15991197973489762,
|
|
"num_tokens": 18539166.0,
|
|
"step": 9940
|
|
},
|
|
{
|
|
"entropy": 5.99868106842041,
|
|
"epoch": 0.8782232426704345,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004929849917463171,
|
|
"loss": 5.7955,
|
|
"mean_token_accuracy": 0.17093802392482757,
|
|
"num_tokens": 18548166.0,
|
|
"step": 9945
|
|
},
|
|
{
|
|
"entropy": 5.960470199584961,
|
|
"epoch": 0.8786647827622748,
|
|
"grad_norm": 2.46875,
|
|
"learning_rate": 0.0004929771873674984,
|
|
"loss": 5.8056,
|
|
"mean_token_accuracy": 0.16528980284929276,
|
|
"num_tokens": 18557041.0,
|
|
"step": 9950
|
|
},
|
|
{
|
|
"entropy": 5.992664957046509,
|
|
"epoch": 0.8791063228541152,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004929693787186305,
|
|
"loss": 5.8483,
|
|
"mean_token_accuracy": 0.1624258577823639,
|
|
"num_tokens": 18566461.0,
|
|
"step": 9955
|
|
},
|
|
{
|
|
"entropy": 5.950482988357544,
|
|
"epoch": 0.8795478629459554,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 0.0004929615657998664,
|
|
"loss": 5.7632,
|
|
"mean_token_accuracy": 0.1686630889773369,
|
|
"num_tokens": 18576334.0,
|
|
"step": 9960
|
|
},
|
|
{
|
|
"entropy": 5.985661172866822,
|
|
"epoch": 0.8799894030377958,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004929537486113593,
|
|
"loss": 5.7676,
|
|
"mean_token_accuracy": 0.16868792176246644,
|
|
"num_tokens": 18584882.0,
|
|
"step": 9965
|
|
},
|
|
{
|
|
"entropy": 6.04607048034668,
|
|
"epoch": 0.8804309431296362,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004929459271532621,
|
|
"loss": 5.8249,
|
|
"mean_token_accuracy": 0.17223169654607773,
|
|
"num_tokens": 18593669.0,
|
|
"step": 9970
|
|
},
|
|
{
|
|
"entropy": 6.0269121646881105,
|
|
"epoch": 0.8808724832214765,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004929381014257282,
|
|
"loss": 5.8604,
|
|
"mean_token_accuracy": 0.16619434505701064,
|
|
"num_tokens": 18603357.0,
|
|
"step": 9975
|
|
},
|
|
{
|
|
"entropy": 6.008830833435058,
|
|
"epoch": 0.8813140233133169,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004929302714289107,
|
|
"loss": 5.7762,
|
|
"mean_token_accuracy": 0.17586284428834914,
|
|
"num_tokens": 18611856.0,
|
|
"step": 9980
|
|
},
|
|
{
|
|
"entropy": 5.925434350967407,
|
|
"epoch": 0.8817555634051572,
|
|
"grad_norm": 3.109375,
|
|
"learning_rate": 0.0004929224371629634,
|
|
"loss": 5.8302,
|
|
"mean_token_accuracy": 0.1711333855986595,
|
|
"num_tokens": 18620500.0,
|
|
"step": 9985
|
|
},
|
|
{
|
|
"entropy": 5.9265233993530275,
|
|
"epoch": 0.8821971034969975,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004929145986280394,
|
|
"loss": 5.7739,
|
|
"mean_token_accuracy": 0.17708952575922013,
|
|
"num_tokens": 18628836.0,
|
|
"step": 9990
|
|
},
|
|
{
|
|
"entropy": 5.947731924057007,
|
|
"epoch": 0.8826386435888378,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004929067558242922,
|
|
"loss": 5.7026,
|
|
"mean_token_accuracy": 0.17609767168760299,
|
|
"num_tokens": 18636831.0,
|
|
"step": 9995
|
|
},
|
|
{
|
|
"entropy": 5.9934173107147215,
|
|
"epoch": 0.8830801836806782,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0004928989087518758,
|
|
"loss": 5.8545,
|
|
"mean_token_accuracy": 0.17121524959802628,
|
|
"num_tokens": 18646327.0,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"entropy": 5.985447359085083,
|
|
"epoch": 0.8835217237725186,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004928910574109435,
|
|
"loss": 5.8655,
|
|
"mean_token_accuracy": 0.16686370521783828,
|
|
"num_tokens": 18656125.0,
|
|
"step": 10005
|
|
},
|
|
{
|
|
"entropy": 6.0014176845550535,
|
|
"epoch": 0.8839632638643589,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.0004928832018016495,
|
|
"loss": 5.8732,
|
|
"mean_token_accuracy": 0.16861837357282639,
|
|
"num_tokens": 18665853.0,
|
|
"step": 10010
|
|
},
|
|
{
|
|
"entropy": 6.076805973052979,
|
|
"epoch": 0.8844048039561992,
|
|
"grad_norm": 2.75,
|
|
"learning_rate": 0.0004928753419241472,
|
|
"loss": 5.9193,
|
|
"mean_token_accuracy": 0.15759100615978242,
|
|
"num_tokens": 18676674.0,
|
|
"step": 10015
|
|
},
|
|
{
|
|
"entropy": 6.0625269412994385,
|
|
"epoch": 0.8848463440480395,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.0004928674777785912,
|
|
"loss": 5.749,
|
|
"mean_token_accuracy": 0.17859821617603303,
|
|
"num_tokens": 18685535.0,
|
|
"step": 10020
|
|
},
|
|
{
|
|
"entropy": 5.986581325531006,
|
|
"epoch": 0.8852878841398799,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004928596093651351,
|
|
"loss": 5.7937,
|
|
"mean_token_accuracy": 0.16889239251613616,
|
|
"num_tokens": 18694186.0,
|
|
"step": 10025
|
|
},
|
|
{
|
|
"entropy": 5.949480724334717,
|
|
"epoch": 0.8857294242317202,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.000492851736683933,
|
|
"loss": 5.7872,
|
|
"mean_token_accuracy": 0.17278547137975692,
|
|
"num_tokens": 18703061.0,
|
|
"step": 10030
|
|
},
|
|
{
|
|
"entropy": 6.004718446731568,
|
|
"epoch": 0.8861709643235606,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004928438597351395,
|
|
"loss": 5.7324,
|
|
"mean_token_accuracy": 0.17449625134468078,
|
|
"num_tokens": 18712466.0,
|
|
"step": 10035
|
|
},
|
|
{
|
|
"entropy": 5.996918392181397,
|
|
"epoch": 0.886612504415401,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004928359785189086,
|
|
"loss": 5.848,
|
|
"mean_token_accuracy": 0.16393670737743377,
|
|
"num_tokens": 18721522.0,
|
|
"step": 10040
|
|
},
|
|
{
|
|
"entropy": 5.895922231674194,
|
|
"epoch": 0.8870540445072412,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004928280930353948,
|
|
"loss": 5.7844,
|
|
"mean_token_accuracy": 0.168227019906044,
|
|
"num_tokens": 18730488.0,
|
|
"step": 10045
|
|
},
|
|
{
|
|
"entropy": 5.964899063110352,
|
|
"epoch": 0.8874955845990816,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004928202032847525,
|
|
"loss": 5.7105,
|
|
"mean_token_accuracy": 0.17312824875116348,
|
|
"num_tokens": 18740318.0,
|
|
"step": 10050
|
|
},
|
|
{
|
|
"entropy": 5.997185611724854,
|
|
"epoch": 0.8879371246909219,
|
|
"grad_norm": 13.0625,
|
|
"learning_rate": 0.0004928123092671362,
|
|
"loss": 5.8393,
|
|
"mean_token_accuracy": 0.16409815549850465,
|
|
"num_tokens": 18748731.0,
|
|
"step": 10055
|
|
},
|
|
{
|
|
"entropy": 5.89947476387024,
|
|
"epoch": 0.8883786647827623,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004928044109827008,
|
|
"loss": 5.7635,
|
|
"mean_token_accuracy": 0.1718330979347229,
|
|
"num_tokens": 18757847.0,
|
|
"step": 10060
|
|
},
|
|
{
|
|
"entropy": 6.0889490127563475,
|
|
"epoch": 0.8888202048746026,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004927965084316007,
|
|
"loss": 5.8568,
|
|
"mean_token_accuracy": 0.15744939744472503,
|
|
"num_tokens": 18767429.0,
|
|
"step": 10065
|
|
},
|
|
{
|
|
"entropy": 6.042394828796387,
|
|
"epoch": 0.889261744966443,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 0.000492788601613991,
|
|
"loss": 5.773,
|
|
"mean_token_accuracy": 0.1722065582871437,
|
|
"num_tokens": 18776652.0,
|
|
"step": 10070
|
|
},
|
|
{
|
|
"entropy": 6.038338375091553,
|
|
"epoch": 0.8897032850582833,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004927806905300265,
|
|
"loss": 5.8307,
|
|
"mean_token_accuracy": 0.16547014862298964,
|
|
"num_tokens": 18785687.0,
|
|
"step": 10075
|
|
},
|
|
{
|
|
"entropy": 5.9673412322998045,
|
|
"epoch": 0.8901448251501236,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.000492772775179862,
|
|
"loss": 5.867,
|
|
"mean_token_accuracy": 0.1659177601337433,
|
|
"num_tokens": 18794924.0,
|
|
"step": 10080
|
|
},
|
|
{
|
|
"entropy": 5.940322780609131,
|
|
"epoch": 0.890586365241964,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004927648555636528,
|
|
"loss": 5.7223,
|
|
"mean_token_accuracy": 0.18146456331014632,
|
|
"num_tokens": 18803359.0,
|
|
"step": 10085
|
|
},
|
|
{
|
|
"entropy": 6.010080289840698,
|
|
"epoch": 0.8910279053338043,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004927569316815539,
|
|
"loss": 5.9499,
|
|
"mean_token_accuracy": 0.1580890327692032,
|
|
"num_tokens": 18813496.0,
|
|
"step": 10090
|
|
},
|
|
{
|
|
"entropy": 6.023469352722168,
|
|
"epoch": 0.8914694454256447,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0004927490035337205,
|
|
"loss": 5.7941,
|
|
"mean_token_accuracy": 0.16975688487291335,
|
|
"num_tokens": 18823274.0,
|
|
"step": 10095
|
|
},
|
|
{
|
|
"entropy": 5.9998383045196535,
|
|
"epoch": 0.8919109855174849,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.000492741071120308,
|
|
"loss": 5.8082,
|
|
"mean_token_accuracy": 0.16879395693540572,
|
|
"num_tokens": 18832436.0,
|
|
"step": 10100
|
|
},
|
|
{
|
|
"entropy": 6.113572883605957,
|
|
"epoch": 0.8923525256093253,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0004927331344414717,
|
|
"loss": 5.8595,
|
|
"mean_token_accuracy": 0.16608996838331222,
|
|
"num_tokens": 18841923.0,
|
|
"step": 10105
|
|
},
|
|
{
|
|
"entropy": 6.045321798324585,
|
|
"epoch": 0.8927940657011657,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004927251934973672,
|
|
"loss": 5.8983,
|
|
"mean_token_accuracy": 0.1701177567243576,
|
|
"num_tokens": 18850957.0,
|
|
"step": 10110
|
|
},
|
|
{
|
|
"entropy": 6.070820665359497,
|
|
"epoch": 0.893235605793006,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004927172482881499,
|
|
"loss": 5.8935,
|
|
"mean_token_accuracy": 0.1602569743990898,
|
|
"num_tokens": 18859622.0,
|
|
"step": 10115
|
|
},
|
|
{
|
|
"entropy": 6.068447256088257,
|
|
"epoch": 0.8936771458848464,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004927092988139755,
|
|
"loss": 5.8756,
|
|
"mean_token_accuracy": 0.163049054145813,
|
|
"num_tokens": 18868555.0,
|
|
"step": 10120
|
|
},
|
|
{
|
|
"entropy": 6.048940849304199,
|
|
"epoch": 0.8941186859766866,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004927013450749999,
|
|
"loss": 5.8479,
|
|
"mean_token_accuracy": 0.16745685636997223,
|
|
"num_tokens": 18877445.0,
|
|
"step": 10125
|
|
},
|
|
{
|
|
"entropy": 5.962615346908569,
|
|
"epoch": 0.894560226068527,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004926933870713787,
|
|
"loss": 5.7496,
|
|
"mean_token_accuracy": 0.17192499935626984,
|
|
"num_tokens": 18886386.0,
|
|
"step": 10130
|
|
},
|
|
{
|
|
"entropy": 6.001466941833496,
|
|
"epoch": 0.8950017661603673,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004926854248032678,
|
|
"loss": 5.8533,
|
|
"mean_token_accuracy": 0.16207423508167268,
|
|
"num_tokens": 18895016.0,
|
|
"step": 10135
|
|
},
|
|
{
|
|
"entropy": 5.983141565322876,
|
|
"epoch": 0.8954433062522077,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004926774582708234,
|
|
"loss": 5.7258,
|
|
"mean_token_accuracy": 0.17000181078910828,
|
|
"num_tokens": 18904126.0,
|
|
"step": 10140
|
|
},
|
|
{
|
|
"entropy": 6.032631301879883,
|
|
"epoch": 0.8958848463440481,
|
|
"grad_norm": 4.75,
|
|
"learning_rate": 0.0004926694874742012,
|
|
"loss": 5.794,
|
|
"mean_token_accuracy": 0.1646634042263031,
|
|
"num_tokens": 18913064.0,
|
|
"step": 10145
|
|
},
|
|
{
|
|
"entropy": 6.034622526168823,
|
|
"epoch": 0.8963263864358884,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004926615124135577,
|
|
"loss": 5.8985,
|
|
"mean_token_accuracy": 0.1604196771979332,
|
|
"num_tokens": 18921189.0,
|
|
"step": 10150
|
|
},
|
|
{
|
|
"entropy": 5.966013050079345,
|
|
"epoch": 0.8967679265277287,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 0.0004926535330890488,
|
|
"loss": 5.8128,
|
|
"mean_token_accuracy": 0.17230157554149628,
|
|
"num_tokens": 18930818.0,
|
|
"step": 10155
|
|
},
|
|
{
|
|
"entropy": 6.057120084762573,
|
|
"epoch": 0.897209466619569,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004926455495008311,
|
|
"loss": 5.8235,
|
|
"mean_token_accuracy": 0.1639510676264763,
|
|
"num_tokens": 18940376.0,
|
|
"step": 10160
|
|
},
|
|
{
|
|
"entropy": 6.042151069641113,
|
|
"epoch": 0.8976510067114094,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004926375616490608,
|
|
"loss": 5.8589,
|
|
"mean_token_accuracy": 0.16974166482686998,
|
|
"num_tokens": 18949418.0,
|
|
"step": 10165
|
|
},
|
|
{
|
|
"entropy": 6.0127623081207275,
|
|
"epoch": 0.8980925468032497,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004926295695338945,
|
|
"loss": 5.8013,
|
|
"mean_token_accuracy": 0.1742484077811241,
|
|
"num_tokens": 18959315.0,
|
|
"step": 10170
|
|
},
|
|
{
|
|
"entropy": 5.948454856872559,
|
|
"epoch": 0.8985340868950901,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0004926215731554887,
|
|
"loss": 5.7142,
|
|
"mean_token_accuracy": 0.17966544330120088,
|
|
"num_tokens": 18968336.0,
|
|
"step": 10175
|
|
},
|
|
{
|
|
"entropy": 6.029808568954468,
|
|
"epoch": 0.8989756269869305,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.000492613572514,
|
|
"loss": 5.7844,
|
|
"mean_token_accuracy": 0.17160264104604722,
|
|
"num_tokens": 18977123.0,
|
|
"step": 10180
|
|
},
|
|
{
|
|
"entropy": 6.027696847915649,
|
|
"epoch": 0.8994171670787707,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0004926055676095851,
|
|
"loss": 5.9058,
|
|
"mean_token_accuracy": 0.16375845670700073,
|
|
"num_tokens": 18986150.0,
|
|
"step": 10185
|
|
},
|
|
{
|
|
"entropy": 6.090052270889283,
|
|
"epoch": 0.8998587071706111,
|
|
"grad_norm": 2.765625,
|
|
"learning_rate": 0.0004925975584424012,
|
|
"loss": 5.9086,
|
|
"mean_token_accuracy": 0.16778819710016252,
|
|
"num_tokens": 18996010.0,
|
|
"step": 10190
|
|
},
|
|
{
|
|
"entropy": 6.057377338409424,
|
|
"epoch": 0.9003002472624514,
|
|
"grad_norm": 2.8125,
|
|
"learning_rate": 0.0004925895450126046,
|
|
"loss": 5.8869,
|
|
"mean_token_accuracy": 0.1585620239377022,
|
|
"num_tokens": 19005100.0,
|
|
"step": 10195
|
|
},
|
|
{
|
|
"entropy": 5.996817922592163,
|
|
"epoch": 0.9007417873542918,
|
|
"grad_norm": 2.65625,
|
|
"learning_rate": 0.0004925815273203526,
|
|
"loss": 5.7848,
|
|
"mean_token_accuracy": 0.17036335468292235,
|
|
"num_tokens": 19013844.0,
|
|
"step": 10200
|
|
},
|
|
{
|
|
"entropy": 6.0277472019195555,
|
|
"epoch": 0.901183327446132,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.0004925735053658022,
|
|
"loss": 5.9292,
|
|
"mean_token_accuracy": 0.16351503431797026,
|
|
"num_tokens": 19024218.0,
|
|
"step": 10205
|
|
},
|
|
{
|
|
"entropy": 6.083570337295532,
|
|
"epoch": 0.9016248675379724,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004925654791491106,
|
|
"loss": 5.8506,
|
|
"mean_token_accuracy": 0.1659012630581856,
|
|
"num_tokens": 19034725.0,
|
|
"step": 10210
|
|
},
|
|
{
|
|
"entropy": 6.1107401847839355,
|
|
"epoch": 0.9020664076298128,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004925574486704351,
|
|
"loss": 5.8669,
|
|
"mean_token_accuracy": 0.16051332503557206,
|
|
"num_tokens": 19044597.0,
|
|
"step": 10215
|
|
},
|
|
{
|
|
"entropy": 6.169135284423828,
|
|
"epoch": 0.9025079477216531,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 0.0004925494139299329,
|
|
"loss": 6.0053,
|
|
"mean_token_accuracy": 0.15389579087495803,
|
|
"num_tokens": 19055124.0,
|
|
"step": 10220
|
|
},
|
|
{
|
|
"entropy": 6.02846851348877,
|
|
"epoch": 0.9029494878134935,
|
|
"grad_norm": 2.953125,
|
|
"learning_rate": 0.0004925413749277613,
|
|
"loss": 5.7504,
|
|
"mean_token_accuracy": 0.17126532644033432,
|
|
"num_tokens": 19064516.0,
|
|
"step": 10225
|
|
},
|
|
{
|
|
"entropy": 5.943942546844482,
|
|
"epoch": 0.9033910279053338,
|
|
"grad_norm": 2.609375,
|
|
"learning_rate": 0.0004925333316640779,
|
|
"loss": 5.8318,
|
|
"mean_token_accuracy": 0.16919440776109695,
|
|
"num_tokens": 19072926.0,
|
|
"step": 10230
|
|
},
|
|
{
|
|
"entropy": 5.965893793106079,
|
|
"epoch": 0.9038325679971742,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004925252841390403,
|
|
"loss": 5.8513,
|
|
"mean_token_accuracy": 0.16273195445537567,
|
|
"num_tokens": 19082313.0,
|
|
"step": 10235
|
|
},
|
|
{
|
|
"entropy": 5.933097791671753,
|
|
"epoch": 0.9042741080890144,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004925172323528061,
|
|
"loss": 5.807,
|
|
"mean_token_accuracy": 0.1742863968014717,
|
|
"num_tokens": 19091891.0,
|
|
"step": 10240
|
|
},
|
|
{
|
|
"entropy": 6.075602293014526,
|
|
"epoch": 0.9047156481808548,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.000492509176305533,
|
|
"loss": 5.8499,
|
|
"mean_token_accuracy": 0.16614402830600739,
|
|
"num_tokens": 19100311.0,
|
|
"step": 10245
|
|
},
|
|
{
|
|
"entropy": 6.047270584106445,
|
|
"epoch": 0.9051571882726952,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 0.0004925011159973788,
|
|
"loss": 5.8418,
|
|
"mean_token_accuracy": 0.16825296431779863,
|
|
"num_tokens": 19110463.0,
|
|
"step": 10250
|
|
},
|
|
{
|
|
"entropy": 6.04560227394104,
|
|
"epoch": 0.9055987283645355,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 0.0004924930514285015,
|
|
"loss": 5.8287,
|
|
"mean_token_accuracy": 0.16916860342025758,
|
|
"num_tokens": 19120723.0,
|
|
"step": 10255
|
|
},
|
|
{
|
|
"entropy": 5.956229400634766,
|
|
"epoch": 0.9060402684563759,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.000492484982599059,
|
|
"loss": 5.8134,
|
|
"mean_token_accuracy": 0.16655978858470916,
|
|
"num_tokens": 19130259.0,
|
|
"step": 10260
|
|
},
|
|
{
|
|
"entropy": 6.0474998474121096,
|
|
"epoch": 0.9064818085482161,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004924769095092094,
|
|
"loss": 5.8794,
|
|
"mean_token_accuracy": 0.16543149054050446,
|
|
"num_tokens": 19140256.0,
|
|
"step": 10265
|
|
},
|
|
{
|
|
"entropy": 5.9884788513183596,
|
|
"epoch": 0.9069233486400565,
|
|
"grad_norm": 3.21875,
|
|
"learning_rate": 0.0004924688321591109,
|
|
"loss": 5.8728,
|
|
"mean_token_accuracy": 0.1729671910405159,
|
|
"num_tokens": 19149165.0,
|
|
"step": 10270
|
|
},
|
|
{
|
|
"entropy": 5.965931797027588,
|
|
"epoch": 0.9073648887318968,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004924607505489215,
|
|
"loss": 5.8013,
|
|
"mean_token_accuracy": 0.17080118060111998,
|
|
"num_tokens": 19159254.0,
|
|
"step": 10275
|
|
},
|
|
{
|
|
"entropy": 6.103870820999146,
|
|
"epoch": 0.9078064288237372,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004924526646787998,
|
|
"loss": 5.8222,
|
|
"mean_token_accuracy": 0.16772038340568543,
|
|
"num_tokens": 19168166.0,
|
|
"step": 10280
|
|
},
|
|
{
|
|
"entropy": 6.075332069396973,
|
|
"epoch": 0.9082479689155776,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.000492444574548904,
|
|
"loss": 5.8405,
|
|
"mean_token_accuracy": 0.16294437795877456,
|
|
"num_tokens": 19176887.0,
|
|
"step": 10285
|
|
},
|
|
{
|
|
"entropy": 5.963015031814575,
|
|
"epoch": 0.9086895090074179,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004924364801593927,
|
|
"loss": 5.7515,
|
|
"mean_token_accuracy": 0.17720825374126434,
|
|
"num_tokens": 19185510.0,
|
|
"step": 10290
|
|
},
|
|
{
|
|
"entropy": 5.904071235656739,
|
|
"epoch": 0.9091310490992582,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0004924283815104243,
|
|
"loss": 5.7939,
|
|
"mean_token_accuracy": 0.17455934584140778,
|
|
"num_tokens": 19195459.0,
|
|
"step": 10295
|
|
},
|
|
{
|
|
"entropy": 6.018945026397705,
|
|
"epoch": 0.9095725891910985,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004924202786021578,
|
|
"loss": 5.821,
|
|
"mean_token_accuracy": 0.16881232410669328,
|
|
"num_tokens": 19204590.0,
|
|
"step": 10300
|
|
},
|
|
{
|
|
"entropy": 5.987010765075683,
|
|
"epoch": 0.9100141292829389,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.0004924121714347515,
|
|
"loss": 5.7679,
|
|
"mean_token_accuracy": 0.1634682223200798,
|
|
"num_tokens": 19213302.0,
|
|
"step": 10305
|
|
},
|
|
{
|
|
"entropy": 5.972708415985108,
|
|
"epoch": 0.9104556693747792,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.0004924040600083644,
|
|
"loss": 5.8845,
|
|
"mean_token_accuracy": 0.16732324361801149,
|
|
"num_tokens": 19223172.0,
|
|
"step": 10310
|
|
},
|
|
{
|
|
"entropy": 6.043222093582154,
|
|
"epoch": 0.9108972094666196,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004923959443231554,
|
|
"loss": 5.7881,
|
|
"mean_token_accuracy": 0.16711824387311935,
|
|
"num_tokens": 19232234.0,
|
|
"step": 10315
|
|
},
|
|
{
|
|
"entropy": 5.945252847671509,
|
|
"epoch": 0.91133874955846,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004923878243792836,
|
|
"loss": 5.7717,
|
|
"mean_token_accuracy": 0.17137450724840164,
|
|
"num_tokens": 19241070.0,
|
|
"step": 10320
|
|
},
|
|
{
|
|
"entropy": 5.989360427856445,
|
|
"epoch": 0.9117802896503002,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004923797001769079,
|
|
"loss": 5.8564,
|
|
"mean_token_accuracy": 0.16664954423904418,
|
|
"num_tokens": 19251094.0,
|
|
"step": 10325
|
|
},
|
|
{
|
|
"entropy": 6.135943174362183,
|
|
"epoch": 0.9122218297421406,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004923715717161875,
|
|
"loss": 5.9542,
|
|
"mean_token_accuracy": 0.15768681317567826,
|
|
"num_tokens": 19260588.0,
|
|
"step": 10330
|
|
},
|
|
{
|
|
"entropy": 6.101527309417724,
|
|
"epoch": 0.9126633698339809,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004923634389972817,
|
|
"loss": 5.9774,
|
|
"mean_token_accuracy": 0.15897405296564102,
|
|
"num_tokens": 19270687.0,
|
|
"step": 10335
|
|
},
|
|
{
|
|
"entropy": 5.965734195709229,
|
|
"epoch": 0.9131049099258213,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004923553020203497,
|
|
"loss": 5.7596,
|
|
"mean_token_accuracy": 0.1712796226143837,
|
|
"num_tokens": 19278739.0,
|
|
"step": 10340
|
|
},
|
|
{
|
|
"entropy": 6.115788078308105,
|
|
"epoch": 0.9135464500176615,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004923471607855511,
|
|
"loss": 5.9297,
|
|
"mean_token_accuracy": 0.1538059964776039,
|
|
"num_tokens": 19289184.0,
|
|
"step": 10345
|
|
},
|
|
{
|
|
"entropy": 5.970717334747315,
|
|
"epoch": 0.9139879901095019,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.000492339015293045,
|
|
"loss": 5.7746,
|
|
"mean_token_accuracy": 0.1836891993880272,
|
|
"num_tokens": 19298390.0,
|
|
"step": 10350
|
|
},
|
|
{
|
|
"entropy": 5.948292970657349,
|
|
"epoch": 0.9144295302013423,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0004923308655429913,
|
|
"loss": 5.7355,
|
|
"mean_token_accuracy": 0.17486001551151276,
|
|
"num_tokens": 19307227.0,
|
|
"step": 10355
|
|
},
|
|
{
|
|
"entropy": 6.03751015663147,
|
|
"epoch": 0.9148710702931826,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004923227115355495,
|
|
"loss": 5.7478,
|
|
"mean_token_accuracy": 0.16704521477222442,
|
|
"num_tokens": 19317701.0,
|
|
"step": 10360
|
|
},
|
|
{
|
|
"entropy": 6.031579113006591,
|
|
"epoch": 0.915312610385023,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0004923145532708794,
|
|
"loss": 5.8614,
|
|
"mean_token_accuracy": 0.16852820217609404,
|
|
"num_tokens": 19326983.0,
|
|
"step": 10365
|
|
},
|
|
{
|
|
"entropy": 6.007206058502197,
|
|
"epoch": 0.9157541504768633,
|
|
"grad_norm": 2.71875,
|
|
"learning_rate": 0.0004923063907491408,
|
|
"loss": 5.8839,
|
|
"mean_token_accuracy": 0.16169315129518508,
|
|
"num_tokens": 19337157.0,
|
|
"step": 10370
|
|
},
|
|
{
|
|
"entropy": 6.0474934577941895,
|
|
"epoch": 0.9161956905687036,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004922982239704936,
|
|
"loss": 5.9639,
|
|
"mean_token_accuracy": 0.16408284157514572,
|
|
"num_tokens": 19346845.0,
|
|
"step": 10375
|
|
},
|
|
{
|
|
"entropy": 6.107244873046875,
|
|
"epoch": 0.9166372306605439,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004922900529350978,
|
|
"loss": 5.8326,
|
|
"mean_token_accuracy": 0.17033075243234636,
|
|
"num_tokens": 19356905.0,
|
|
"step": 10380
|
|
},
|
|
{
|
|
"entropy": 6.042671203613281,
|
|
"epoch": 0.9170787707523843,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004922818776431134,
|
|
"loss": 5.8214,
|
|
"mean_token_accuracy": 0.15983829498291016,
|
|
"num_tokens": 19366595.0,
|
|
"step": 10385
|
|
},
|
|
{
|
|
"entropy": 6.018132495880127,
|
|
"epoch": 0.9175203108442247,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004922736980947005,
|
|
"loss": 5.8848,
|
|
"mean_token_accuracy": 0.16535525321960448,
|
|
"num_tokens": 19377213.0,
|
|
"step": 10390
|
|
},
|
|
{
|
|
"entropy": 6.001478481292724,
|
|
"epoch": 0.917961850936065,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004922655142900196,
|
|
"loss": 5.742,
|
|
"mean_token_accuracy": 0.16831236183643342,
|
|
"num_tokens": 19385510.0,
|
|
"step": 10395
|
|
},
|
|
{
|
|
"entropy": 5.9540716171264645,
|
|
"epoch": 0.9184033910279054,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004922573262292308,
|
|
"loss": 5.7815,
|
|
"mean_token_accuracy": 0.16358954459428787,
|
|
"num_tokens": 19394505.0,
|
|
"step": 10400
|
|
},
|
|
{
|
|
"entropy": 5.962457704544067,
|
|
"epoch": 0.9188449311197456,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004922491339124945,
|
|
"loss": 5.6794,
|
|
"mean_token_accuracy": 0.17455343455076217,
|
|
"num_tokens": 19403140.0,
|
|
"step": 10405
|
|
},
|
|
{
|
|
"entropy": 5.9841881275177,
|
|
"epoch": 0.919286471211586,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.0004922409373399713,
|
|
"loss": 5.8875,
|
|
"mean_token_accuracy": 0.16283275336027145,
|
|
"num_tokens": 19412709.0,
|
|
"step": 10410
|
|
},
|
|
{
|
|
"entropy": 5.9437761306762695,
|
|
"epoch": 0.9197280113034263,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004922327365118216,
|
|
"loss": 5.7306,
|
|
"mean_token_accuracy": 0.17520128786563874,
|
|
"num_tokens": 19422442.0,
|
|
"step": 10415
|
|
},
|
|
{
|
|
"entropy": 6.013984823226929,
|
|
"epoch": 0.9201695513952667,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004922245314282062,
|
|
"loss": 5.8573,
|
|
"mean_token_accuracy": 0.16567598134279252,
|
|
"num_tokens": 19432153.0,
|
|
"step": 10420
|
|
},
|
|
{
|
|
"entropy": 5.988616323471069,
|
|
"epoch": 0.9206110914871071,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004922163220892858,
|
|
"loss": 5.6832,
|
|
"mean_token_accuracy": 0.1823098748922348,
|
|
"num_tokens": 19440811.0,
|
|
"step": 10425
|
|
},
|
|
{
|
|
"entropy": 5.9555412292480465,
|
|
"epoch": 0.9210526315789473,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.000492208108495221,
|
|
"loss": 5.8588,
|
|
"mean_token_accuracy": 0.16761756986379622,
|
|
"num_tokens": 19451330.0,
|
|
"step": 10430
|
|
},
|
|
{
|
|
"entropy": 5.9230536937713625,
|
|
"epoch": 0.9214941716707877,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.000492199890646173,
|
|
"loss": 5.6983,
|
|
"mean_token_accuracy": 0.1846833571791649,
|
|
"num_tokens": 19459689.0,
|
|
"step": 10435
|
|
},
|
|
{
|
|
"entropy": 6.030665397644043,
|
|
"epoch": 0.921935711762628,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0004921916685423028,
|
|
"loss": 5.7551,
|
|
"mean_token_accuracy": 0.17246931791305542,
|
|
"num_tokens": 19468110.0,
|
|
"step": 10440
|
|
},
|
|
{
|
|
"entropy": 5.983535623550415,
|
|
"epoch": 0.9223772518544684,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004921834421837711,
|
|
"loss": 5.8192,
|
|
"mean_token_accuracy": 0.16404144763946532,
|
|
"num_tokens": 19477190.0,
|
|
"step": 10445
|
|
},
|
|
{
|
|
"entropy": 6.092430591583252,
|
|
"epoch": 0.9228187919463087,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004921752115707394,
|
|
"loss": 5.9195,
|
|
"mean_token_accuracy": 0.16309409886598586,
|
|
"num_tokens": 19485757.0,
|
|
"step": 10450
|
|
},
|
|
{
|
|
"entropy": 6.059690475463867,
|
|
"epoch": 0.9232603320381491,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004921669767033688,
|
|
"loss": 5.8539,
|
|
"mean_token_accuracy": 0.16356213837862016,
|
|
"num_tokens": 19494846.0,
|
|
"step": 10455
|
|
},
|
|
{
|
|
"entropy": 5.986880207061768,
|
|
"epoch": 0.9237018721299894,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004921587375818207,
|
|
"loss": 5.8418,
|
|
"mean_token_accuracy": 0.16628807932138442,
|
|
"num_tokens": 19504307.0,
|
|
"step": 10460
|
|
},
|
|
{
|
|
"entropy": 6.010497951507569,
|
|
"epoch": 0.9241434122218297,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004921504942062564,
|
|
"loss": 5.7781,
|
|
"mean_token_accuracy": 0.16801584959030152,
|
|
"num_tokens": 19512741.0,
|
|
"step": 10465
|
|
},
|
|
{
|
|
"entropy": 5.956787252426148,
|
|
"epoch": 0.9245849523136701,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004921422465768374,
|
|
"loss": 5.6549,
|
|
"mean_token_accuracy": 0.17740724086761475,
|
|
"num_tokens": 19521823.0,
|
|
"step": 10470
|
|
},
|
|
{
|
|
"entropy": 6.001222944259643,
|
|
"epoch": 0.9250264924055104,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004921339946937253,
|
|
"loss": 5.8591,
|
|
"mean_token_accuracy": 0.16669148355722427,
|
|
"num_tokens": 19531048.0,
|
|
"step": 10475
|
|
},
|
|
{
|
|
"entropy": 6.044188165664673,
|
|
"epoch": 0.9254680324973508,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.0004921257385570818,
|
|
"loss": 5.8162,
|
|
"mean_token_accuracy": 0.17120853662490845,
|
|
"num_tokens": 19539543.0,
|
|
"step": 10480
|
|
},
|
|
{
|
|
"entropy": 6.096062421798706,
|
|
"epoch": 0.925909572589191,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004921174781670685,
|
|
"loss": 5.9247,
|
|
"mean_token_accuracy": 0.16498176455497743,
|
|
"num_tokens": 19549242.0,
|
|
"step": 10485
|
|
},
|
|
{
|
|
"entropy": 6.070945692062378,
|
|
"epoch": 0.9263511126810314,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004921092135238473,
|
|
"loss": 5.8422,
|
|
"mean_token_accuracy": 0.1619497060775757,
|
|
"num_tokens": 19558259.0,
|
|
"step": 10490
|
|
},
|
|
{
|
|
"entropy": 6.021557855606079,
|
|
"epoch": 0.9267926527728718,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00049210094462758,
|
|
"loss": 5.8189,
|
|
"mean_token_accuracy": 0.16899777501821517,
|
|
"num_tokens": 19567327.0,
|
|
"step": 10495
|
|
},
|
|
{
|
|
"entropy": 6.036462640762329,
|
|
"epoch": 0.9272341928647121,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.0004920926714784288,
|
|
"loss": 5.855,
|
|
"mean_token_accuracy": 0.16469730883836747,
|
|
"num_tokens": 19576696.0,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"entropy": 6.01403579711914,
|
|
"epoch": 0.9276757329565525,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.0004920843940765555,
|
|
"loss": 5.7437,
|
|
"mean_token_accuracy": 0.17097172141075134,
|
|
"num_tokens": 19585708.0,
|
|
"step": 10505
|
|
},
|
|
{
|
|
"entropy": 6.025144863128662,
|
|
"epoch": 0.9281172730483928,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004920761124221224,
|
|
"loss": 5.8908,
|
|
"mean_token_accuracy": 0.1659647688269615,
|
|
"num_tokens": 19595199.0,
|
|
"step": 10510
|
|
},
|
|
{
|
|
"entropy": 6.00489616394043,
|
|
"epoch": 0.9285588131402331,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004920678265152919,
|
|
"loss": 5.8256,
|
|
"mean_token_accuracy": 0.1605701707303524,
|
|
"num_tokens": 19604781.0,
|
|
"step": 10515
|
|
},
|
|
{
|
|
"entropy": 6.026356172561646,
|
|
"epoch": 0.9290003532320734,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.000492059536356226,
|
|
"loss": 5.7732,
|
|
"mean_token_accuracy": 0.1747516244649887,
|
|
"num_tokens": 19614203.0,
|
|
"step": 10520
|
|
},
|
|
{
|
|
"entropy": 6.019496011734009,
|
|
"epoch": 0.9294418933239138,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004920512419450872,
|
|
"loss": 5.8025,
|
|
"mean_token_accuracy": 0.17150902897119522,
|
|
"num_tokens": 19623664.0,
|
|
"step": 10525
|
|
},
|
|
{
|
|
"entropy": 5.990518093109131,
|
|
"epoch": 0.9298834334157542,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.000492042943282038,
|
|
"loss": 5.8108,
|
|
"mean_token_accuracy": 0.17041826993227005,
|
|
"num_tokens": 19632960.0,
|
|
"step": 10530
|
|
},
|
|
{
|
|
"entropy": 5.9955668449401855,
|
|
"epoch": 0.9303249735075945,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.000492034640367241,
|
|
"loss": 5.9308,
|
|
"mean_token_accuracy": 0.15933274179697038,
|
|
"num_tokens": 19643087.0,
|
|
"step": 10535
|
|
},
|
|
{
|
|
"entropy": 6.070011329650879,
|
|
"epoch": 0.9307665135994349,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.000492026333200859,
|
|
"loss": 5.8737,
|
|
"mean_token_accuracy": 0.16645802557468414,
|
|
"num_tokens": 19653015.0,
|
|
"step": 10540
|
|
},
|
|
{
|
|
"entropy": 6.028250122070313,
|
|
"epoch": 0.9312080536912751,
|
|
"grad_norm": 2.640625,
|
|
"learning_rate": 0.0004920180217830543,
|
|
"loss": 5.8954,
|
|
"mean_token_accuracy": 0.16525914818048476,
|
|
"num_tokens": 19661569.0,
|
|
"step": 10545
|
|
},
|
|
{
|
|
"entropy": 5.893161964416504,
|
|
"epoch": 0.9316495937831155,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00049200970611399,
|
|
"loss": 5.7027,
|
|
"mean_token_accuracy": 0.17925852835178374,
|
|
"num_tokens": 19669559.0,
|
|
"step": 10550
|
|
},
|
|
{
|
|
"entropy": 5.921646547317505,
|
|
"epoch": 0.9320911338749558,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004920013861938291,
|
|
"loss": 5.7284,
|
|
"mean_token_accuracy": 0.18288709819316865,
|
|
"num_tokens": 19678766.0,
|
|
"step": 10555
|
|
},
|
|
{
|
|
"entropy": 5.988527011871338,
|
|
"epoch": 0.9325326739667962,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004919930620227345,
|
|
"loss": 5.8548,
|
|
"mean_token_accuracy": 0.15858465284109116,
|
|
"num_tokens": 19688422.0,
|
|
"step": 10560
|
|
},
|
|
{
|
|
"entropy": 6.099614858627319,
|
|
"epoch": 0.9329742140586366,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004919847336008692,
|
|
"loss": 5.8865,
|
|
"mean_token_accuracy": 0.16525954753160477,
|
|
"num_tokens": 19697206.0,
|
|
"step": 10565
|
|
},
|
|
{
|
|
"entropy": 6.0650975704193115,
|
|
"epoch": 0.9334157541504768,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004919764009283963,
|
|
"loss": 5.8669,
|
|
"mean_token_accuracy": 0.16111723333597183,
|
|
"num_tokens": 19706065.0,
|
|
"step": 10570
|
|
},
|
|
{
|
|
"entropy": 6.030399179458618,
|
|
"epoch": 0.9338572942423172,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004919680640054792,
|
|
"loss": 5.7794,
|
|
"mean_token_accuracy": 0.17192087024450303,
|
|
"num_tokens": 19715402.0,
|
|
"step": 10575
|
|
},
|
|
{
|
|
"entropy": 6.004525375366211,
|
|
"epoch": 0.9342988343341575,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.000491959722832281,
|
|
"loss": 5.7321,
|
|
"mean_token_accuracy": 0.1754280924797058,
|
|
"num_tokens": 19724328.0,
|
|
"step": 10580
|
|
},
|
|
{
|
|
"entropy": 5.910653209686279,
|
|
"epoch": 0.9347403744259979,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004919513774089653,
|
|
"loss": 5.8325,
|
|
"mean_token_accuracy": 0.1720520257949829,
|
|
"num_tokens": 19734087.0,
|
|
"step": 10585
|
|
},
|
|
{
|
|
"entropy": 6.058596134185791,
|
|
"epoch": 0.9351819145178382,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004919430277356955,
|
|
"loss": 5.8104,
|
|
"mean_token_accuracy": 0.1688505083322525,
|
|
"num_tokens": 19743010.0,
|
|
"step": 10590
|
|
},
|
|
{
|
|
"entropy": 5.9669163703918455,
|
|
"epoch": 0.9356234546096786,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004919346738126351,
|
|
"loss": 5.7427,
|
|
"mean_token_accuracy": 0.175693878531456,
|
|
"num_tokens": 19752212.0,
|
|
"step": 10595
|
|
},
|
|
{
|
|
"entropy": 5.984468603134156,
|
|
"epoch": 0.9360649947015189,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004919263156399478,
|
|
"loss": 5.8503,
|
|
"mean_token_accuracy": 0.164433790743351,
|
|
"num_tokens": 19762496.0,
|
|
"step": 10600
|
|
},
|
|
{
|
|
"entropy": 6.021150827407837,
|
|
"epoch": 0.9365065347933592,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004919179532177976,
|
|
"loss": 5.846,
|
|
"mean_token_accuracy": 0.16879568845033646,
|
|
"num_tokens": 19772665.0,
|
|
"step": 10605
|
|
},
|
|
{
|
|
"entropy": 5.892103052139282,
|
|
"epoch": 0.9369480748851996,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004919095865463479,
|
|
"loss": 5.6709,
|
|
"mean_token_accuracy": 0.17989853024482727,
|
|
"num_tokens": 19781639.0,
|
|
"step": 10610
|
|
},
|
|
{
|
|
"entropy": 5.981200265884399,
|
|
"epoch": 0.9373896149770399,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.0004919012156257627,
|
|
"loss": 5.7667,
|
|
"mean_token_accuracy": 0.16907234638929367,
|
|
"num_tokens": 19791475.0,
|
|
"step": 10615
|
|
},
|
|
{
|
|
"entropy": 6.02192645072937,
|
|
"epoch": 0.9378311550688803,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004918928404562061,
|
|
"loss": 5.7774,
|
|
"mean_token_accuracy": 0.1712593361735344,
|
|
"num_tokens": 19800967.0,
|
|
"step": 10620
|
|
},
|
|
{
|
|
"entropy": 5.947515249252319,
|
|
"epoch": 0.9382726951607206,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.0004918844610378421,
|
|
"loss": 5.9131,
|
|
"mean_token_accuracy": 0.16362484246492387,
|
|
"num_tokens": 19810303.0,
|
|
"step": 10625
|
|
},
|
|
{
|
|
"entropy": 6.076837348937988,
|
|
"epoch": 0.9387142352525609,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004918760773708348,
|
|
"loss": 5.8193,
|
|
"mean_token_accuracy": 0.1612846091389656,
|
|
"num_tokens": 19819049.0,
|
|
"step": 10630
|
|
},
|
|
{
|
|
"entropy": 6.073859214782715,
|
|
"epoch": 0.9391557753444013,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004918676894553485,
|
|
"loss": 5.8531,
|
|
"mean_token_accuracy": 0.1676570177078247,
|
|
"num_tokens": 19828253.0,
|
|
"step": 10635
|
|
},
|
|
{
|
|
"entropy": 5.961663913726807,
|
|
"epoch": 0.9395973154362416,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004918592972915476,
|
|
"loss": 5.8106,
|
|
"mean_token_accuracy": 0.1745462715625763,
|
|
"num_tokens": 19837673.0,
|
|
"step": 10640
|
|
},
|
|
{
|
|
"entropy": 5.932157897949219,
|
|
"epoch": 0.940038855528082,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004918509008795963,
|
|
"loss": 5.8512,
|
|
"mean_token_accuracy": 0.16296276599168777,
|
|
"num_tokens": 19846289.0,
|
|
"step": 10645
|
|
},
|
|
{
|
|
"entropy": 5.946554470062256,
|
|
"epoch": 0.9404803956199222,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004918425002196594,
|
|
"loss": 5.8133,
|
|
"mean_token_accuracy": 0.16478978991508483,
|
|
"num_tokens": 19855384.0,
|
|
"step": 10650
|
|
},
|
|
{
|
|
"entropy": 5.981734752655029,
|
|
"epoch": 0.9409219357117626,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.000491834095311901,
|
|
"loss": 5.7256,
|
|
"mean_token_accuracy": 0.17702484130859375,
|
|
"num_tokens": 19864688.0,
|
|
"step": 10655
|
|
},
|
|
{
|
|
"entropy": 5.9941754817962645,
|
|
"epoch": 0.941363475803603,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.000491825686156486,
|
|
"loss": 5.7347,
|
|
"mean_token_accuracy": 0.171346752345562,
|
|
"num_tokens": 19873692.0,
|
|
"step": 10660
|
|
},
|
|
{
|
|
"entropy": 6.019399261474609,
|
|
"epoch": 0.9418050158954433,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004918172727535792,
|
|
"loss": 5.8573,
|
|
"mean_token_accuracy": 0.16431866139173507,
|
|
"num_tokens": 19882063.0,
|
|
"step": 10665
|
|
},
|
|
{
|
|
"entropy": 6.005358266830444,
|
|
"epoch": 0.9422465559872837,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004918088551033454,
|
|
"loss": 5.8143,
|
|
"mean_token_accuracy": 0.16709273606538771,
|
|
"num_tokens": 19891727.0,
|
|
"step": 10670
|
|
},
|
|
{
|
|
"entropy": 5.932406759262085,
|
|
"epoch": 0.942688096079124,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004918004332059493,
|
|
"loss": 5.8489,
|
|
"mean_token_accuracy": 0.17194831222295762,
|
|
"num_tokens": 19901851.0,
|
|
"step": 10675
|
|
},
|
|
{
|
|
"entropy": 6.038689661026001,
|
|
"epoch": 0.9431296361709643,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.0004917920070615561,
|
|
"loss": 5.7874,
|
|
"mean_token_accuracy": 0.16205894947052002,
|
|
"num_tokens": 19911092.0,
|
|
"step": 10680
|
|
},
|
|
{
|
|
"entropy": 5.9625006198883055,
|
|
"epoch": 0.9435711762628046,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.0004917835766703307,
|
|
"loss": 5.7588,
|
|
"mean_token_accuracy": 0.17523420453071595,
|
|
"num_tokens": 19920842.0,
|
|
"step": 10685
|
|
},
|
|
{
|
|
"entropy": 6.018384742736816,
|
|
"epoch": 0.944012716354645,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004917751420324384,
|
|
"loss": 5.8684,
|
|
"mean_token_accuracy": 0.16540421098470687,
|
|
"num_tokens": 19930833.0,
|
|
"step": 10690
|
|
},
|
|
{
|
|
"entropy": 6.0669207096099855,
|
|
"epoch": 0.9444542564464854,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004917667031480444,
|
|
"loss": 5.8884,
|
|
"mean_token_accuracy": 0.16578007638454437,
|
|
"num_tokens": 19940514.0,
|
|
"step": 10695
|
|
},
|
|
{
|
|
"entropy": 6.007361221313476,
|
|
"epoch": 0.9448957965383257,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004917582600173138,
|
|
"loss": 5.7396,
|
|
"mean_token_accuracy": 0.17524563521146774,
|
|
"num_tokens": 19948949.0,
|
|
"step": 10700
|
|
},
|
|
{
|
|
"entropy": 5.867607307434082,
|
|
"epoch": 0.9453373366301661,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004917498126404123,
|
|
"loss": 5.717,
|
|
"mean_token_accuracy": 0.17226629108190536,
|
|
"num_tokens": 19958410.0,
|
|
"step": 10705
|
|
},
|
|
{
|
|
"entropy": 5.93734655380249,
|
|
"epoch": 0.9457788767220063,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004917413610175052,
|
|
"loss": 5.7132,
|
|
"mean_token_accuracy": 0.18073296248912812,
|
|
"num_tokens": 19968160.0,
|
|
"step": 10710
|
|
},
|
|
{
|
|
"entropy": 6.001311540603638,
|
|
"epoch": 0.9462204168138467,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004917329051487582,
|
|
"loss": 5.8792,
|
|
"mean_token_accuracy": 0.15905057042837142,
|
|
"num_tokens": 19976811.0,
|
|
"step": 10715
|
|
},
|
|
{
|
|
"entropy": 6.1178806781768795,
|
|
"epoch": 0.946661956905687,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004917244450343368,
|
|
"loss": 5.9291,
|
|
"mean_token_accuracy": 0.162720163166523,
|
|
"num_tokens": 19986326.0,
|
|
"step": 10720
|
|
},
|
|
{
|
|
"entropy": 5.937975215911865,
|
|
"epoch": 0.9471034969975274,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004917159806744067,
|
|
"loss": 5.6805,
|
|
"mean_token_accuracy": 0.171738201379776,
|
|
"num_tokens": 19996255.0,
|
|
"step": 10725
|
|
},
|
|
{
|
|
"entropy": 5.956577587127685,
|
|
"epoch": 0.9475450370893678,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004917075120691338,
|
|
"loss": 5.8767,
|
|
"mean_token_accuracy": 0.16906536221504212,
|
|
"num_tokens": 20004805.0,
|
|
"step": 10730
|
|
},
|
|
{
|
|
"entropy": 5.96188440322876,
|
|
"epoch": 0.947986577181208,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004916990392186841,
|
|
"loss": 5.7767,
|
|
"mean_token_accuracy": 0.17188985794782638,
|
|
"num_tokens": 20014189.0,
|
|
"step": 10735
|
|
},
|
|
{
|
|
"entropy": 6.049740886688232,
|
|
"epoch": 0.9484281172730484,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004916905621232233,
|
|
"loss": 5.7779,
|
|
"mean_token_accuracy": 0.1737987980246544,
|
|
"num_tokens": 20023261.0,
|
|
"step": 10740
|
|
},
|
|
{
|
|
"entropy": 5.931927347183228,
|
|
"epoch": 0.9488696573648887,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004916820807829177,
|
|
"loss": 5.7308,
|
|
"mean_token_accuracy": 0.1741984859108925,
|
|
"num_tokens": 20032433.0,
|
|
"step": 10745
|
|
},
|
|
{
|
|
"entropy": 5.957098436355591,
|
|
"epoch": 0.9493111974567291,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004916735951979335,
|
|
"loss": 5.8473,
|
|
"mean_token_accuracy": 0.16521918177604675,
|
|
"num_tokens": 20041455.0,
|
|
"step": 10750
|
|
},
|
|
{
|
|
"entropy": 6.013712930679321,
|
|
"epoch": 0.9497527375485694,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004916651053684366,
|
|
"loss": 5.8583,
|
|
"mean_token_accuracy": 0.16902563199400902,
|
|
"num_tokens": 20050088.0,
|
|
"step": 10755
|
|
},
|
|
{
|
|
"entropy": 6.03648190498352,
|
|
"epoch": 0.9501942776404098,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004916566112945936,
|
|
"loss": 5.8403,
|
|
"mean_token_accuracy": 0.16953138560056685,
|
|
"num_tokens": 20059114.0,
|
|
"step": 10760
|
|
},
|
|
{
|
|
"entropy": 6.005605697631836,
|
|
"epoch": 0.9506358177322501,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004916481129765708,
|
|
"loss": 5.8434,
|
|
"mean_token_accuracy": 0.16606313288211821,
|
|
"num_tokens": 20068692.0,
|
|
"step": 10765
|
|
},
|
|
{
|
|
"entropy": 5.988512325286865,
|
|
"epoch": 0.9510773578240904,
|
|
"grad_norm": 2.96875,
|
|
"learning_rate": 0.0004916396104145347,
|
|
"loss": 5.8705,
|
|
"mean_token_accuracy": 0.16754111647605896,
|
|
"num_tokens": 20078380.0,
|
|
"step": 10770
|
|
},
|
|
{
|
|
"entropy": 5.896767950057983,
|
|
"epoch": 0.9515188979159308,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004916311036086518,
|
|
"loss": 5.716,
|
|
"mean_token_accuracy": 0.18125931322574615,
|
|
"num_tokens": 20087849.0,
|
|
"step": 10775
|
|
},
|
|
{
|
|
"entropy": 5.938125896453857,
|
|
"epoch": 0.9519604380077711,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004916225925590889,
|
|
"loss": 5.6547,
|
|
"mean_token_accuracy": 0.18121337294578552,
|
|
"num_tokens": 20097253.0,
|
|
"step": 10780
|
|
},
|
|
{
|
|
"entropy": 5.971297359466552,
|
|
"epoch": 0.9524019780996115,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004916140772660124,
|
|
"loss": 5.8462,
|
|
"mean_token_accuracy": 0.172994901239872,
|
|
"num_tokens": 20105931.0,
|
|
"step": 10785
|
|
},
|
|
{
|
|
"entropy": 6.003271102905273,
|
|
"epoch": 0.9528435181914517,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004916055577295895,
|
|
"loss": 5.7553,
|
|
"mean_token_accuracy": 0.17517725229263306,
|
|
"num_tokens": 20115546.0,
|
|
"step": 10790
|
|
},
|
|
{
|
|
"entropy": 5.985776948928833,
|
|
"epoch": 0.9532850582832921,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004915970339499868,
|
|
"loss": 5.7738,
|
|
"mean_token_accuracy": 0.1728436455130577,
|
|
"num_tokens": 20125009.0,
|
|
"step": 10795
|
|
},
|
|
{
|
|
"entropy": 5.862307453155518,
|
|
"epoch": 0.9537265983751325,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004915885059273714,
|
|
"loss": 5.7374,
|
|
"mean_token_accuracy": 0.17673692107200623,
|
|
"num_tokens": 20134007.0,
|
|
"step": 10800
|
|
},
|
|
{
|
|
"entropy": 5.971608638763428,
|
|
"epoch": 0.9541681384669728,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.0004915799736619104,
|
|
"loss": 5.7595,
|
|
"mean_token_accuracy": 0.173075895011425,
|
|
"num_tokens": 20142617.0,
|
|
"step": 10805
|
|
},
|
|
{
|
|
"entropy": 5.951097583770752,
|
|
"epoch": 0.9546096785588132,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.000491571437153771,
|
|
"loss": 5.7702,
|
|
"mean_token_accuracy": 0.17575397044420243,
|
|
"num_tokens": 20151732.0,
|
|
"step": 10810
|
|
},
|
|
{
|
|
"entropy": 6.080293321609497,
|
|
"epoch": 0.9550512186506535,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004915628964031202,
|
|
"loss": 5.8827,
|
|
"mean_token_accuracy": 0.1621328040957451,
|
|
"num_tokens": 20161733.0,
|
|
"step": 10815
|
|
},
|
|
{
|
|
"entropy": 6.004674625396729,
|
|
"epoch": 0.9554927587424938,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004915543514101253,
|
|
"loss": 5.8016,
|
|
"mean_token_accuracy": 0.16970572173595427,
|
|
"num_tokens": 20171022.0,
|
|
"step": 10820
|
|
},
|
|
{
|
|
"entropy": 5.930174112319946,
|
|
"epoch": 0.9559342988343341,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0004915458021749541,
|
|
"loss": 5.8132,
|
|
"mean_token_accuracy": 0.1605657383799553,
|
|
"num_tokens": 20180809.0,
|
|
"step": 10825
|
|
},
|
|
{
|
|
"entropy": 5.951100921630859,
|
|
"epoch": 0.9563758389261745,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004915372486977736,
|
|
"loss": 5.7428,
|
|
"mean_token_accuracy": 0.17625252157449722,
|
|
"num_tokens": 20189614.0,
|
|
"step": 10830
|
|
},
|
|
{
|
|
"entropy": 6.029724359512329,
|
|
"epoch": 0.9568173790180149,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004915286909787516,
|
|
"loss": 5.8935,
|
|
"mean_token_accuracy": 0.1590195283293724,
|
|
"num_tokens": 20199136.0,
|
|
"step": 10835
|
|
},
|
|
{
|
|
"entropy": 5.989223146438599,
|
|
"epoch": 0.9572589191098552,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004915201290180556,
|
|
"loss": 5.7761,
|
|
"mean_token_accuracy": 0.16439834833145142,
|
|
"num_tokens": 20208768.0,
|
|
"step": 10840
|
|
},
|
|
{
|
|
"entropy": 5.914671230316162,
|
|
"epoch": 0.9577004592016956,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0004915115628158535,
|
|
"loss": 5.7724,
|
|
"mean_token_accuracy": 0.17083653658628464,
|
|
"num_tokens": 20218109.0,
|
|
"step": 10845
|
|
},
|
|
{
|
|
"entropy": 6.004290294647217,
|
|
"epoch": 0.9581419992935358,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.000491502992372313,
|
|
"loss": 5.7731,
|
|
"mean_token_accuracy": 0.16725710779428482,
|
|
"num_tokens": 20226768.0,
|
|
"step": 10850
|
|
},
|
|
{
|
|
"entropy": 5.991324758529663,
|
|
"epoch": 0.9585835393853762,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.000491494417687602,
|
|
"loss": 5.7184,
|
|
"mean_token_accuracy": 0.17218151092529296,
|
|
"num_tokens": 20236519.0,
|
|
"step": 10855
|
|
},
|
|
{
|
|
"entropy": 5.94624695777893,
|
|
"epoch": 0.9590250794772165,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004914858387618886,
|
|
"loss": 5.7573,
|
|
"mean_token_accuracy": 0.1722206875681877,
|
|
"num_tokens": 20245611.0,
|
|
"step": 10860
|
|
},
|
|
{
|
|
"entropy": 5.907992362976074,
|
|
"epoch": 0.9594666195690569,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004914772555953406,
|
|
"loss": 5.6558,
|
|
"mean_token_accuracy": 0.1819604218006134,
|
|
"num_tokens": 20255089.0,
|
|
"step": 10865
|
|
},
|
|
{
|
|
"entropy": 5.935388612747192,
|
|
"epoch": 0.9599081596608973,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.0004914686681881264,
|
|
"loss": 5.8238,
|
|
"mean_token_accuracy": 0.16650898456573487,
|
|
"num_tokens": 20264794.0,
|
|
"step": 10870
|
|
},
|
|
{
|
|
"entropy": 5.934653568267822,
|
|
"epoch": 0.9603496997527375,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.000491460076540414,
|
|
"loss": 5.6954,
|
|
"mean_token_accuracy": 0.18213992118835448,
|
|
"num_tokens": 20274088.0,
|
|
"step": 10875
|
|
},
|
|
{
|
|
"entropy": 5.9760363578796385,
|
|
"epoch": 0.9607912398445779,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004914514806523718,
|
|
"loss": 5.859,
|
|
"mean_token_accuracy": 0.16019129902124404,
|
|
"num_tokens": 20283837.0,
|
|
"step": 10880
|
|
},
|
|
{
|
|
"entropy": 6.003293085098266,
|
|
"epoch": 0.9612327799364182,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004914428805241683,
|
|
"loss": 5.7622,
|
|
"mean_token_accuracy": 0.16625455170869827,
|
|
"num_tokens": 20293287.0,
|
|
"step": 10885
|
|
},
|
|
{
|
|
"entropy": 5.907747411727906,
|
|
"epoch": 0.9616743200282586,
|
|
"grad_norm": 2.546875,
|
|
"learning_rate": 0.0004914342761559718,
|
|
"loss": 5.719,
|
|
"mean_token_accuracy": 0.17710004597902299,
|
|
"num_tokens": 20302262.0,
|
|
"step": 10890
|
|
},
|
|
{
|
|
"entropy": 6.0164556980133055,
|
|
"epoch": 0.9621158601200989,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004914256675479509,
|
|
"loss": 5.8292,
|
|
"mean_token_accuracy": 0.1677974209189415,
|
|
"num_tokens": 20312475.0,
|
|
"step": 10895
|
|
},
|
|
{
|
|
"entropy": 6.0269428253173825,
|
|
"epoch": 0.9625574002119393,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0004914170547002742,
|
|
"loss": 5.833,
|
|
"mean_token_accuracy": 0.16870647370815278,
|
|
"num_tokens": 20322243.0,
|
|
"step": 10900
|
|
},
|
|
{
|
|
"entropy": 6.016332721710205,
|
|
"epoch": 0.9629989403037796,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004914084376131106,
|
|
"loss": 5.7569,
|
|
"mean_token_accuracy": 0.16687582582235336,
|
|
"num_tokens": 20331299.0,
|
|
"step": 10905
|
|
},
|
|
{
|
|
"entropy": 5.913394355773926,
|
|
"epoch": 0.9634404803956199,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004913998162866288,
|
|
"loss": 5.6931,
|
|
"mean_token_accuracy": 0.17729486376047135,
|
|
"num_tokens": 20340800.0,
|
|
"step": 10910
|
|
},
|
|
{
|
|
"entropy": 5.959266328811646,
|
|
"epoch": 0.9638820204874603,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004913911907209976,
|
|
"loss": 5.7692,
|
|
"mean_token_accuracy": 0.17721022814512252,
|
|
"num_tokens": 20350292.0,
|
|
"step": 10915
|
|
},
|
|
{
|
|
"entropy": 6.043052721023559,
|
|
"epoch": 0.9643235605793006,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004913825609163861,
|
|
"loss": 5.8679,
|
|
"mean_token_accuracy": 0.16006149500608444,
|
|
"num_tokens": 20360109.0,
|
|
"step": 10920
|
|
},
|
|
{
|
|
"entropy": 5.860322570800781,
|
|
"epoch": 0.964765100671141,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004913739268729633,
|
|
"loss": 5.6917,
|
|
"mean_token_accuracy": 0.1784888446331024,
|
|
"num_tokens": 20368917.0,
|
|
"step": 10925
|
|
},
|
|
{
|
|
"entropy": 6.006972455978394,
|
|
"epoch": 0.9652066407629812,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004913652885908983,
|
|
"loss": 5.8962,
|
|
"mean_token_accuracy": 0.16221495121717452,
|
|
"num_tokens": 20378580.0,
|
|
"step": 10930
|
|
},
|
|
{
|
|
"entropy": 5.97238073348999,
|
|
"epoch": 0.9656481808548216,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004913566460703603,
|
|
"loss": 5.7268,
|
|
"mean_token_accuracy": 0.17603309750556945,
|
|
"num_tokens": 20387497.0,
|
|
"step": 10935
|
|
},
|
|
{
|
|
"entropy": 5.99204421043396,
|
|
"epoch": 0.966089720946662,
|
|
"grad_norm": 3.5625,
|
|
"learning_rate": 0.0004913479993115187,
|
|
"loss": 5.8727,
|
|
"mean_token_accuracy": 0.1619122162461281,
|
|
"num_tokens": 20396586.0,
|
|
"step": 10940
|
|
},
|
|
{
|
|
"entropy": 6.0532196998596195,
|
|
"epoch": 0.9665312610385023,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004913393483145428,
|
|
"loss": 5.9377,
|
|
"mean_token_accuracy": 0.15912597626447678,
|
|
"num_tokens": 20406183.0,
|
|
"step": 10945
|
|
},
|
|
{
|
|
"entropy": 6.163526821136474,
|
|
"epoch": 0.9669728011303427,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004913306930796022,
|
|
"loss": 5.8877,
|
|
"mean_token_accuracy": 0.16394948363304138,
|
|
"num_tokens": 20415716.0,
|
|
"step": 10950
|
|
},
|
|
{
|
|
"entropy": 6.033840751647949,
|
|
"epoch": 0.967414341222183,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004913220336068662,
|
|
"loss": 5.801,
|
|
"mean_token_accuracy": 0.16864629238843917,
|
|
"num_tokens": 20424715.0,
|
|
"step": 10955
|
|
},
|
|
{
|
|
"entropy": 5.9165130138397215,
|
|
"epoch": 0.9678558813140233,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004913133698965048,
|
|
"loss": 5.7995,
|
|
"mean_token_accuracy": 0.1718275874853134,
|
|
"num_tokens": 20434686.0,
|
|
"step": 10960
|
|
},
|
|
{
|
|
"entropy": 6.020512485504151,
|
|
"epoch": 0.9682974214058636,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004913047019486874,
|
|
"loss": 5.7933,
|
|
"mean_token_accuracy": 0.1689111992716789,
|
|
"num_tokens": 20443619.0,
|
|
"step": 10965
|
|
},
|
|
{
|
|
"entropy": 5.928170108795166,
|
|
"epoch": 0.968738961497704,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004912960297635839,
|
|
"loss": 5.7247,
|
|
"mean_token_accuracy": 0.18071752190589904,
|
|
"num_tokens": 20453540.0,
|
|
"step": 10970
|
|
},
|
|
{
|
|
"entropy": 5.973074340820313,
|
|
"epoch": 0.9691805015895444,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004912873533413641,
|
|
"loss": 5.8386,
|
|
"mean_token_accuracy": 0.16714389324188234,
|
|
"num_tokens": 20463395.0,
|
|
"step": 10975
|
|
},
|
|
{
|
|
"entropy": 6.01979546546936,
|
|
"epoch": 0.9696220416813847,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004912786726821982,
|
|
"loss": 5.7723,
|
|
"mean_token_accuracy": 0.1673387110233307,
|
|
"num_tokens": 20472302.0,
|
|
"step": 10980
|
|
},
|
|
{
|
|
"entropy": 5.932343435287476,
|
|
"epoch": 0.970063581773225,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.000491269987786256,
|
|
"loss": 5.8032,
|
|
"mean_token_accuracy": 0.16516162306070328,
|
|
"num_tokens": 20482077.0,
|
|
"step": 10985
|
|
},
|
|
{
|
|
"entropy": 5.987225818634033,
|
|
"epoch": 0.9705051218650653,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004912612986537078,
|
|
"loss": 5.7917,
|
|
"mean_token_accuracy": 0.16847066432237626,
|
|
"num_tokens": 20492177.0,
|
|
"step": 10990
|
|
},
|
|
{
|
|
"entropy": 6.021402645111084,
|
|
"epoch": 0.9709466619569057,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004912526052847238,
|
|
"loss": 5.8207,
|
|
"mean_token_accuracy": 0.17326410710811616,
|
|
"num_tokens": 20502063.0,
|
|
"step": 10995
|
|
},
|
|
{
|
|
"entropy": 5.953307962417602,
|
|
"epoch": 0.971388202048746,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004912439076794742,
|
|
"loss": 5.7591,
|
|
"mean_token_accuracy": 0.1668146625161171,
|
|
"num_tokens": 20510631.0,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"entropy": 5.895603370666504,
|
|
"epoch": 0.9718297421405864,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004912352058381293,
|
|
"loss": 5.7888,
|
|
"mean_token_accuracy": 0.1695307195186615,
|
|
"num_tokens": 20519978.0,
|
|
"step": 11005
|
|
},
|
|
{
|
|
"entropy": 5.953231716156006,
|
|
"epoch": 0.9722712822324268,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004912264997608599,
|
|
"loss": 5.7773,
|
|
"mean_token_accuracy": 0.1737328365445137,
|
|
"num_tokens": 20529346.0,
|
|
"step": 11010
|
|
},
|
|
{
|
|
"entropy": 6.022437715530396,
|
|
"epoch": 0.972712822324267,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0004912177894478364,
|
|
"loss": 5.7437,
|
|
"mean_token_accuracy": 0.1649177610874176,
|
|
"num_tokens": 20537619.0,
|
|
"step": 11015
|
|
},
|
|
{
|
|
"entropy": 5.978044605255127,
|
|
"epoch": 0.9731543624161074,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004912090748992291,
|
|
"loss": 5.7933,
|
|
"mean_token_accuracy": 0.17317971140146254,
|
|
"num_tokens": 20547011.0,
|
|
"step": 11020
|
|
},
|
|
{
|
|
"entropy": 5.990376043319702,
|
|
"epoch": 0.9735959025079477,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004912003561152092,
|
|
"loss": 5.7147,
|
|
"mean_token_accuracy": 0.17324843108654023,
|
|
"num_tokens": 20556425.0,
|
|
"step": 11025
|
|
},
|
|
{
|
|
"entropy": 5.928810358047485,
|
|
"epoch": 0.9740374425997881,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004911916330959472,
|
|
"loss": 5.7334,
|
|
"mean_token_accuracy": 0.1760266274213791,
|
|
"num_tokens": 20565330.0,
|
|
"step": 11030
|
|
},
|
|
{
|
|
"entropy": 5.9286705493927006,
|
|
"epoch": 0.9744789826916284,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004911829058416141,
|
|
"loss": 5.8313,
|
|
"mean_token_accuracy": 0.1810968041419983,
|
|
"num_tokens": 20575320.0,
|
|
"step": 11035
|
|
},
|
|
{
|
|
"entropy": 5.994808006286621,
|
|
"epoch": 0.9749205227834687,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004911741743523808,
|
|
"loss": 5.7431,
|
|
"mean_token_accuracy": 0.17848343253135682,
|
|
"num_tokens": 20585105.0,
|
|
"step": 11040
|
|
},
|
|
{
|
|
"entropy": 5.966033029556274,
|
|
"epoch": 0.9753620628753091,
|
|
"grad_norm": 3.265625,
|
|
"learning_rate": 0.0004911654386284184,
|
|
"loss": 5.7916,
|
|
"mean_token_accuracy": 0.17322781980037688,
|
|
"num_tokens": 20593889.0,
|
|
"step": 11045
|
|
},
|
|
{
|
|
"entropy": 5.978029823303222,
|
|
"epoch": 0.9758036029671494,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.000491156698669898,
|
|
"loss": 5.8134,
|
|
"mean_token_accuracy": 0.17286840006709098,
|
|
"num_tokens": 20602322.0,
|
|
"step": 11050
|
|
},
|
|
{
|
|
"entropy": 6.022975111007691,
|
|
"epoch": 0.9762451430589898,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004911479544769908,
|
|
"loss": 5.8974,
|
|
"mean_token_accuracy": 0.15766229033470153,
|
|
"num_tokens": 20612077.0,
|
|
"step": 11055
|
|
},
|
|
{
|
|
"entropy": 6.059322118759155,
|
|
"epoch": 0.9766866831508301,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.000491139206049868,
|
|
"loss": 5.8469,
|
|
"mean_token_accuracy": 0.17113145887851716,
|
|
"num_tokens": 20620683.0,
|
|
"step": 11060
|
|
},
|
|
{
|
|
"entropy": 5.986019706726074,
|
|
"epoch": 0.9771282232426705,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0004911304533887011,
|
|
"loss": 5.9007,
|
|
"mean_token_accuracy": 0.16220422834157944,
|
|
"num_tokens": 20631124.0,
|
|
"step": 11065
|
|
},
|
|
{
|
|
"entropy": 5.98687162399292,
|
|
"epoch": 0.9775697633345107,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004911216964936616,
|
|
"loss": 5.7466,
|
|
"mean_token_accuracy": 0.17153911888599396,
|
|
"num_tokens": 20640515.0,
|
|
"step": 11070
|
|
},
|
|
{
|
|
"entropy": 5.96340537071228,
|
|
"epoch": 0.9780113034263511,
|
|
"grad_norm": 3.84375,
|
|
"learning_rate": 0.0004911129353649211,
|
|
"loss": 5.7365,
|
|
"mean_token_accuracy": 0.174886654317379,
|
|
"num_tokens": 20649819.0,
|
|
"step": 11075
|
|
},
|
|
{
|
|
"entropy": 5.944764804840088,
|
|
"epoch": 0.9784528435181915,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.000491104170002651,
|
|
"loss": 5.7713,
|
|
"mean_token_accuracy": 0.17416697293519973,
|
|
"num_tokens": 20657831.0,
|
|
"step": 11080
|
|
},
|
|
{
|
|
"entropy": 6.038313531875611,
|
|
"epoch": 0.9788943836100318,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.000491095400407023,
|
|
"loss": 5.8761,
|
|
"mean_token_accuracy": 0.17019579261541368,
|
|
"num_tokens": 20666807.0,
|
|
"step": 11085
|
|
},
|
|
{
|
|
"entropy": 6.0149578094482425,
|
|
"epoch": 0.9793359237018722,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004910866265782091,
|
|
"loss": 5.7491,
|
|
"mean_token_accuracy": 0.17066479325294495,
|
|
"num_tokens": 20674859.0,
|
|
"step": 11090
|
|
},
|
|
{
|
|
"entropy": 6.064696598052978,
|
|
"epoch": 0.9797774637937124,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004910778485163811,
|
|
"loss": 5.8275,
|
|
"mean_token_accuracy": 0.16826068758964538,
|
|
"num_tokens": 20684060.0,
|
|
"step": 11095
|
|
},
|
|
{
|
|
"entropy": 5.976723861694336,
|
|
"epoch": 0.9802190038855528,
|
|
"grad_norm": 2.609375,
|
|
"learning_rate": 0.000491069066221711,
|
|
"loss": 5.7471,
|
|
"mean_token_accuracy": 0.1807445615530014,
|
|
"num_tokens": 20693446.0,
|
|
"step": 11100
|
|
},
|
|
{
|
|
"entropy": 6.011556386947632,
|
|
"epoch": 0.9806605439773931,
|
|
"grad_norm": 3.5625,
|
|
"learning_rate": 0.0004910602796943707,
|
|
"loss": 5.8807,
|
|
"mean_token_accuracy": 0.1663028821349144,
|
|
"num_tokens": 20702635.0,
|
|
"step": 11105
|
|
},
|
|
{
|
|
"entropy": 6.090901565551758,
|
|
"epoch": 0.9811020840692335,
|
|
"grad_norm": 4.28125,
|
|
"learning_rate": 0.0004910514889345323,
|
|
"loss": 5.8358,
|
|
"mean_token_accuracy": 0.16323625594377517,
|
|
"num_tokens": 20712256.0,
|
|
"step": 11110
|
|
},
|
|
{
|
|
"entropy": 5.991445112228393,
|
|
"epoch": 0.9815436241610739,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004910426939423682,
|
|
"loss": 5.7859,
|
|
"mean_token_accuracy": 0.16746611446142196,
|
|
"num_tokens": 20720175.0,
|
|
"step": 11115
|
|
},
|
|
{
|
|
"entropy": 5.970749759674073,
|
|
"epoch": 0.9819851642529142,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004910338947180506,
|
|
"loss": 5.8054,
|
|
"mean_token_accuracy": 0.16871384233236314,
|
|
"num_tokens": 20729849.0,
|
|
"step": 11120
|
|
},
|
|
{
|
|
"entropy": 5.9835960388183596,
|
|
"epoch": 0.9824267043447545,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 0.0004910250912617519,
|
|
"loss": 5.7546,
|
|
"mean_token_accuracy": 0.17132087796926498,
|
|
"num_tokens": 20739129.0,
|
|
"step": 11125
|
|
},
|
|
{
|
|
"entropy": 6.016697931289673,
|
|
"epoch": 0.9828682444365948,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004910162835736445,
|
|
"loss": 5.709,
|
|
"mean_token_accuracy": 0.17840465158224106,
|
|
"num_tokens": 20748632.0,
|
|
"step": 11130
|
|
},
|
|
{
|
|
"entropy": 5.988829517364502,
|
|
"epoch": 0.9833097845284352,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.000491007471653901,
|
|
"loss": 5.7538,
|
|
"mean_token_accuracy": 0.1671848103404045,
|
|
"num_tokens": 20758535.0,
|
|
"step": 11135
|
|
},
|
|
{
|
|
"entropy": 5.963017082214355,
|
|
"epoch": 0.9837513246202755,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004909986555026939,
|
|
"loss": 5.8005,
|
|
"mean_token_accuracy": 0.17092676162719728,
|
|
"num_tokens": 20768982.0,
|
|
"step": 11140
|
|
},
|
|
{
|
|
"entropy": 5.994900512695312,
|
|
"epoch": 0.9841928647121159,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.000490989835120196,
|
|
"loss": 5.862,
|
|
"mean_token_accuracy": 0.16857540905475615,
|
|
"num_tokens": 20778936.0,
|
|
"step": 11145
|
|
},
|
|
{
|
|
"entropy": 6.042771339416504,
|
|
"epoch": 0.9846344048039563,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004909810105065801,
|
|
"loss": 5.848,
|
|
"mean_token_accuracy": 0.16031375378370286,
|
|
"num_tokens": 20788618.0,
|
|
"step": 11150
|
|
},
|
|
{
|
|
"entropy": 6.0676703453063965,
|
|
"epoch": 0.9850759448957965,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004909721816620191,
|
|
"loss": 5.8365,
|
|
"mean_token_accuracy": 0.17483314573764802,
|
|
"num_tokens": 20797721.0,
|
|
"step": 11155
|
|
},
|
|
{
|
|
"entropy": 6.043312644958496,
|
|
"epoch": 0.9855174849876369,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004909633485866857,
|
|
"loss": 5.7721,
|
|
"mean_token_accuracy": 0.16904150396585466,
|
|
"num_tokens": 20807305.0,
|
|
"step": 11160
|
|
},
|
|
{
|
|
"entropy": 5.888973760604858,
|
|
"epoch": 0.9859590250794772,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004909545112807533,
|
|
"loss": 5.7272,
|
|
"mean_token_accuracy": 0.1731989398598671,
|
|
"num_tokens": 20816984.0,
|
|
"step": 11165
|
|
},
|
|
{
|
|
"entropy": 6.0584144115448,
|
|
"epoch": 0.9864005651713176,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004909456697443948,
|
|
"loss": 5.8929,
|
|
"mean_token_accuracy": 0.15854351669549943,
|
|
"num_tokens": 20825738.0,
|
|
"step": 11170
|
|
},
|
|
{
|
|
"entropy": 6.052678203582763,
|
|
"epoch": 0.9868421052631579,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004909368239777835,
|
|
"loss": 5.8344,
|
|
"mean_token_accuracy": 0.17343858778476715,
|
|
"num_tokens": 20834135.0,
|
|
"step": 11175
|
|
},
|
|
{
|
|
"entropy": 6.089842700958252,
|
|
"epoch": 0.9872836453549982,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004909279739810926,
|
|
"loss": 5.9038,
|
|
"mean_token_accuracy": 0.16258951276540756,
|
|
"num_tokens": 20843689.0,
|
|
"step": 11180
|
|
},
|
|
{
|
|
"entropy": 6.050971508026123,
|
|
"epoch": 0.9877251854468386,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004909191197544954,
|
|
"loss": 5.7843,
|
|
"mean_token_accuracy": 0.16911011934280396,
|
|
"num_tokens": 20851724.0,
|
|
"step": 11185
|
|
},
|
|
{
|
|
"entropy": 6.092201471328735,
|
|
"epoch": 0.9881667255386789,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.0004909102612981655,
|
|
"loss": 5.9519,
|
|
"mean_token_accuracy": 0.1617167681455612,
|
|
"num_tokens": 20862325.0,
|
|
"step": 11190
|
|
},
|
|
{
|
|
"entropy": 6.003147792816162,
|
|
"epoch": 0.9886082656305193,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004909013986122763,
|
|
"loss": 5.7471,
|
|
"mean_token_accuracy": 0.17576643824577332,
|
|
"num_tokens": 20871046.0,
|
|
"step": 11195
|
|
},
|
|
{
|
|
"entropy": 6.023550748825073,
|
|
"epoch": 0.9890498057223596,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004908925316970016,
|
|
"loss": 5.7915,
|
|
"mean_token_accuracy": 0.16211405843496324,
|
|
"num_tokens": 20881605.0,
|
|
"step": 11200
|
|
},
|
|
{
|
|
"entropy": 5.917197895050049,
|
|
"epoch": 0.9894913458142,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.000490883660552515,
|
|
"loss": 5.7289,
|
|
"mean_token_accuracy": 0.16584061533212663,
|
|
"num_tokens": 20891170.0,
|
|
"step": 11205
|
|
},
|
|
{
|
|
"entropy": 5.991090631484985,
|
|
"epoch": 0.9899328859060402,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004908747851789902,
|
|
"loss": 5.7403,
|
|
"mean_token_accuracy": 0.15918954312801362,
|
|
"num_tokens": 20900307.0,
|
|
"step": 11210
|
|
},
|
|
{
|
|
"entropy": 5.9911338806152346,
|
|
"epoch": 0.9903744259978806,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 0.0004908659055766011,
|
|
"loss": 5.8943,
|
|
"mean_token_accuracy": 0.16472139954566956,
|
|
"num_tokens": 20910623.0,
|
|
"step": 11215
|
|
},
|
|
{
|
|
"entropy": 6.066866540908814,
|
|
"epoch": 0.990815966089721,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004908570217455218,
|
|
"loss": 5.8166,
|
|
"mean_token_accuracy": 0.1658707246184349,
|
|
"num_tokens": 20919744.0,
|
|
"step": 11220
|
|
},
|
|
{
|
|
"entropy": 6.003478002548218,
|
|
"epoch": 0.9912575061815613,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004908481336859262,
|
|
"loss": 5.8639,
|
|
"mean_token_accuracy": 0.171516814827919,
|
|
"num_tokens": 20928701.0,
|
|
"step": 11225
|
|
},
|
|
{
|
|
"entropy": 5.999844121932983,
|
|
"epoch": 0.9916990462734017,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004908392413979884,
|
|
"loss": 5.8468,
|
|
"mean_token_accuracy": 0.15534245297312738,
|
|
"num_tokens": 20937786.0,
|
|
"step": 11230
|
|
},
|
|
{
|
|
"entropy": 5.970778274536133,
|
|
"epoch": 0.9921405863652419,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004908303448818825,
|
|
"loss": 5.7455,
|
|
"mean_token_accuracy": 0.17506744116544723,
|
|
"num_tokens": 20947214.0,
|
|
"step": 11235
|
|
},
|
|
{
|
|
"entropy": 5.949552249908447,
|
|
"epoch": 0.9925821264570823,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.000490821444137783,
|
|
"loss": 5.854,
|
|
"mean_token_accuracy": 0.1731862783432007,
|
|
"num_tokens": 20956041.0,
|
|
"step": 11240
|
|
},
|
|
{
|
|
"entropy": 6.05905122756958,
|
|
"epoch": 0.9930236665489226,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004908125391658641,
|
|
"loss": 5.8267,
|
|
"mean_token_accuracy": 0.16466867923736572,
|
|
"num_tokens": 20966162.0,
|
|
"step": 11245
|
|
},
|
|
{
|
|
"entropy": 6.021831130981445,
|
|
"epoch": 0.993465206640763,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004908036299663003,
|
|
"loss": 5.8266,
|
|
"mean_token_accuracy": 0.16720837503671646,
|
|
"num_tokens": 20976025.0,
|
|
"step": 11250
|
|
},
|
|
{
|
|
"entropy": 6.02084493637085,
|
|
"epoch": 0.9939067467326034,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004907947165392662,
|
|
"loss": 5.8584,
|
|
"mean_token_accuracy": 0.16399545222520828,
|
|
"num_tokens": 20985606.0,
|
|
"step": 11255
|
|
},
|
|
{
|
|
"entropy": 6.0483697891235355,
|
|
"epoch": 0.9943482868244437,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004907857988849363,
|
|
"loss": 5.8316,
|
|
"mean_token_accuracy": 0.16402145475149155,
|
|
"num_tokens": 20994308.0,
|
|
"step": 11260
|
|
},
|
|
{
|
|
"entropy": 5.948052406311035,
|
|
"epoch": 0.994789826916284,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004907768770034853,
|
|
"loss": 5.756,
|
|
"mean_token_accuracy": 0.17144669741392135,
|
|
"num_tokens": 21003783.0,
|
|
"step": 11265
|
|
},
|
|
{
|
|
"entropy": 6.023220729827881,
|
|
"epoch": 0.9952313670081243,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.000490767950895088,
|
|
"loss": 5.7298,
|
|
"mean_token_accuracy": 0.16765106618404388,
|
|
"num_tokens": 21012870.0,
|
|
"step": 11270
|
|
},
|
|
{
|
|
"entropy": 6.064298534393311,
|
|
"epoch": 0.9956729070999647,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004907590205599192,
|
|
"loss": 5.8202,
|
|
"mean_token_accuracy": 0.16320745795965194,
|
|
"num_tokens": 21021987.0,
|
|
"step": 11275
|
|
},
|
|
{
|
|
"entropy": 5.932501220703125,
|
|
"epoch": 0.996114447191805,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 0.000490750085998154,
|
|
"loss": 5.8329,
|
|
"mean_token_accuracy": 0.16234722584486008,
|
|
"num_tokens": 21030977.0,
|
|
"step": 11280
|
|
},
|
|
{
|
|
"entropy": 5.986550569534302,
|
|
"epoch": 0.9965559872836454,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004907411472099672,
|
|
"loss": 5.756,
|
|
"mean_token_accuracy": 0.16745672076940538,
|
|
"num_tokens": 21039259.0,
|
|
"step": 11285
|
|
},
|
|
{
|
|
"entropy": 6.030724573135376,
|
|
"epoch": 0.9969975273754857,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004907322041955341,
|
|
"loss": 5.8425,
|
|
"mean_token_accuracy": 0.1665562704205513,
|
|
"num_tokens": 21049321.0,
|
|
"step": 11290
|
|
},
|
|
{
|
|
"entropy": 6.031997394561768,
|
|
"epoch": 0.997439067467326,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004907232569550298,
|
|
"loss": 5.9026,
|
|
"mean_token_accuracy": 0.15812076702713967,
|
|
"num_tokens": 21059469.0,
|
|
"step": 11295
|
|
},
|
|
{
|
|
"entropy": 6.024282503128052,
|
|
"epoch": 0.9978806075591664,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004907143054886297,
|
|
"loss": 5.8388,
|
|
"mean_token_accuracy": 0.16478039026260377,
|
|
"num_tokens": 21068694.0,
|
|
"step": 11300
|
|
},
|
|
{
|
|
"entropy": 6.020016002655029,
|
|
"epoch": 0.9983221476510067,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.000490705349796509,
|
|
"loss": 5.7754,
|
|
"mean_token_accuracy": 0.1716444805264473,
|
|
"num_tokens": 21077232.0,
|
|
"step": 11305
|
|
},
|
|
{
|
|
"entropy": 6.047950696945191,
|
|
"epoch": 0.9987636877428471,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004906963898788431,
|
|
"loss": 5.7888,
|
|
"mean_token_accuracy": 0.16969122588634492,
|
|
"num_tokens": 21086517.0,
|
|
"step": 11310
|
|
},
|
|
{
|
|
"entropy": 5.983556652069092,
|
|
"epoch": 0.9992052278346873,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004906874257358075,
|
|
"loss": 5.8444,
|
|
"mean_token_accuracy": 0.17099157720804214,
|
|
"num_tokens": 21096969.0,
|
|
"step": 11315
|
|
},
|
|
{
|
|
"entropy": 5.980427312850952,
|
|
"epoch": 0.9996467679265277,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.000490678457367578,
|
|
"loss": 5.7915,
|
|
"mean_token_accuracy": 0.16400003731250762,
|
|
"num_tokens": 21106554.0,
|
|
"step": 11320
|
|
},
|
|
{
|
|
"entropy": 6.017829036712646,
|
|
"epoch": 1.000088308018368,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004906694847743302,
|
|
"loss": 5.7764,
|
|
"mean_token_accuracy": 0.16667501181364058,
|
|
"num_tokens": 21114829.0,
|
|
"step": 11325
|
|
},
|
|
{
|
|
"entropy": 6.032284355163574,
|
|
"epoch": 1.0005298481102085,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004906605079562399,
|
|
"loss": 5.7214,
|
|
"mean_token_accuracy": 0.1728486344218254,
|
|
"num_tokens": 21123568.0,
|
|
"step": 11330
|
|
},
|
|
{
|
|
"entropy": 5.998350667953491,
|
|
"epoch": 1.0009713882020488,
|
|
"grad_norm": 2.703125,
|
|
"learning_rate": 0.0004906515269134827,
|
|
"loss": 5.7827,
|
|
"mean_token_accuracy": 0.16798163652420045,
|
|
"num_tokens": 21132811.0,
|
|
"step": 11335
|
|
},
|
|
{
|
|
"entropy": 5.996047115325927,
|
|
"epoch": 1.001412928293889,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004906425416462349,
|
|
"loss": 5.8278,
|
|
"mean_token_accuracy": 0.16611848026514053,
|
|
"num_tokens": 21141549.0,
|
|
"step": 11340
|
|
},
|
|
{
|
|
"entropy": 6.016817903518676,
|
|
"epoch": 1.0018544683857293,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004906335521546723,
|
|
"loss": 5.7236,
|
|
"mean_token_accuracy": 0.17209179252386092,
|
|
"num_tokens": 21150603.0,
|
|
"step": 11345
|
|
},
|
|
{
|
|
"entropy": 5.8438413619995115,
|
|
"epoch": 1.0022960084775698,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004906245584389712,
|
|
"loss": 5.5472,
|
|
"mean_token_accuracy": 0.18766358941793443,
|
|
"num_tokens": 21159842.0,
|
|
"step": 11350
|
|
},
|
|
{
|
|
"entropy": 5.922256517410278,
|
|
"epoch": 1.00273754856941,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004906155604993075,
|
|
"loss": 5.7026,
|
|
"mean_token_accuracy": 0.1780036374926567,
|
|
"num_tokens": 21169139.0,
|
|
"step": 11355
|
|
},
|
|
{
|
|
"entropy": 5.837229537963867,
|
|
"epoch": 1.0031790886612504,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004906065583358576,
|
|
"loss": 5.6767,
|
|
"mean_token_accuracy": 0.17509741634130477,
|
|
"num_tokens": 21178059.0,
|
|
"step": 11360
|
|
},
|
|
{
|
|
"entropy": 5.94249906539917,
|
|
"epoch": 1.0036206287530909,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 0.0004905975519487978,
|
|
"loss": 5.7086,
|
|
"mean_token_accuracy": 0.17003520131111144,
|
|
"num_tokens": 21187966.0,
|
|
"step": 11365
|
|
},
|
|
{
|
|
"entropy": 6.005231332778931,
|
|
"epoch": 1.0040621688449312,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004905885413383046,
|
|
"loss": 5.7315,
|
|
"mean_token_accuracy": 0.16974558532238007,
|
|
"num_tokens": 21197460.0,
|
|
"step": 11370
|
|
},
|
|
{
|
|
"entropy": 6.004400444030762,
|
|
"epoch": 1.0045037089367714,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004905795265045546,
|
|
"loss": 5.7716,
|
|
"mean_token_accuracy": 0.16958380937576295,
|
|
"num_tokens": 21206977.0,
|
|
"step": 11375
|
|
},
|
|
{
|
|
"entropy": 5.978320980072022,
|
|
"epoch": 1.0049452490286117,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004905705074477242,
|
|
"loss": 5.7035,
|
|
"mean_token_accuracy": 0.16527058482170104,
|
|
"num_tokens": 21216046.0,
|
|
"step": 11380
|
|
},
|
|
{
|
|
"entropy": 5.938219118118286,
|
|
"epoch": 1.0053867891204522,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004905614841679901,
|
|
"loss": 5.7557,
|
|
"mean_token_accuracy": 0.1728820785880089,
|
|
"num_tokens": 21225612.0,
|
|
"step": 11385
|
|
},
|
|
{
|
|
"entropy": 5.942444133758545,
|
|
"epoch": 1.0058283292122925,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.0004905524566655292,
|
|
"loss": 5.6532,
|
|
"mean_token_accuracy": 0.17415272444486618,
|
|
"num_tokens": 21233641.0,
|
|
"step": 11390
|
|
},
|
|
{
|
|
"entropy": 5.9322888374328615,
|
|
"epoch": 1.0062698693041328,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 0.0004905434249405183,
|
|
"loss": 5.7228,
|
|
"mean_token_accuracy": 0.1694304198026657,
|
|
"num_tokens": 21243444.0,
|
|
"step": 11395
|
|
},
|
|
{
|
|
"entropy": 5.924720191955567,
|
|
"epoch": 1.0067114093959733,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004905343889931342,
|
|
"loss": 5.6807,
|
|
"mean_token_accuracy": 0.17966057658195494,
|
|
"num_tokens": 21253473.0,
|
|
"step": 11400
|
|
},
|
|
{
|
|
"entropy": 5.926651477813721,
|
|
"epoch": 1.0071529494878135,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.000490525348823554,
|
|
"loss": 5.6558,
|
|
"mean_token_accuracy": 0.18528196215629578,
|
|
"num_tokens": 21262049.0,
|
|
"step": 11405
|
|
},
|
|
{
|
|
"entropy": 5.916750860214234,
|
|
"epoch": 1.0075944895796538,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004905163044319549,
|
|
"loss": 5.6808,
|
|
"mean_token_accuracy": 0.17246959656476973,
|
|
"num_tokens": 21270909.0,
|
|
"step": 11410
|
|
},
|
|
{
|
|
"entropy": 5.938456106185913,
|
|
"epoch": 1.008036029671494,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 0.0004905072558185139,
|
|
"loss": 5.7681,
|
|
"mean_token_accuracy": 0.17287649512290953,
|
|
"num_tokens": 21280503.0,
|
|
"step": 11415
|
|
},
|
|
{
|
|
"entropy": 6.024928188323974,
|
|
"epoch": 1.0084775697633346,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0004904982029834083,
|
|
"loss": 5.7682,
|
|
"mean_token_accuracy": 0.1688655585050583,
|
|
"num_tokens": 21289957.0,
|
|
"step": 11420
|
|
},
|
|
{
|
|
"entropy": 5.956943130493164,
|
|
"epoch": 1.0089191098551749,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004904891459268154,
|
|
"loss": 5.7434,
|
|
"mean_token_accuracy": 0.1691091001033783,
|
|
"num_tokens": 21298799.0,
|
|
"step": 11425
|
|
},
|
|
{
|
|
"entropy": 5.936780691146851,
|
|
"epoch": 1.0093606499470151,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0004904800846489128,
|
|
"loss": 5.7479,
|
|
"mean_token_accuracy": 0.16977997422218322,
|
|
"num_tokens": 21308251.0,
|
|
"step": 11430
|
|
},
|
|
{
|
|
"entropy": 6.027180004119873,
|
|
"epoch": 1.0098021900388556,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004904710191498779,
|
|
"loss": 5.7704,
|
|
"mean_token_accuracy": 0.16857996135950087,
|
|
"num_tokens": 21317705.0,
|
|
"step": 11435
|
|
},
|
|
{
|
|
"entropy": 5.952654600143433,
|
|
"epoch": 1.010243730130696,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0004904619494298882,
|
|
"loss": 5.744,
|
|
"mean_token_accuracy": 0.16214617639780043,
|
|
"num_tokens": 21326867.0,
|
|
"step": 11440
|
|
},
|
|
{
|
|
"entropy": 5.956965589523316,
|
|
"epoch": 1.0106852702225362,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004904528754891215,
|
|
"loss": 5.7537,
|
|
"mean_token_accuracy": 0.17429810911417007,
|
|
"num_tokens": 21336054.0,
|
|
"step": 11445
|
|
},
|
|
{
|
|
"entropy": 5.939871788024902,
|
|
"epoch": 1.0111268103143765,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004904437973277555,
|
|
"loss": 5.693,
|
|
"mean_token_accuracy": 0.17177852243185043,
|
|
"num_tokens": 21344457.0,
|
|
"step": 11450
|
|
},
|
|
{
|
|
"entropy": 6.033869028091431,
|
|
"epoch": 1.011568350406217,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004904347149459681,
|
|
"loss": 5.7803,
|
|
"mean_token_accuracy": 0.17063615769147872,
|
|
"num_tokens": 21353639.0,
|
|
"step": 11455
|
|
},
|
|
{
|
|
"entropy": 5.8367383003234865,
|
|
"epoch": 1.0120098904980572,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004904256283439371,
|
|
"loss": 5.5881,
|
|
"mean_token_accuracy": 0.1793370082974434,
|
|
"num_tokens": 21362005.0,
|
|
"step": 11460
|
|
},
|
|
{
|
|
"entropy": 5.990533638000488,
|
|
"epoch": 1.0124514305898975,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.0004904165375218405,
|
|
"loss": 5.756,
|
|
"mean_token_accuracy": 0.17171766459941865,
|
|
"num_tokens": 21371917.0,
|
|
"step": 11465
|
|
},
|
|
{
|
|
"entropy": 6.030205631256104,
|
|
"epoch": 1.012892970681738,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004904074424798565,
|
|
"loss": 5.7774,
|
|
"mean_token_accuracy": 0.16853656023740768,
|
|
"num_tokens": 21381500.0,
|
|
"step": 11470
|
|
},
|
|
{
|
|
"entropy": 5.9031003475189205,
|
|
"epoch": 1.0133345107735783,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004903983432181632,
|
|
"loss": 5.6027,
|
|
"mean_token_accuracy": 0.18293141275644303,
|
|
"num_tokens": 21391280.0,
|
|
"step": 11475
|
|
},
|
|
{
|
|
"entropy": 5.960421705245972,
|
|
"epoch": 1.0137760508654186,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004903892397369388,
|
|
"loss": 5.7214,
|
|
"mean_token_accuracy": 0.17304562628269196,
|
|
"num_tokens": 21400371.0,
|
|
"step": 11480
|
|
},
|
|
{
|
|
"entropy": 5.857281732559204,
|
|
"epoch": 1.0142175909572588,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004903801320363618,
|
|
"loss": 5.7377,
|
|
"mean_token_accuracy": 0.17329668998718262,
|
|
"num_tokens": 21410493.0,
|
|
"step": 11485
|
|
},
|
|
{
|
|
"entropy": 6.04967622756958,
|
|
"epoch": 1.0146591310490993,
|
|
"grad_norm": 5.09375,
|
|
"learning_rate": 0.0004903710201166105,
|
|
"loss": 5.8371,
|
|
"mean_token_accuracy": 0.15925003439188004,
|
|
"num_tokens": 21420516.0,
|
|
"step": 11490
|
|
},
|
|
{
|
|
"entropy": 6.017249536514282,
|
|
"epoch": 1.0151006711409396,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004903619039778634,
|
|
"loss": 5.7631,
|
|
"mean_token_accuracy": 0.16286804229021073,
|
|
"num_tokens": 21429614.0,
|
|
"step": 11495
|
|
},
|
|
{
|
|
"entropy": 5.978137826919555,
|
|
"epoch": 1.0155422112327799,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.000490352783620299,
|
|
"loss": 5.7261,
|
|
"mean_token_accuracy": 0.16925765424966813,
|
|
"num_tokens": 21438595.0,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"entropy": 6.019017267227173,
|
|
"epoch": 1.0159837513246204,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004903436590440961,
|
|
"loss": 5.758,
|
|
"mean_token_accuracy": 0.1755782648921013,
|
|
"num_tokens": 21447708.0,
|
|
"step": 11505
|
|
},
|
|
{
|
|
"entropy": 5.963498783111572,
|
|
"epoch": 1.0164252914164607,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004903345302494334,
|
|
"loss": 5.7184,
|
|
"mean_token_accuracy": 0.17244312018156052,
|
|
"num_tokens": 21457210.0,
|
|
"step": 11510
|
|
},
|
|
{
|
|
"entropy": 5.895744132995605,
|
|
"epoch": 1.016866831508301,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004903253972364897,
|
|
"loss": 5.7202,
|
|
"mean_token_accuracy": 0.16742899119853974,
|
|
"num_tokens": 21466247.0,
|
|
"step": 11515
|
|
},
|
|
{
|
|
"entropy": 5.960920906066894,
|
|
"epoch": 1.0173083716001412,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004903162600054439,
|
|
"loss": 5.6751,
|
|
"mean_token_accuracy": 0.17459392100572585,
|
|
"num_tokens": 21475634.0,
|
|
"step": 11520
|
|
},
|
|
{
|
|
"entropy": 6.021289539337158,
|
|
"epoch": 1.0177499116919817,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004903071185564751,
|
|
"loss": 5.8107,
|
|
"mean_token_accuracy": 0.17013828456401825,
|
|
"num_tokens": 21485523.0,
|
|
"step": 11525
|
|
},
|
|
{
|
|
"entropy": 5.998600625991822,
|
|
"epoch": 1.018191451783822,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004902979728897623,
|
|
"loss": 5.7563,
|
|
"mean_token_accuracy": 0.17158716171979904,
|
|
"num_tokens": 21494909.0,
|
|
"step": 11530
|
|
},
|
|
{
|
|
"entropy": 6.029300785064697,
|
|
"epoch": 1.0186329918756623,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004902888230054846,
|
|
"loss": 5.7698,
|
|
"mean_token_accuracy": 0.17082602232694627,
|
|
"num_tokens": 21504413.0,
|
|
"step": 11535
|
|
},
|
|
{
|
|
"entropy": 5.883685827255249,
|
|
"epoch": 1.0190745319675028,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004902796689038213,
|
|
"loss": 5.6508,
|
|
"mean_token_accuracy": 0.17973359674215317,
|
|
"num_tokens": 21514068.0,
|
|
"step": 11540
|
|
},
|
|
{
|
|
"entropy": 5.902773189544678,
|
|
"epoch": 1.019516072059343,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0004902705105849517,
|
|
"loss": 5.6712,
|
|
"mean_token_accuracy": 0.17633067667484284,
|
|
"num_tokens": 21523536.0,
|
|
"step": 11545
|
|
},
|
|
{
|
|
"entropy": 5.916357421875,
|
|
"epoch": 1.0199576121511833,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.0004902613480490553,
|
|
"loss": 5.7553,
|
|
"mean_token_accuracy": 0.16829731017351152,
|
|
"num_tokens": 21532554.0,
|
|
"step": 11550
|
|
},
|
|
{
|
|
"entropy": 5.904748344421387,
|
|
"epoch": 1.0203991522430236,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004902521812963116,
|
|
"loss": 5.6412,
|
|
"mean_token_accuracy": 0.17432893067598343,
|
|
"num_tokens": 21542508.0,
|
|
"step": 11555
|
|
},
|
|
{
|
|
"entropy": 6.001231288909912,
|
|
"epoch": 1.020840692334864,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 0.0004902430103268998,
|
|
"loss": 5.7336,
|
|
"mean_token_accuracy": 0.16080355644226074,
|
|
"num_tokens": 21551954.0,
|
|
"step": 11560
|
|
},
|
|
{
|
|
"entropy": 5.9578272819519045,
|
|
"epoch": 1.0212822324267044,
|
|
"grad_norm": 3.734375,
|
|
"learning_rate": 0.0004902338351410002,
|
|
"loss": 5.6766,
|
|
"mean_token_accuracy": 0.17667103558778763,
|
|
"num_tokens": 21561676.0,
|
|
"step": 11565
|
|
},
|
|
{
|
|
"entropy": 5.925234651565551,
|
|
"epoch": 1.0217237725185446,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 0.000490224655738792,
|
|
"loss": 5.7927,
|
|
"mean_token_accuracy": 0.1612503260374069,
|
|
"num_tokens": 21571916.0,
|
|
"step": 11570
|
|
},
|
|
{
|
|
"entropy": 5.990692281723023,
|
|
"epoch": 1.0221653126103851,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004902154721204552,
|
|
"loss": 5.7359,
|
|
"mean_token_accuracy": 0.1818292886018753,
|
|
"num_tokens": 21581892.0,
|
|
"step": 11575
|
|
},
|
|
{
|
|
"entropy": 5.984904384613037,
|
|
"epoch": 1.0226068527022254,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004902062842861696,
|
|
"loss": 5.6871,
|
|
"mean_token_accuracy": 0.17149700671434404,
|
|
"num_tokens": 21591390.0,
|
|
"step": 11580
|
|
},
|
|
{
|
|
"entropy": 5.986855792999267,
|
|
"epoch": 1.0230483927940657,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.0004901970922361154,
|
|
"loss": 5.7468,
|
|
"mean_token_accuracy": 0.18060247749090194,
|
|
"num_tokens": 21600766.0,
|
|
"step": 11585
|
|
},
|
|
{
|
|
"entropy": 5.908822011947632,
|
|
"epoch": 1.023489932885906,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004901878959704726,
|
|
"loss": 5.6908,
|
|
"mean_token_accuracy": 0.18095418363809584,
|
|
"num_tokens": 21609264.0,
|
|
"step": 11590
|
|
},
|
|
{
|
|
"entropy": 5.983219861984253,
|
|
"epoch": 1.0239314729777464,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004901786954894212,
|
|
"loss": 5.7692,
|
|
"mean_token_accuracy": 0.1688540369272232,
|
|
"num_tokens": 21618409.0,
|
|
"step": 11595
|
|
},
|
|
{
|
|
"entropy": 5.90957407951355,
|
|
"epoch": 1.0243730130695867,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004901694907931416,
|
|
"loss": 5.6592,
|
|
"mean_token_accuracy": 0.1680929258465767,
|
|
"num_tokens": 21628158.0,
|
|
"step": 11600
|
|
},
|
|
{
|
|
"entropy": 5.991635131835937,
|
|
"epoch": 1.024814553161427,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.000490160281881814,
|
|
"loss": 5.7875,
|
|
"mean_token_accuracy": 0.17219406813383104,
|
|
"num_tokens": 21637859.0,
|
|
"step": 11605
|
|
},
|
|
{
|
|
"entropy": 6.008704280853271,
|
|
"epoch": 1.0252560932532675,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004901510687556188,
|
|
"loss": 5.7749,
|
|
"mean_token_accuracy": 0.16721676737070085,
|
|
"num_tokens": 21647389.0,
|
|
"step": 11610
|
|
},
|
|
{
|
|
"entropy": 6.004774379730224,
|
|
"epoch": 1.0256976333451078,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.0004901418514147367,
|
|
"loss": 5.7352,
|
|
"mean_token_accuracy": 0.16795544773340226,
|
|
"num_tokens": 21655857.0,
|
|
"step": 11615
|
|
},
|
|
{
|
|
"entropy": 5.949306964874268,
|
|
"epoch": 1.026139173436948,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 0.000490132629859348,
|
|
"loss": 5.6933,
|
|
"mean_token_accuracy": 0.17819422036409377,
|
|
"num_tokens": 21666150.0,
|
|
"step": 11620
|
|
},
|
|
{
|
|
"entropy": 5.954881620407105,
|
|
"epoch": 1.0265807135287883,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.0004901234040896334,
|
|
"loss": 5.7126,
|
|
"mean_token_accuracy": 0.1698005437850952,
|
|
"num_tokens": 21675662.0,
|
|
"step": 11625
|
|
},
|
|
{
|
|
"entropy": 6.002849912643432,
|
|
"epoch": 1.0270222536206288,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004901141741057737,
|
|
"loss": 5.7774,
|
|
"mean_token_accuracy": 0.16578176319599153,
|
|
"num_tokens": 21685721.0,
|
|
"step": 11630
|
|
},
|
|
{
|
|
"entropy": 6.012971496582031,
|
|
"epoch": 1.027463793712469,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004901049399079495,
|
|
"loss": 5.7504,
|
|
"mean_token_accuracy": 0.1728194072842598,
|
|
"num_tokens": 21694507.0,
|
|
"step": 11635
|
|
},
|
|
{
|
|
"entropy": 5.99225344657898,
|
|
"epoch": 1.0279053338043094,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004900957014963421,
|
|
"loss": 5.7583,
|
|
"mean_token_accuracy": 0.18061111122369766,
|
|
"num_tokens": 21704092.0,
|
|
"step": 11640
|
|
},
|
|
{
|
|
"entropy": 5.809380340576172,
|
|
"epoch": 1.0283468738961499,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004900864588711321,
|
|
"loss": 5.6694,
|
|
"mean_token_accuracy": 0.18092458546161652,
|
|
"num_tokens": 21713313.0,
|
|
"step": 11645
|
|
},
|
|
{
|
|
"entropy": 5.9611798286437985,
|
|
"epoch": 1.0287884139879901,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004900772120325008,
|
|
"loss": 5.7366,
|
|
"mean_token_accuracy": 0.17457195073366166,
|
|
"num_tokens": 21721622.0,
|
|
"step": 11650
|
|
},
|
|
{
|
|
"entropy": 5.97767071723938,
|
|
"epoch": 1.0292299540798304,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.0004900679609806291,
|
|
"loss": 5.6707,
|
|
"mean_token_accuracy": 0.17381463348865508,
|
|
"num_tokens": 21730914.0,
|
|
"step": 11655
|
|
},
|
|
{
|
|
"entropy": 5.980806922912597,
|
|
"epoch": 1.0296714941716707,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004900587057156985,
|
|
"loss": 5.6692,
|
|
"mean_token_accuracy": 0.17887462824583053,
|
|
"num_tokens": 21740124.0,
|
|
"step": 11660
|
|
},
|
|
{
|
|
"entropy": 5.933705759048462,
|
|
"epoch": 1.0301130342635112,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004900494462378902,
|
|
"loss": 5.7525,
|
|
"mean_token_accuracy": 0.16688884198665618,
|
|
"num_tokens": 21750013.0,
|
|
"step": 11665
|
|
},
|
|
{
|
|
"entropy": 5.890878772735595,
|
|
"epoch": 1.0305545743553515,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004900401825473854,
|
|
"loss": 5.7507,
|
|
"mean_token_accuracy": 0.17413905560970305,
|
|
"num_tokens": 21758842.0,
|
|
"step": 11670
|
|
},
|
|
{
|
|
"entropy": 5.927459049224853,
|
|
"epoch": 1.0309961144471917,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004900309146443658,
|
|
"loss": 5.6536,
|
|
"mean_token_accuracy": 0.1740245521068573,
|
|
"num_tokens": 21767002.0,
|
|
"step": 11675
|
|
},
|
|
{
|
|
"entropy": 5.974818420410156,
|
|
"epoch": 1.0314376545390322,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0004900216425290128,
|
|
"loss": 5.7696,
|
|
"mean_token_accuracy": 0.16744465827941896,
|
|
"num_tokens": 21775372.0,
|
|
"step": 11680
|
|
},
|
|
{
|
|
"entropy": 5.955351209640503,
|
|
"epoch": 1.0318791946308725,
|
|
"grad_norm": 3.1875,
|
|
"learning_rate": 0.0004900123662015082,
|
|
"loss": 5.7685,
|
|
"mean_token_accuracy": 0.16843886077404022,
|
|
"num_tokens": 21784541.0,
|
|
"step": 11685
|
|
},
|
|
{
|
|
"entropy": 5.948904180526734,
|
|
"epoch": 1.0323207347227128,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004900030856620336,
|
|
"loss": 5.6529,
|
|
"mean_token_accuracy": 0.18304046094417573,
|
|
"num_tokens": 21794525.0,
|
|
"step": 11690
|
|
},
|
|
{
|
|
"entropy": 5.963394546508789,
|
|
"epoch": 1.032762274814553,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004899938009107708,
|
|
"loss": 5.758,
|
|
"mean_token_accuracy": 0.17228179275989533,
|
|
"num_tokens": 21804469.0,
|
|
"step": 11695
|
|
},
|
|
{
|
|
"entropy": 5.917418718338013,
|
|
"epoch": 1.0332038149063936,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004899845119479019,
|
|
"loss": 5.6985,
|
|
"mean_token_accuracy": 0.1747218430042267,
|
|
"num_tokens": 21813986.0,
|
|
"step": 11700
|
|
},
|
|
{
|
|
"entropy": 5.9892199516296385,
|
|
"epoch": 1.0336453549982338,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004899752187736085,
|
|
"loss": 5.7045,
|
|
"mean_token_accuracy": 0.17282750010490416,
|
|
"num_tokens": 21824083.0,
|
|
"step": 11705
|
|
},
|
|
{
|
|
"entropy": 5.891262626647949,
|
|
"epoch": 1.0340868950900741,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.0004899659213880728,
|
|
"loss": 5.6508,
|
|
"mean_token_accuracy": 0.1831414967775345,
|
|
"num_tokens": 21833526.0,
|
|
"step": 11710
|
|
},
|
|
{
|
|
"entropy": 5.944592094421386,
|
|
"epoch": 1.0345284351819146,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004899566197914771,
|
|
"loss": 5.6768,
|
|
"mean_token_accuracy": 0.17587029933929443,
|
|
"num_tokens": 21842537.0,
|
|
"step": 11715
|
|
},
|
|
{
|
|
"entropy": 5.985992670059204,
|
|
"epoch": 1.034969975273755,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.0004899473139840033,
|
|
"loss": 5.7212,
|
|
"mean_token_accuracy": 0.17308437824249268,
|
|
"num_tokens": 21851604.0,
|
|
"step": 11720
|
|
},
|
|
{
|
|
"entropy": 5.953021097183227,
|
|
"epoch": 1.0354115153655952,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004899380039658339,
|
|
"loss": 5.7686,
|
|
"mean_token_accuracy": 0.16968014240264892,
|
|
"num_tokens": 21861480.0,
|
|
"step": 11725
|
|
},
|
|
{
|
|
"entropy": 5.939228630065918,
|
|
"epoch": 1.0358530554574354,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004899286897371513,
|
|
"loss": 5.7478,
|
|
"mean_token_accuracy": 0.17233823537826537,
|
|
"num_tokens": 21870901.0,
|
|
"step": 11730
|
|
},
|
|
{
|
|
"entropy": 5.8766419887542725,
|
|
"epoch": 1.036294595549276,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.000489919371298138,
|
|
"loss": 5.6283,
|
|
"mean_token_accuracy": 0.18169922679662703,
|
|
"num_tokens": 21880491.0,
|
|
"step": 11735
|
|
},
|
|
{
|
|
"entropy": 5.890109491348267,
|
|
"epoch": 1.0367361356411162,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004899100486489763,
|
|
"loss": 5.6946,
|
|
"mean_token_accuracy": 0.17681025713682175,
|
|
"num_tokens": 21889833.0,
|
|
"step": 11740
|
|
},
|
|
{
|
|
"entropy": 5.895658111572265,
|
|
"epoch": 1.0371776757329565,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004899007217898491,
|
|
"loss": 5.7354,
|
|
"mean_token_accuracy": 0.17619029581546783,
|
|
"num_tokens": 21899222.0,
|
|
"step": 11745
|
|
},
|
|
{
|
|
"entropy": 6.013517904281616,
|
|
"epoch": 1.037619215824797,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004898913907209389,
|
|
"loss": 5.7485,
|
|
"mean_token_accuracy": 0.17189059853553773,
|
|
"num_tokens": 21908848.0,
|
|
"step": 11750
|
|
},
|
|
{
|
|
"entropy": 5.9578900814056395,
|
|
"epoch": 1.0380607559166373,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004898820554424285,
|
|
"loss": 5.6781,
|
|
"mean_token_accuracy": 0.16759795993566512,
|
|
"num_tokens": 21917867.0,
|
|
"step": 11755
|
|
},
|
|
{
|
|
"entropy": 5.976204299926758,
|
|
"epoch": 1.0385022960084775,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.000489872715954501,
|
|
"loss": 5.7706,
|
|
"mean_token_accuracy": 0.16450867652893067,
|
|
"num_tokens": 21927476.0,
|
|
"step": 11760
|
|
},
|
|
{
|
|
"entropy": 6.0786598205566404,
|
|
"epoch": 1.0389438361003178,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004898633722573391,
|
|
"loss": 5.8377,
|
|
"mean_token_accuracy": 0.1652936100959778,
|
|
"num_tokens": 21937068.0,
|
|
"step": 11765
|
|
},
|
|
{
|
|
"entropy": 5.9219505310058596,
|
|
"epoch": 1.0393853761921583,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.000489854024351126,
|
|
"loss": 5.7114,
|
|
"mean_token_accuracy": 0.17316484302282334,
|
|
"num_tokens": 21946144.0,
|
|
"step": 11770
|
|
},
|
|
{
|
|
"entropy": 5.972825527191162,
|
|
"epoch": 1.0398269162839986,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004898446722360446,
|
|
"loss": 5.6542,
|
|
"mean_token_accuracy": 0.17927841544151307,
|
|
"num_tokens": 21955346.0,
|
|
"step": 11775
|
|
},
|
|
{
|
|
"entropy": 5.917709684371948,
|
|
"epoch": 1.0402684563758389,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004898353159122784,
|
|
"loss": 5.7345,
|
|
"mean_token_accuracy": 0.1762217789888382,
|
|
"num_tokens": 21965034.0,
|
|
"step": 11780
|
|
},
|
|
{
|
|
"entropy": 5.915597438812256,
|
|
"epoch": 1.0407099964676794,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004898259553800106,
|
|
"loss": 5.7021,
|
|
"mean_token_accuracy": 0.17157121747732162,
|
|
"num_tokens": 21975167.0,
|
|
"step": 11785
|
|
},
|
|
{
|
|
"entropy": 6.001184701919556,
|
|
"epoch": 1.0411515365595196,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004898165906394244,
|
|
"loss": 5.7073,
|
|
"mean_token_accuracy": 0.17095027565956117,
|
|
"num_tokens": 21984372.0,
|
|
"step": 11790
|
|
},
|
|
{
|
|
"entropy": 5.937566328048706,
|
|
"epoch": 1.04159307665136,
|
|
"grad_norm": 3.234375,
|
|
"learning_rate": 0.0004898072216907033,
|
|
"loss": 5.7015,
|
|
"mean_token_accuracy": 0.17414507269859314,
|
|
"num_tokens": 21993513.0,
|
|
"step": 11795
|
|
},
|
|
{
|
|
"entropy": 5.936249399185181,
|
|
"epoch": 1.0420346167432002,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004897978485340311,
|
|
"loss": 5.7734,
|
|
"mean_token_accuracy": 0.16741596907377243,
|
|
"num_tokens": 22002593.0,
|
|
"step": 11800
|
|
},
|
|
{
|
|
"entropy": 6.005516290664673,
|
|
"epoch": 1.0424761568350407,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.000489788471169591,
|
|
"loss": 5.7907,
|
|
"mean_token_accuracy": 0.16941866427659988,
|
|
"num_tokens": 22012296.0,
|
|
"step": 11805
|
|
},
|
|
{
|
|
"entropy": 5.9861159324646,
|
|
"epoch": 1.042917696926881,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004897790895975671,
|
|
"loss": 5.8051,
|
|
"mean_token_accuracy": 0.16327016949653625,
|
|
"num_tokens": 22021267.0,
|
|
"step": 11810
|
|
},
|
|
{
|
|
"entropy": 5.980770444869995,
|
|
"epoch": 1.0433592370187212,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004897697038181428,
|
|
"loss": 5.7768,
|
|
"mean_token_accuracy": 0.1721298351883888,
|
|
"num_tokens": 22030878.0,
|
|
"step": 11815
|
|
},
|
|
{
|
|
"entropy": 5.923258638381958,
|
|
"epoch": 1.0438007771105617,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004897603138315022,
|
|
"loss": 5.7554,
|
|
"mean_token_accuracy": 0.17359738200902938,
|
|
"num_tokens": 22040245.0,
|
|
"step": 11820
|
|
},
|
|
{
|
|
"entropy": 5.975063180923462,
|
|
"epoch": 1.044242317202402,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004897509196378293,
|
|
"loss": 5.7395,
|
|
"mean_token_accuracy": 0.1754832774400711,
|
|
"num_tokens": 22050169.0,
|
|
"step": 11825
|
|
},
|
|
{
|
|
"entropy": 6.012202167510987,
|
|
"epoch": 1.0446838572942423,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 0.000489741521237308,
|
|
"loss": 5.7602,
|
|
"mean_token_accuracy": 0.17230364978313445,
|
|
"num_tokens": 22059168.0,
|
|
"step": 11830
|
|
},
|
|
{
|
|
"entropy": 5.910138893127441,
|
|
"epoch": 1.0451253973860826,
|
|
"grad_norm": 2.546875,
|
|
"learning_rate": 0.0004897321186301223,
|
|
"loss": 5.6953,
|
|
"mean_token_accuracy": 0.16782840788364412,
|
|
"num_tokens": 22068326.0,
|
|
"step": 11835
|
|
},
|
|
{
|
|
"entropy": 5.933950471878052,
|
|
"epoch": 1.045566937477923,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.0004897227118164566,
|
|
"loss": 5.7332,
|
|
"mean_token_accuracy": 0.17260353565216063,
|
|
"num_tokens": 22078026.0,
|
|
"step": 11840
|
|
},
|
|
{
|
|
"entropy": 5.97835111618042,
|
|
"epoch": 1.0460084775697633,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 0.000489713300796495,
|
|
"loss": 5.7714,
|
|
"mean_token_accuracy": 0.16629578769207,
|
|
"num_tokens": 22088230.0,
|
|
"step": 11845
|
|
},
|
|
{
|
|
"entropy": 5.906348705291748,
|
|
"epoch": 1.0464500176616036,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004897038855704221,
|
|
"loss": 5.5866,
|
|
"mean_token_accuracy": 0.18455611318349838,
|
|
"num_tokens": 22096667.0,
|
|
"step": 11850
|
|
},
|
|
{
|
|
"entropy": 5.899099445343017,
|
|
"epoch": 1.0468915577534441,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004896944661384222,
|
|
"loss": 5.7253,
|
|
"mean_token_accuracy": 0.1725448414683342,
|
|
"num_tokens": 22106871.0,
|
|
"step": 11855
|
|
},
|
|
{
|
|
"entropy": 5.929962873458862,
|
|
"epoch": 1.0473330978452844,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004896850425006797,
|
|
"loss": 5.6834,
|
|
"mean_token_accuracy": 0.17062857151031494,
|
|
"num_tokens": 22114954.0,
|
|
"step": 11860
|
|
},
|
|
{
|
|
"entropy": 5.962481164932251,
|
|
"epoch": 1.0477746379371247,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004896756146573794,
|
|
"loss": 5.7015,
|
|
"mean_token_accuracy": 0.17050741910934447,
|
|
"num_tokens": 22125293.0,
|
|
"step": 11865
|
|
},
|
|
{
|
|
"entropy": 6.005695104598999,
|
|
"epoch": 1.048216178028965,
|
|
"grad_norm": 2.46875,
|
|
"learning_rate": 0.0004896661826087059,
|
|
"loss": 5.7319,
|
|
"mean_token_accuracy": 0.17089567333459854,
|
|
"num_tokens": 22133731.0,
|
|
"step": 11870
|
|
},
|
|
{
|
|
"entropy": 5.907718896865845,
|
|
"epoch": 1.0486577181208054,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004896567463548439,
|
|
"loss": 5.732,
|
|
"mean_token_accuracy": 0.16580215096473694,
|
|
"num_tokens": 22143007.0,
|
|
"step": 11875
|
|
},
|
|
{
|
|
"entropy": 5.963164615631103,
|
|
"epoch": 1.0490992582126457,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004896473058959783,
|
|
"loss": 5.736,
|
|
"mean_token_accuracy": 0.17010460942983627,
|
|
"num_tokens": 22152044.0,
|
|
"step": 11880
|
|
},
|
|
{
|
|
"entropy": 6.050265264511109,
|
|
"epoch": 1.049540798304486,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004896378612322942,
|
|
"loss": 5.8042,
|
|
"mean_token_accuracy": 0.16925871819257737,
|
|
"num_tokens": 22161473.0,
|
|
"step": 11885
|
|
},
|
|
{
|
|
"entropy": 5.987928819656372,
|
|
"epoch": 1.0499823383963265,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004896284123639763,
|
|
"loss": 5.7358,
|
|
"mean_token_accuracy": 0.16610346138477325,
|
|
"num_tokens": 22171704.0,
|
|
"step": 11890
|
|
},
|
|
{
|
|
"entropy": 5.939279222488404,
|
|
"epoch": 1.0504238784881668,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00048961895929121,
|
|
"loss": 5.7271,
|
|
"mean_token_accuracy": 0.16976826190948485,
|
|
"num_tokens": 22180593.0,
|
|
"step": 11895
|
|
},
|
|
{
|
|
"entropy": 5.901366949081421,
|
|
"epoch": 1.050865418580007,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.0004896095020141802,
|
|
"loss": 5.6908,
|
|
"mean_token_accuracy": 0.17781361937522888,
|
|
"num_tokens": 22189552.0,
|
|
"step": 11900
|
|
},
|
|
{
|
|
"entropy": 5.954809141159058,
|
|
"epoch": 1.0513069586718473,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004896000405330724,
|
|
"loss": 5.7257,
|
|
"mean_token_accuracy": 0.16914680749177932,
|
|
"num_tokens": 22199314.0,
|
|
"step": 11905
|
|
},
|
|
{
|
|
"entropy": 5.900572395324707,
|
|
"epoch": 1.0517484987636878,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.000489590574848072,
|
|
"loss": 5.6724,
|
|
"mean_token_accuracy": 0.17794393748044968,
|
|
"num_tokens": 22208128.0,
|
|
"step": 11910
|
|
},
|
|
{
|
|
"entropy": 5.897657966613769,
|
|
"epoch": 1.052190038855528,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.000489581104959364,
|
|
"loss": 5.6537,
|
|
"mean_token_accuracy": 0.1821790114045143,
|
|
"num_tokens": 22217700.0,
|
|
"step": 11915
|
|
},
|
|
{
|
|
"entropy": 5.941938734054565,
|
|
"epoch": 1.0526315789473684,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004895716308671343,
|
|
"loss": 5.7075,
|
|
"mean_token_accuracy": 0.1744215413928032,
|
|
"num_tokens": 22225941.0,
|
|
"step": 11920
|
|
},
|
|
{
|
|
"entropy": 5.920284128189087,
|
|
"epoch": 1.0530731190392089,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0004895621525715685,
|
|
"loss": 5.6923,
|
|
"mean_token_accuracy": 0.17301340997219086,
|
|
"num_tokens": 22234935.0,
|
|
"step": 11925
|
|
},
|
|
{
|
|
"entropy": 5.934741544723511,
|
|
"epoch": 1.0535146591310491,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004895526700728521,
|
|
"loss": 5.7668,
|
|
"mean_token_accuracy": 0.17099023312330247,
|
|
"num_tokens": 22244087.0,
|
|
"step": 11930
|
|
},
|
|
{
|
|
"entropy": 5.9216385841369625,
|
|
"epoch": 1.0539561992228894,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.0004895431833711708,
|
|
"loss": 5.7086,
|
|
"mean_token_accuracy": 0.16913665533065797,
|
|
"num_tokens": 22254277.0,
|
|
"step": 11935
|
|
},
|
|
{
|
|
"entropy": 5.9343325138092045,
|
|
"epoch": 1.0543977393147297,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004895336924667107,
|
|
"loss": 5.6855,
|
|
"mean_token_accuracy": 0.17650166749954224,
|
|
"num_tokens": 22263150.0,
|
|
"step": 11940
|
|
},
|
|
{
|
|
"entropy": 5.946196985244751,
|
|
"epoch": 1.0548392794065702,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004895241973596576,
|
|
"loss": 5.6883,
|
|
"mean_token_accuracy": 0.1718298003077507,
|
|
"num_tokens": 22271973.0,
|
|
"step": 11945
|
|
},
|
|
{
|
|
"entropy": 5.884443902969361,
|
|
"epoch": 1.0552808194984105,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004895146980501973,
|
|
"loss": 5.6761,
|
|
"mean_token_accuracy": 0.17968894690275192,
|
|
"num_tokens": 22280637.0,
|
|
"step": 11950
|
|
},
|
|
{
|
|
"entropy": 5.889499378204346,
|
|
"epoch": 1.0557223595902507,
|
|
"grad_norm": 2.859375,
|
|
"learning_rate": 0.0004895051945385163,
|
|
"loss": 5.6969,
|
|
"mean_token_accuracy": 0.169900843501091,
|
|
"num_tokens": 22290601.0,
|
|
"step": 11955
|
|
},
|
|
{
|
|
"entropy": 5.975777435302734,
|
|
"epoch": 1.0561638996820912,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.0004894956868248004,
|
|
"loss": 5.7397,
|
|
"mean_token_accuracy": 0.17498461306095123,
|
|
"num_tokens": 22300212.0,
|
|
"step": 11960
|
|
},
|
|
{
|
|
"entropy": 6.008127927780151,
|
|
"epoch": 1.0566054397739315,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004894861749092359,
|
|
"loss": 5.7115,
|
|
"mean_token_accuracy": 0.17412784695625305,
|
|
"num_tokens": 22309171.0,
|
|
"step": 11965
|
|
},
|
|
{
|
|
"entropy": 5.898822069168091,
|
|
"epoch": 1.0570469798657718,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004894766587920094,
|
|
"loss": 5.7267,
|
|
"mean_token_accuracy": 0.18330826759338378,
|
|
"num_tokens": 22318457.0,
|
|
"step": 11970
|
|
},
|
|
{
|
|
"entropy": 5.986385107040405,
|
|
"epoch": 1.057488519957612,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.000489467138473307,
|
|
"loss": 5.747,
|
|
"mean_token_accuracy": 0.1688412234187126,
|
|
"num_tokens": 22327077.0,
|
|
"step": 11975
|
|
},
|
|
{
|
|
"entropy": 5.930625486373901,
|
|
"epoch": 1.0579300600494526,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004894576139533154,
|
|
"loss": 5.7338,
|
|
"mean_token_accuracy": 0.17051164656877518,
|
|
"num_tokens": 22336259.0,
|
|
"step": 11980
|
|
},
|
|
{
|
|
"entropy": 5.942082929611206,
|
|
"epoch": 1.0583716001412928,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.000489448085232221,
|
|
"loss": 5.7088,
|
|
"mean_token_accuracy": 0.18380282819271088,
|
|
"num_tokens": 22344978.0,
|
|
"step": 11985
|
|
},
|
|
{
|
|
"entropy": 5.929968166351318,
|
|
"epoch": 1.058813140233133,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004894385523102107,
|
|
"loss": 5.6986,
|
|
"mean_token_accuracy": 0.1768606498837471,
|
|
"num_tokens": 22353950.0,
|
|
"step": 11990
|
|
},
|
|
{
|
|
"entropy": 5.964371824264527,
|
|
"epoch": 1.0592546803249736,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004894290151874711,
|
|
"loss": 5.6826,
|
|
"mean_token_accuracy": 0.1693834498524666,
|
|
"num_tokens": 22363100.0,
|
|
"step": 11995
|
|
},
|
|
{
|
|
"entropy": 5.892598438262939,
|
|
"epoch": 1.0596962204168139,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000489419473864189,
|
|
"loss": 5.6899,
|
|
"mean_token_accuracy": 0.17939914166927337,
|
|
"num_tokens": 22371979.0,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"epoch": 1.0596962204168139,
|
|
"eval_entropy": 5.767613719840822,
|
|
"eval_loss": 5.8206987380981445,
|
|
"eval_mean_token_accuracy": 0.1758889344688568,
|
|
"eval_num_tokens": 22371979.0,
|
|
"eval_runtime": 26.119,
|
|
"eval_samples_per_second": 1352.083,
|
|
"eval_steps_per_second": 169.034,
|
|
"step": 12000
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 113230,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 10,
|
|
"save_steps": 3000,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 3.314235278592e+16,
|
|
"train_batch_size": 16,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|