{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 500,
"global_step": 522,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.5506134033203125,
"epoch": 0.011494252873563218,
"grad_norm": 384.0606248233174,
"learning_rate": 0.0,
"loss": 8.318,
"mean_token_accuracy": 0.0,
"num_tokens": 849672.0,
"step": 1
},
{
"entropy": 0.5534515380859375,
"epoch": 0.022988505747126436,
"grad_norm": 387.6767837142096,
"learning_rate": 1.8518518518518518e-07,
"loss": 8.3232,
"mean_token_accuracy": 0.0,
"num_tokens": 1671243.0,
"step": 2
},
{
"entropy": 0.5354843139648438,
"epoch": 0.034482758620689655,
"grad_norm": 383.45390757110886,
"learning_rate": 3.7037037037037036e-07,
"loss": 8.2998,
"mean_token_accuracy": 0.0,
"num_tokens": 2542121.0,
"step": 3
},
{
"entropy": 0.552337646484375,
"epoch": 0.04597701149425287,
"grad_norm": 390.2053728104676,
"learning_rate": 5.555555555555555e-07,
"loss": 8.2361,
"mean_token_accuracy": 0.0,
"num_tokens": 3370166.0,
"step": 4
},
{
"entropy": 0.5463790893554688,
"epoch": 0.05747126436781609,
"grad_norm": 396.6880182595508,
"learning_rate": 7.407407407407407e-07,
"loss": 8.0897,
"mean_token_accuracy": 0.0,
"num_tokens": 4194809.0,
"step": 5
},
{
"entropy": 0.5587310791015625,
"epoch": 0.06896551724137931,
"grad_norm": 394.428520935306,
"learning_rate": 9.259259259259259e-07,
"loss": 8.0395,
"mean_token_accuracy": 0.0,
"num_tokens": 5001673.0,
"step": 6
},
{
"entropy": 0.5570907592773438,
"epoch": 0.08045977011494253,
"grad_norm": 400.1266091194488,
"learning_rate": 1.111111111111111e-06,
"loss": 7.3825,
"mean_token_accuracy": 0.0,
"num_tokens": 5831972.0,
"step": 7
},
{
"entropy": 0.5578536987304688,
"epoch": 0.09195402298850575,
"grad_norm": 268.1695863511667,
"learning_rate": 1.2962962962962962e-06,
"loss": 5.8784,
"mean_token_accuracy": 0.0013020833721384406,
"num_tokens": 6655190.0,
"step": 8
},
{
"entropy": 0.5546417236328125,
"epoch": 0.10344827586206896,
"grad_norm": 228.00484704024294,
"learning_rate": 1.4814814814814815e-06,
"loss": 5.6001,
"mean_token_accuracy": 0.0026041667442768812,
"num_tokens": 7497377.0,
"step": 9
},
{
"entropy": 0.5494537353515625,
"epoch": 0.11494252873563218,
"grad_norm": 189.63353230439253,
"learning_rate": 1.6666666666666667e-06,
"loss": 5.2696,
"mean_token_accuracy": 0.01822916720993817,
"num_tokens": 8345673.0,
"step": 10
},
{
"entropy": 0.5466766357421875,
"epoch": 0.12643678160919541,
"grad_norm": 102.52439706164563,
"learning_rate": 1.8518518518518519e-06,
"loss": 4.1157,
"mean_token_accuracy": 0.5065104317618534,
"num_tokens": 9206475.0,
"step": 11
},
{
"entropy": 0.5690078735351562,
"epoch": 0.13793103448275862,
"grad_norm": 97.09612459227306,
"learning_rate": 2.037037037037037e-06,
"loss": 4.027,
"mean_token_accuracy": 0.5195312654832378,
"num_tokens": 10013439.0,
"step": 12
},
{
"entropy": 0.5670013427734375,
"epoch": 0.14942528735632185,
"grad_norm": 81.99594392583451,
"learning_rate": 2.222222222222222e-06,
"loss": 3.8264,
"mean_token_accuracy": 0.5065104317618534,
"num_tokens": 10837729.0,
"step": 13
},
{
"entropy": 0.5586013793945312,
"epoch": 0.16091954022988506,
"grad_norm": 74.57440962105724,
"learning_rate": 2.4074074074074075e-06,
"loss": 3.711,
"mean_token_accuracy": 0.5039062650175765,
"num_tokens": 11666285.0,
"step": 14
},
{
"entropy": 0.5576553344726562,
"epoch": 0.1724137931034483,
"grad_norm": 59.69190752157155,
"learning_rate": 2.5925925925925925e-06,
"loss": 3.2731,
"mean_token_accuracy": 0.505208348389715,
"num_tokens": 12488399.0,
"step": 15
},
{
"entropy": 0.5660324096679688,
"epoch": 0.1839080459770115,
"grad_norm": 58.640319989957504,
"learning_rate": 2.7777777777777783e-06,
"loss": 3.2084,
"mean_token_accuracy": 0.5039062650175765,
"num_tokens": 13285405.0,
"step": 16
},
{
"entropy": 0.5326614379882812,
"epoch": 0.19540229885057472,
"grad_norm": 58.02496291902121,
"learning_rate": 2.962962962962963e-06,
"loss": 3.1538,
"mean_token_accuracy": 0.5312500158324838,
"num_tokens": 14169920.0,
"step": 17
},
{
"entropy": 0.5579299926757812,
"epoch": 0.20689655172413793,
"grad_norm": 57.3541740444759,
"learning_rate": 3.1481481481481483e-06,
"loss": 3.0936,
"mean_token_accuracy": 0.5325520992046222,
"num_tokens": 14983623.0,
"step": 18
},
{
"entropy": 0.552978515625,
"epoch": 0.21839080459770116,
"grad_norm": 57.41173653229489,
"learning_rate": 3.3333333333333333e-06,
"loss": 3.0535,
"mean_token_accuracy": 0.5338541825767606,
"num_tokens": 15831705.0,
"step": 19
},
{
"entropy": 0.5468978881835938,
"epoch": 0.22988505747126436,
"grad_norm": 57.85464581418466,
"learning_rate": 3.5185185185185187e-06,
"loss": 2.9633,
"mean_token_accuracy": 0.5455729329260066,
"num_tokens": 16671559.0,
"step": 20
},
{
"entropy": 0.543548583984375,
"epoch": 0.2413793103448276,
"grad_norm": 57.14746959353303,
"learning_rate": 3.7037037037037037e-06,
"loss": 2.917,
"mean_token_accuracy": 0.5638021001359448,
"num_tokens": 17493335.0,
"step": 21
},
{
"entropy": 0.5303497314453125,
"epoch": 0.25287356321839083,
"grad_norm": 61.39811744800574,
"learning_rate": 3.88888888888889e-06,
"loss": 2.9148,
"mean_token_accuracy": 0.5299479324603453,
"num_tokens": 18340307.0,
"step": 22
},
{
"entropy": 0.5362396240234375,
"epoch": 0.26436781609195403,
"grad_norm": 60.342413701495495,
"learning_rate": 4.074074074074074e-06,
"loss": 2.9023,
"mean_token_accuracy": 0.5455729329260066,
"num_tokens": 19185926.0,
"step": 23
},
{
"entropy": 0.5345382690429688,
"epoch": 0.27586206896551724,
"grad_norm": 58.48140818561053,
"learning_rate": 4.2592592592592596e-06,
"loss": 2.8588,
"mean_token_accuracy": 0.558593766647391,
"num_tokens": 20039684.0,
"step": 24
},
{
"entropy": 0.5341339111328125,
"epoch": 0.28735632183908044,
"grad_norm": 57.45969923517991,
"learning_rate": 4.444444444444444e-06,
"loss": 2.8357,
"mean_token_accuracy": 0.5494791830424219,
"num_tokens": 20867602.0,
"step": 25
},
{
"entropy": 0.543975830078125,
"epoch": 0.2988505747126437,
"grad_norm": 58.24286332124725,
"learning_rate": 4.62962962962963e-06,
"loss": 2.8104,
"mean_token_accuracy": 0.5325520992046222,
"num_tokens": 21691039.0,
"step": 26
},
{
"entropy": 0.5395126342773438,
"epoch": 0.3103448275862069,
"grad_norm": 57.74485187701231,
"learning_rate": 4.814814814814815e-06,
"loss": 2.7818,
"mean_token_accuracy": 0.5468750162981451,
"num_tokens": 22501019.0,
"step": 27
},
{
"entropy": 0.529571533203125,
"epoch": 0.3218390804597701,
"grad_norm": 57.27325469074225,
"learning_rate": 5e-06,
"loss": 2.7382,
"mean_token_accuracy": 0.5598958500195295,
"num_tokens": 23335788.0,
"step": 28
},
{
"entropy": 0.5239486694335938,
"epoch": 0.3333333333333333,
"grad_norm": 57.68762700547809,
"learning_rate": 4.999949650182267e-06,
"loss": 2.7075,
"mean_token_accuracy": 0.5611979333916679,
"num_tokens": 24179389.0,
"step": 29
},
{
"entropy": 0.526702880859375,
"epoch": 0.3448275862068966,
"grad_norm": 57.52243272097885,
"learning_rate": 4.999798602757149e-06,
"loss": 2.672,
"mean_token_accuracy": 0.5716146003687754,
"num_tokens": 25001444.0,
"step": 30
},
{
"entropy": 0.5177001953125,
"epoch": 0.3563218390804598,
"grad_norm": 58.79536537541134,
"learning_rate": 4.999546863808815e-06,
"loss": 2.6479,
"mean_token_accuracy": 0.5533854331588373,
"num_tokens": 25869769.0,
"step": 31
},
{
"entropy": 0.515228271484375,
"epoch": 0.367816091954023,
"grad_norm": 57.98864616043999,
"learning_rate": 4.999194443477273e-06,
"loss": 2.627,
"mean_token_accuracy": 0.5546875165309757,
"num_tokens": 26716768.0,
"step": 32
},
{
"entropy": 0.5211105346679688,
"epoch": 0.3793103448275862,
"grad_norm": 58.71523852821683,
"learning_rate": 4.998741355957963e-06,
"loss": 2.5927,
"mean_token_accuracy": 0.5638021001359448,
"num_tokens": 27545869.0,
"step": 33
},
{
"entropy": 0.5121917724609375,
"epoch": 0.39080459770114945,
"grad_norm": 58.704187181529285,
"learning_rate": 4.998187619501185e-06,
"loss": 2.5905,
"mean_token_accuracy": 0.5455729329260066,
"num_tokens": 28399163.0,
"step": 34
},
{
"entropy": 0.5317153930664062,
"epoch": 0.40229885057471265,
"grad_norm": 58.90319706198557,
"learning_rate": 4.99753325641136e-06,
"loss": 2.5505,
"mean_token_accuracy": 0.5742187671130523,
"num_tokens": 29189974.0,
"step": 35
},
{
"entropy": 0.5278091430664062,
"epoch": 0.41379310344827586,
"grad_norm": 59.06761043984239,
"learning_rate": 4.9967782930461405e-06,
"loss": 2.5229,
"mean_token_accuracy": 0.5690104336244985,
"num_tokens": 30025939.0,
"step": 36
},
{
"entropy": 0.5297012329101562,
"epoch": 0.42528735632183906,
"grad_norm": 59.08737592365927,
"learning_rate": 4.9959227598153395e-06,
"loss": 2.4795,
"mean_token_accuracy": 0.5638021001359448,
"num_tokens": 30858152.0,
"step": 37
},
{
"entropy": 0.5217742919921875,
"epoch": 0.4367816091954023,
"grad_norm": 59.20003475363042,
"learning_rate": 4.994966691179712e-06,
"loss": 2.4737,
"mean_token_accuracy": 0.570312516996637,
"num_tokens": 31702950.0,
"step": 38
},
{
"entropy": 0.53533935546875,
"epoch": 0.4482758620689655,
"grad_norm": 59.67146679766626,
"learning_rate": 4.993910125649561e-06,
"loss": 2.4354,
"mean_token_accuracy": 0.5846354340901598,
"num_tokens": 32533459.0,
"step": 39
},
{
"entropy": 0.5344924926757812,
"epoch": 0.45977011494252873,
"grad_norm": 59.396870663210215,
"learning_rate": 4.992753105783194e-06,
"loss": 2.4049,
"mean_token_accuracy": 0.5872396008344367,
"num_tokens": 33344480.0,
"step": 40
},
{
"entropy": 0.5267333984375,
"epoch": 0.47126436781609193,
"grad_norm": 60.097920183732995,
"learning_rate": 4.991495678185202e-06,
"loss": 2.3931,
"mean_token_accuracy": 0.5729166837409139,
"num_tokens": 34168826.0,
"step": 41
},
{
"entropy": 0.5363311767578125,
"epoch": 0.4827586206896552,
"grad_norm": 60.149361087445314,
"learning_rate": 4.990137893504585e-06,
"loss": 2.3507,
"mean_token_accuracy": 0.5950521010672674,
"num_tokens": 34985268.0,
"step": 42
},
{
"entropy": 0.5448989868164062,
"epoch": 0.4942528735632184,
"grad_norm": 60.10877790561801,
"learning_rate": 4.988679806432712e-06,
"loss": 2.3546,
"mean_token_accuracy": 0.5768229338573292,
"num_tokens": 35795324.0,
"step": 43
},
{
"entropy": 0.5362472534179688,
"epoch": 0.5057471264367817,
"grad_norm": 62.17298377637022,
"learning_rate": 4.987121475701118e-06,
"loss": 2.336,
"mean_token_accuracy": 0.5807291838573292,
"num_tokens": 36626952.0,
"step": 44
},
{
"entropy": 0.5361404418945312,
"epoch": 0.5172413793103449,
"grad_norm": 60.1281035244482,
"learning_rate": 4.985462964079137e-06,
"loss": 2.3124,
"mean_token_accuracy": 0.6627604304812849,
"num_tokens": 37460441.0,
"step": 45
},
{
"entropy": 0.5405349731445312,
"epoch": 0.5287356321839081,
"grad_norm": 61.52627035325762,
"learning_rate": 4.983704338371375e-06,
"loss": 2.304,
"mean_token_accuracy": 0.8281250086147338,
"num_tokens": 38327419.0,
"step": 46
},
{
"entropy": 0.5349044799804688,
"epoch": 0.5402298850574713,
"grad_norm": 60.7002499169142,
"learning_rate": 4.981845669415022e-06,
"loss": 2.2617,
"mean_token_accuracy": 0.9153645883779973,
"num_tokens": 39178646.0,
"step": 47
},
{
"entropy": 0.5537567138671875,
"epoch": 0.5517241379310345,
"grad_norm": 60.72677769538659,
"learning_rate": 4.9798870320769884e-06,
"loss": 2.2418,
"mean_token_accuracy": 0.9036458390764892,
"num_tokens": 39996115.0,
"step": 48
},
{
"entropy": 0.543853759765625,
"epoch": 0.5632183908045977,
"grad_norm": 60.797281222128134,
"learning_rate": 4.977828505250903e-06,
"loss": 2.221,
"mean_token_accuracy": 0.8958333395421505,
"num_tokens": 40816594.0,
"step": 49
},
{
"entropy": 0.549407958984375,
"epoch": 0.5747126436781609,
"grad_norm": 60.30979043148616,
"learning_rate": 4.975670171853926e-06,
"loss": 2.1833,
"mean_token_accuracy": 0.9205729214008898,
"num_tokens": 41637390.0,
"step": 50
},
{
"entropy": 0.5470504760742188,
"epoch": 0.5862068965517241,
"grad_norm": 60.72587739612534,
"learning_rate": 4.9734121188234115e-06,
"loss": 2.1529,
"mean_token_accuracy": 0.9114583386108279,
"num_tokens": 42461263.0,
"step": 51
},
{
"entropy": 0.5331497192382812,
"epoch": 0.5977011494252874,
"grad_norm": 60.23446589036639,
"learning_rate": 4.971054437113406e-06,
"loss": 2.1406,
"mean_token_accuracy": 0.9036458390764892,
"num_tokens": 43324395.0,
"step": 52
},
{
"entropy": 0.5502243041992188,
"epoch": 0.6091954022988506,
"grad_norm": 59.73550494985957,
"learning_rate": 4.968597221690986e-06,
"loss": 2.0925,
"mean_token_accuracy": 0.9283854209352285,
"num_tokens": 44144015.0,
"step": 53
},
{
"entropy": 0.55364990234375,
"epoch": 0.6206896551724138,
"grad_norm": 59.90243880461605,
"learning_rate": 4.96604057153243e-06,
"loss": 2.0862,
"mean_token_accuracy": 0.8984375060535967,
"num_tokens": 44968457.0,
"step": 54
},
{
"entropy": 0.5594482421875,
"epoch": 0.632183908045977,
"grad_norm": 59.85087788955637,
"learning_rate": 4.963384589619233e-06,
"loss": 2.0512,
"mean_token_accuracy": 0.923177087912336,
"num_tokens": 45771639.0,
"step": 55
},
{
"entropy": 0.5338211059570312,
"epoch": 0.6436781609195402,
"grad_norm": 59.685128376437696,
"learning_rate": 4.960629382933959e-06,
"loss": 2.0163,
"mean_token_accuracy": 0.9179687548894435,
"num_tokens": 46631555.0,
"step": 56
},
{
"entropy": 0.55718994140625,
"epoch": 0.6551724137931034,
"grad_norm": 59.46218131066281,
"learning_rate": 4.957775062455933e-06,
"loss": 1.9855,
"mean_token_accuracy": 0.923177087912336,
"num_tokens": 47425135.0,
"step": 57
},
{
"entropy": 0.53204345703125,
"epoch": 0.6666666666666666,
"grad_norm": 59.66591139674002,
"learning_rate": 4.9548217431567665e-06,
"loss": 1.9881,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 48282128.0,
"step": 58
},
{
"entropy": 0.5321273803710938,
"epoch": 0.6781609195402298,
"grad_norm": 59.413094982042885,
"learning_rate": 4.951769543995731e-06,
"loss": 1.951,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 49150035.0,
"step": 59
},
{
"entropy": 0.552520751953125,
"epoch": 0.6896551724137931,
"grad_norm": 58.95179931513381,
"learning_rate": 4.948618587914963e-06,
"loss": 1.9159,
"mean_token_accuracy": 0.923177087912336,
"num_tokens": 49967970.0,
"step": 60
},
{
"entropy": 0.5478515625,
"epoch": 0.7011494252873564,
"grad_norm": 58.78293117045981,
"learning_rate": 4.9453690018345144e-06,
"loss": 1.8829,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 50796138.0,
"step": 61
},
{
"entropy": 0.55072021484375,
"epoch": 0.7126436781609196,
"grad_norm": 58.7211077235095,
"learning_rate": 4.9420209166472386e-06,
"loss": 1.8573,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 51611865.0,
"step": 62
},
{
"entropy": 0.5400009155273438,
"epoch": 0.7241379310344828,
"grad_norm": 61.677454113756916,
"learning_rate": 4.938574467213519e-06,
"loss": 1.8451,
"mean_token_accuracy": 0.9101562553551048,
"num_tokens": 52450130.0,
"step": 63
},
{
"entropy": 0.551239013671875,
"epoch": 0.735632183908046,
"grad_norm": 58.55481924234654,
"learning_rate": 4.935029792355834e-06,
"loss": 1.827,
"mean_token_accuracy": 0.9205729214008898,
"num_tokens": 53249566.0,
"step": 64
},
{
"entropy": 0.545135498046875,
"epoch": 0.7471264367816092,
"grad_norm": 58.16487922973258,
"learning_rate": 4.931387034853173e-06,
"loss": 1.8032,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 54077184.0,
"step": 65
},
{
"entropy": 0.5316085815429688,
"epoch": 0.7586206896551724,
"grad_norm": 58.576343100426335,
"learning_rate": 4.927646341435276e-06,
"loss": 1.7661,
"mean_token_accuracy": 0.9179687548894435,
"num_tokens": 54933670.0,
"step": 66
},
{
"entropy": 0.5523452758789062,
"epoch": 0.7701149425287356,
"grad_norm": 58.08683764205428,
"learning_rate": 4.9238078627767285e-06,
"loss": 1.7396,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 55733802.0,
"step": 67
},
{
"entropy": 0.551300048828125,
"epoch": 0.7816091954022989,
"grad_norm": 58.08491660003921,
"learning_rate": 4.919871753490892e-06,
"loss": 1.6895,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 56526451.0,
"step": 68
},
{
"entropy": 0.5546722412109375,
"epoch": 0.7931034482758621,
"grad_norm": 58.51853320871066,
"learning_rate": 4.9158381721236715e-06,
"loss": 1.6761,
"mean_token_accuracy": 0.9205729214008898,
"num_tokens": 57305070.0,
"step": 69
},
{
"entropy": 0.5334014892578125,
"epoch": 0.8045977011494253,
"grad_norm": 58.170621114524614,
"learning_rate": 4.91170728114714e-06,
"loss": 1.6546,
"mean_token_accuracy": 0.9244791711680591,
"num_tokens": 58169562.0,
"step": 70
},
{
"entropy": 0.5419540405273438,
"epoch": 0.8160919540229885,
"grad_norm": 58.528006132883995,
"learning_rate": 4.907479246952981e-06,
"loss": 1.6312,
"mean_token_accuracy": 0.923177087912336,
"num_tokens": 58988176.0,
"step": 71
},
{
"entropy": 0.5484161376953125,
"epoch": 0.8275862068965517,
"grad_norm": 58.636271269409825,
"learning_rate": 4.903154239845798e-06,
"loss": 1.5899,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 59799834.0,
"step": 72
},
{
"entropy": 0.5280075073242188,
"epoch": 0.8390804597701149,
"grad_norm": 58.5835216129002,
"learning_rate": 4.8987324340362445e-06,
"loss": 1.5841,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 60676006.0,
"step": 73
},
{
"entropy": 0.5616531372070312,
"epoch": 0.8505747126436781,
"grad_norm": 60.89085729912909,
"learning_rate": 4.894214007634014e-06,
"loss": 1.5472,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 61467427.0,
"step": 74
},
{
"entropy": 0.5347137451171875,
"epoch": 0.8620689655172413,
"grad_norm": 59.747068058722576,
"learning_rate": 4.889599142640663e-06,
"loss": 1.5215,
"mean_token_accuracy": 0.9088541720993817,
"num_tokens": 62295438.0,
"step": 75
},
{
"entropy": 0.5299224853515625,
"epoch": 0.8735632183908046,
"grad_norm": 58.493866078976886,
"learning_rate": 4.884888024942282e-06,
"loss": 1.4759,
"mean_token_accuracy": 0.9179687548894435,
"num_tokens": 63166965.0,
"step": 76
},
{
"entropy": 0.54058837890625,
"epoch": 0.8850574712643678,
"grad_norm": 60.16738350731097,
"learning_rate": 4.880080844302004e-06,
"loss": 1.4711,
"mean_token_accuracy": 0.923177087912336,
"num_tokens": 64006639.0,
"step": 77
},
{
"entropy": 0.5329971313476562,
"epoch": 0.896551724137931,
"grad_norm": 59.15598057345835,
"learning_rate": 4.875177794352364e-06,
"loss": 1.4256,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 64880312.0,
"step": 78
},
{
"entropy": 0.5373687744140625,
"epoch": 0.9080459770114943,
"grad_norm": 59.64780911126226,
"learning_rate": 4.870179072587499e-06,
"loss": 1.4136,
"mean_token_accuracy": 0.9114583386108279,
"num_tokens": 65737156.0,
"step": 79
},
{
"entropy": 0.5340957641601562,
"epoch": 0.9195402298850575,
"grad_norm": 59.31089224424462,
"learning_rate": 4.865084880355193e-06,
"loss": 1.366,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 66600246.0,
"step": 80
},
{
"entropy": 0.5457611083984375,
"epoch": 0.9310344827586207,
"grad_norm": 58.67365335737598,
"learning_rate": 4.859895422848767e-06,
"loss": 1.3352,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 67409974.0,
"step": 81
},
{
"entropy": 0.5508956909179688,
"epoch": 0.9425287356321839,
"grad_norm": 59.941253952471996,
"learning_rate": 4.854610909098813e-06,
"loss": 1.3101,
"mean_token_accuracy": 0.9388020869810134,
"num_tokens": 68213017.0,
"step": 82
},
{
"entropy": 0.545867919921875,
"epoch": 0.9540229885057471,
"grad_norm": 58.22197118807192,
"learning_rate": 4.849231551964771e-06,
"loss": 1.2806,
"mean_token_accuracy": 0.9335937539581209,
"num_tokens": 69047530.0,
"step": 83
},
{
"entropy": 0.5390625,
"epoch": 0.9655172413793104,
"grad_norm": 58.73986491110616,
"learning_rate": 4.843757568126366e-06,
"loss": 1.2614,
"mean_token_accuracy": 0.9309895874466747,
"num_tokens": 69871145.0,
"step": 84
},
{
"entropy": 0.57440185546875,
"epoch": 0.9770114942528736,
"grad_norm": 58.0306658042546,
"learning_rate": 4.838189178074867e-06,
"loss": 1.2314,
"mean_token_accuracy": 0.9257812544237822,
"num_tokens": 70637600.0,
"step": 85
},
{
"entropy": 0.5460357666015625,
"epoch": 0.9885057471264368,
"grad_norm": 57.84110468380361,
"learning_rate": 4.832526606104213e-06,
"loss": 1.1956,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 71459220.0,
"step": 86
},
{
"entropy": 0.5465316772460938,
"epoch": 1.0,
"grad_norm": 57.45695249217127,
"learning_rate": 4.826770080301978e-06,
"loss": 1.1805,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 72277431.0,
"step": 87
},
{
"entropy": 0.5504074096679688,
"epoch": 1.0114942528735633,
"grad_norm": 57.11647405728569,
"learning_rate": 4.8209198325401815e-06,
"loss": 1.1402,
"mean_token_accuracy": 0.9361979204695672,
"num_tokens": 73118155.0,
"step": 88
},
{
"entropy": 0.5403900146484375,
"epoch": 1.0229885057471264,
"grad_norm": 57.58423055463277,
"learning_rate": 4.814976098465951e-06,
"loss": 1.1167,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 73960483.0,
"step": 89
},
{
"entropy": 0.5394744873046875,
"epoch": 1.0344827586206897,
"grad_norm": 56.91664155850013,
"learning_rate": 4.808939117492028e-06,
"loss": 1.0679,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 74806371.0,
"step": 90
},
{
"entropy": 0.5373764038085938,
"epoch": 1.0459770114942528,
"grad_norm": 57.550759721577286,
"learning_rate": 4.802809132787125e-06,
"loss": 1.061,
"mean_token_accuracy": 0.9388020869810134,
"num_tokens": 75649917.0,
"step": 91
},
{
"entropy": 0.542877197265625,
"epoch": 1.0574712643678161,
"grad_norm": 56.581488938244036,
"learning_rate": 4.796586391266135e-06,
"loss": 1.0517,
"mean_token_accuracy": 0.9270833376795053,
"num_tokens": 76503812.0,
"step": 92
},
{
"entropy": 0.551666259765625,
"epoch": 1.0689655172413792,
"grad_norm": 57.7727620529194,
"learning_rate": 4.790271143580174e-06,
"loss": 1.0115,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 77300193.0,
"step": 93
},
{
"entropy": 0.5273056030273438,
"epoch": 1.0804597701149425,
"grad_norm": 56.45974821885547,
"learning_rate": 4.783863644106502e-06,
"loss": 0.9868,
"mean_token_accuracy": 0.9361979204695672,
"num_tokens": 78158080.0,
"step": 94
},
{
"entropy": 0.545013427734375,
"epoch": 1.0919540229885056,
"grad_norm": 57.95646226031294,
"learning_rate": 4.777364150938263e-06,
"loss": 0.967,
"mean_token_accuracy": 0.9244791711680591,
"num_tokens": 78970891.0,
"step": 95
},
{
"entropy": 0.5445480346679688,
"epoch": 1.103448275862069,
"grad_norm": 56.3826184106477,
"learning_rate": 4.770772925874093e-06,
"loss": 0.9342,
"mean_token_accuracy": 0.9309895874466747,
"num_tokens": 79797185.0,
"step": 96
},
{
"entropy": 0.5520553588867188,
"epoch": 1.1149425287356323,
"grad_norm": 57.57229556412407,
"learning_rate": 4.764090234407578e-06,
"loss": 0.9378,
"mean_token_accuracy": 0.923177087912336,
"num_tokens": 80609961.0,
"step": 97
},
{
"entropy": 0.5395355224609375,
"epoch": 1.1264367816091954,
"grad_norm": 55.28494434882409,
"learning_rate": 4.757316345716554e-06,
"loss": 0.8895,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 81431397.0,
"step": 98
},
{
"entropy": 0.5417556762695312,
"epoch": 1.1379310344827587,
"grad_norm": 56.62889719836715,
"learning_rate": 4.75045153265227e-06,
"loss": 0.8857,
"mean_token_accuracy": 0.9192708381451666,
"num_tokens": 82257632.0,
"step": 99
},
{
"entropy": 0.5480728149414062,
"epoch": 1.1494252873563218,
"grad_norm": 54.66966542092686,
"learning_rate": 4.743496071728396e-06,
"loss": 0.8359,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 83059920.0,
"step": 100
},
{
"entropy": 0.5300827026367188,
"epoch": 1.160919540229885,
"grad_norm": 55.3701092949065,
"learning_rate": 4.736450243109885e-06,
"loss": 0.8529,
"mean_token_accuracy": 0.9166666716337204,
"num_tokens": 83910320.0,
"step": 101
},
{
"entropy": 0.5412673950195312,
"epoch": 1.1724137931034484,
"grad_norm": 54.925211873446585,
"learning_rate": 4.729314330601684e-06,
"loss": 0.8188,
"mean_token_accuracy": 0.9218750046566129,
"num_tokens": 84728079.0,
"step": 102
},
{
"entropy": 0.5335769653320312,
"epoch": 1.1839080459770115,
"grad_norm": 53.599510414280125,
"learning_rate": 4.7220886216373095e-06,
"loss": 0.7627,
"mean_token_accuracy": 0.9544270860496908,
"num_tokens": 85588279.0,
"step": 103
},
{
"entropy": 0.5458831787109375,
"epoch": 1.1954022988505748,
"grad_norm": 54.85644042986777,
"learning_rate": 4.714773407267264e-06,
"loss": 0.7858,
"mean_token_accuracy": 0.9140625051222742,
"num_tokens": 86423379.0,
"step": 104
},
{
"entropy": 0.5371475219726562,
"epoch": 1.206896551724138,
"grad_norm": 52.70508106481705,
"learning_rate": 4.707368982147318e-06,
"loss": 0.731,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 87283283.0,
"step": 105
},
{
"entropy": 0.539794921875,
"epoch": 1.2183908045977012,
"grad_norm": 52.85664100248831,
"learning_rate": 4.699875644526633e-06,
"loss": 0.7239,
"mean_token_accuracy": 0.9388020869810134,
"num_tokens": 88114338.0,
"step": 106
},
{
"entropy": 0.5485763549804688,
"epoch": 1.2298850574712643,
"grad_norm": 51.94175773034053,
"learning_rate": 4.692293696235758e-06,
"loss": 0.6903,
"mean_token_accuracy": 0.9361979204695672,
"num_tokens": 88926461.0,
"step": 107
},
{
"entropy": 0.531646728515625,
"epoch": 1.2413793103448276,
"grad_norm": 50.87427138533599,
"learning_rate": 4.684623442674463e-06,
"loss": 0.6617,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 89769524.0,
"step": 108
},
{
"entropy": 0.54351806640625,
"epoch": 1.2528735632183907,
"grad_norm": 51.75745904673131,
"learning_rate": 4.676865192799443e-06,
"loss": 0.6903,
"mean_token_accuracy": 0.9270833376795053,
"num_tokens": 90602609.0,
"step": 109
},
{
"entropy": 0.5421981811523438,
"epoch": 1.264367816091954,
"grad_norm": 49.25842347613154,
"learning_rate": 4.669019259111873e-06,
"loss": 0.6236,
"mean_token_accuracy": 0.9440104200039059,
"num_tokens": 91423615.0,
"step": 110
},
{
"entropy": 0.5615921020507812,
"epoch": 1.2758620689655173,
"grad_norm": 47.655979296452784,
"learning_rate": 4.661085957644817e-06,
"loss": 0.5967,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 92204901.0,
"step": 111
},
{
"entropy": 0.5518264770507812,
"epoch": 1.2873563218390804,
"grad_norm": 47.714755290736136,
"learning_rate": 4.653065607950502e-06,
"loss": 0.5973,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 93005537.0,
"step": 112
},
{
"entropy": 0.5359039306640625,
"epoch": 1.2988505747126438,
"grad_norm": 45.89988870201635,
"learning_rate": 4.644958533087443e-06,
"loss": 0.5801,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 93854099.0,
"step": 113
},
{
"entropy": 0.5466156005859375,
"epoch": 1.3103448275862069,
"grad_norm": 45.66002812793437,
"learning_rate": 4.636765059607434e-06,
"loss": 0.5638,
"mean_token_accuracy": 0.9401041702367365,
"num_tokens": 94673565.0,
"step": 114
},
{
"entropy": 0.5435638427734375,
"epoch": 1.3218390804597702,
"grad_norm": 45.14215656166137,
"learning_rate": 4.628485517542393e-06,
"loss": 0.5556,
"mean_token_accuracy": 0.9283854209352285,
"num_tokens": 95476803.0,
"step": 115
},
{
"entropy": 0.541839599609375,
"epoch": 1.3333333333333333,
"grad_norm": 42.636378872173985,
"learning_rate": 4.620120240391065e-06,
"loss": 0.5019,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 96327452.0,
"step": 116
},
{
"entropy": 0.5332412719726562,
"epoch": 1.3448275862068966,
"grad_norm": 43.81341541141291,
"learning_rate": 4.611669565105597e-06,
"loss": 0.5114,
"mean_token_accuracy": 0.9309895874466747,
"num_tokens": 97185071.0,
"step": 117
},
{
"entropy": 0.5449066162109375,
"epoch": 1.3563218390804597,
"grad_norm": 41.21199218337275,
"learning_rate": 4.603133832077953e-06,
"loss": 0.4932,
"mean_token_accuracy": 0.9335937539581209,
"num_tokens": 98001207.0,
"step": 118
},
{
"entropy": 0.5452804565429688,
"epoch": 1.367816091954023,
"grad_norm": 40.05351715592126,
"learning_rate": 4.5945133851262185e-06,
"loss": 0.4633,
"mean_token_accuracy": 0.9440104200039059,
"num_tokens": 98827697.0,
"step": 119
},
{
"entropy": 0.5493011474609375,
"epoch": 1.3793103448275863,
"grad_norm": 38.1413690912593,
"learning_rate": 4.585808571480739e-06,
"loss": 0.4567,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 99658235.0,
"step": 120
},
{
"entropy": 0.5312347412109375,
"epoch": 1.3908045977011494,
"grad_norm": 35.77272010447715,
"learning_rate": 4.577019741770137e-06,
"loss": 0.4394,
"mean_token_accuracy": 0.9296875041909516,
"num_tokens": 100521415.0,
"step": 121
},
{
"entropy": 0.5474624633789062,
"epoch": 1.4022988505747127,
"grad_norm": 35.44749167453788,
"learning_rate": 4.5681472500071935e-06,
"loss": 0.4051,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 101350872.0,
"step": 122
},
{
"entropy": 0.5480728149414062,
"epoch": 1.4137931034482758,
"grad_norm": 33.3855582923729,
"learning_rate": 4.559191453574582e-06,
"loss": 0.3849,
"mean_token_accuracy": 0.9531250027939677,
"num_tokens": 102172977.0,
"step": 123
},
{
"entropy": 0.538360595703125,
"epoch": 1.4252873563218391,
"grad_norm": 33.1834131236442,
"learning_rate": 4.550152713210478e-06,
"loss": 0.3944,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 102998176.0,
"step": 124
},
{
"entropy": 0.5465621948242188,
"epoch": 1.4367816091954024,
"grad_norm": 31.2542834783487,
"learning_rate": 4.541031392994025e-06,
"loss": 0.3635,
"mean_token_accuracy": 0.9492187530267984,
"num_tokens": 103812200.0,
"step": 125
},
{
"entropy": 0.5382003784179688,
"epoch": 1.4482758620689655,
"grad_norm": 32.40479508190425,
"learning_rate": 4.53182786033067e-06,
"loss": 0.3612,
"mean_token_accuracy": 0.9361979204695672,
"num_tokens": 104634429.0,
"step": 126
},
{
"entropy": 0.5458450317382812,
"epoch": 1.4597701149425286,
"grad_norm": 30.72340713652481,
"learning_rate": 4.522542485937369e-06,
"loss": 0.3399,
"mean_token_accuracy": 0.9388020869810134,
"num_tokens": 105461037.0,
"step": 127
},
{
"entropy": 0.552581787109375,
"epoch": 1.471264367816092,
"grad_norm": 29.132170258942477,
"learning_rate": 4.513175643827647e-06,
"loss": 0.313,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 106274400.0,
"step": 128
},
{
"entropy": 0.5401077270507812,
"epoch": 1.4827586206896552,
"grad_norm": 28.683201688609554,
"learning_rate": 4.503727711296539e-06,
"loss": 0.3137,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 107103293.0,
"step": 129
},
{
"entropy": 0.5371475219726562,
"epoch": 1.4942528735632183,
"grad_norm": 26.82838757384805,
"learning_rate": 4.494199068905389e-06,
"loss": 0.2902,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 107941406.0,
"step": 130
},
{
"entropy": 0.5441970825195312,
"epoch": 1.5057471264367817,
"grad_norm": 25.971681456288458,
"learning_rate": 4.484590100466524e-06,
"loss": 0.2772,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 108767533.0,
"step": 131
},
{
"entropy": 0.52886962890625,
"epoch": 1.5172413793103448,
"grad_norm": 24.8536339487485,
"learning_rate": 4.474901193027791e-06,
"loss": 0.2905,
"mean_token_accuracy": 0.9492187530267984,
"num_tokens": 109628216.0,
"step": 132
},
{
"entropy": 0.524261474609375,
"epoch": 1.528735632183908,
"grad_norm": 23.69318911646761,
"learning_rate": 4.4651327368569695e-06,
"loss": 0.2648,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 110503540.0,
"step": 133
},
{
"entropy": 0.5345916748046875,
"epoch": 1.5402298850574714,
"grad_norm": 22.723236503876162,
"learning_rate": 4.455285125426049e-06,
"loss": 0.2575,
"mean_token_accuracy": 0.9440104200039059,
"num_tokens": 111345385.0,
"step": 134
},
{
"entropy": 0.5496292114257812,
"epoch": 1.5517241379310345,
"grad_norm": 22.92712147978677,
"learning_rate": 4.445358755395382e-06,
"loss": 0.2472,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 112152579.0,
"step": 135
},
{
"entropy": 0.5406265258789062,
"epoch": 1.5632183908045976,
"grad_norm": 21.357350310348682,
"learning_rate": 4.435354026597707e-06,
"loss": 0.2491,
"mean_token_accuracy": 0.9388020869810134,
"num_tokens": 112991802.0,
"step": 136
},
{
"entropy": 0.5425033569335938,
"epoch": 1.5747126436781609,
"grad_norm": 18.884927838452082,
"learning_rate": 4.425271342022039e-06,
"loss": 0.2242,
"mean_token_accuracy": 0.9544270860496908,
"num_tokens": 113818986.0,
"step": 137
},
{
"entropy": 0.545745849609375,
"epoch": 1.5862068965517242,
"grad_norm": 19.69653684409779,
"learning_rate": 4.415111107797445e-06,
"loss": 0.2174,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 114642387.0,
"step": 138
},
{
"entropy": 0.5489501953125,
"epoch": 1.5977011494252875,
"grad_norm": 17.837394227991627,
"learning_rate": 4.404873733176678e-06,
"loss": 0.1848,
"mean_token_accuracy": 0.9687500018626451,
"num_tokens": 115451222.0,
"step": 139
},
{
"entropy": 0.52032470703125,
"epoch": 1.6091954022988506,
"grad_norm": 17.24235084071355,
"learning_rate": 4.3945596305196925e-06,
"loss": 0.2063,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 116312542.0,
"step": 140
},
{
"entropy": 0.52130126953125,
"epoch": 1.6206896551724137,
"grad_norm": 19.23462423448226,
"learning_rate": 4.384169215277042e-06,
"loss": 0.2174,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 117191343.0,
"step": 141
},
{
"entropy": 0.5353240966796875,
"epoch": 1.632183908045977,
"grad_norm": 19.832558193797162,
"learning_rate": 4.373702905973136e-06,
"loss": 0.2222,
"mean_token_accuracy": 0.9361979204695672,
"num_tokens": 118019469.0,
"step": 142
},
{
"entropy": 0.537261962890625,
"epoch": 1.6436781609195403,
"grad_norm": 18.073863888632665,
"learning_rate": 4.363161124189387e-06,
"loss": 0.213,
"mean_token_accuracy": 0.9479166697710752,
"num_tokens": 118848731.0,
"step": 143
},
{
"entropy": 0.5355072021484375,
"epoch": 1.6551724137931034,
"grad_norm": 13.643534993364158,
"learning_rate": 4.352544294547229e-06,
"loss": 0.2255,
"mean_token_accuracy": 0.9309895874466747,
"num_tokens": 119699438.0,
"step": 144
},
{
"entropy": 0.53302001953125,
"epoch": 1.6666666666666665,
"grad_norm": 13.32804577717328,
"learning_rate": 4.341852844691012e-06,
"loss": 0.1919,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 120543167.0,
"step": 145
},
{
"entropy": 0.5304183959960938,
"epoch": 1.6781609195402298,
"grad_norm": 12.1831913402612,
"learning_rate": 4.331087205270778e-06,
"loss": 0.1573,
"mean_token_accuracy": 0.9648437520954758,
"num_tokens": 121413453.0,
"step": 146
},
{
"entropy": 0.5264511108398438,
"epoch": 1.6896551724137931,
"grad_norm": 14.866153648828307,
"learning_rate": 4.320247809924911e-06,
"loss": 0.1717,
"mean_token_accuracy": 0.9518229195382446,
"num_tokens": 122277203.0,
"step": 147
},
{
"entropy": 0.5309906005859375,
"epoch": 1.7011494252873565,
"grad_norm": 36.174423890876305,
"learning_rate": 4.309335095262675e-06,
"loss": 0.3047,
"mean_token_accuracy": 0.923177087912336,
"num_tokens": 123098358.0,
"step": 148
},
{
"entropy": 0.54193115234375,
"epoch": 1.7126436781609196,
"grad_norm": 19.683188431556264,
"learning_rate": 4.2983495008466285e-06,
"loss": 0.1893,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 123918082.0,
"step": 149
},
{
"entropy": 0.5468902587890625,
"epoch": 1.7241379310344827,
"grad_norm": 11.029033694493663,
"learning_rate": 4.287291469174909e-06,
"loss": 0.1802,
"mean_token_accuracy": 0.9348958372138441,
"num_tokens": 124727934.0,
"step": 150
},
{
"entropy": 0.5524139404296875,
"epoch": 1.735632183908046,
"grad_norm": 12.741964950820162,
"learning_rate": 4.276161445663423e-06,
"loss": 0.1935,
"mean_token_accuracy": 0.9244791711680591,
"num_tokens": 125521364.0,
"step": 151
},
{
"entropy": 0.5598068237304688,
"epoch": 1.7471264367816093,
"grad_norm": 10.684223439753536,
"learning_rate": 4.264959878627891e-06,
"loss": 0.1764,
"mean_token_accuracy": 0.9375000037252903,
"num_tokens": 126310482.0,
"step": 152
},
{
"entropy": 0.539764404296875,
"epoch": 1.7586206896551724,
"grad_norm": 7.967845652668114,
"learning_rate": 4.253687219265803e-06,
"loss": 0.1465,
"mean_token_accuracy": 0.9596354190725833,
"num_tokens": 127127924.0,
"step": 153
},
{
"entropy": 0.5313034057617188,
"epoch": 1.7701149425287355,
"grad_norm": 8.78172779941521,
"learning_rate": 4.242343921638235e-06,
"loss": 0.1423,
"mean_token_accuracy": 0.9492187530267984,
"num_tokens": 127963998.0,
"step": 154
},
{
"entropy": 0.5254440307617188,
"epoch": 1.7816091954022988,
"grad_norm": 8.679498363997155,
"learning_rate": 4.230930442651558e-06,
"loss": 0.1402,
"mean_token_accuracy": 0.9505208362825215,
"num_tokens": 128815367.0,
"step": 155
},
{
"entropy": 0.5338897705078125,
"epoch": 1.793103448275862,
"grad_norm": 12.414584279230683,
"learning_rate": 4.219447242039043e-06,
"loss": 0.1571,
"mean_token_accuracy": 0.9492187530267984,
"num_tokens": 129649673.0,
"step": 156
},
{
"entropy": 0.5256805419921875,
"epoch": 1.8045977011494254,
"grad_norm": 11.900284697954019,
"learning_rate": 4.207894782342337e-06,
"loss": 0.1883,
"mean_token_accuracy": 0.9427083367481828,
"num_tokens": 130490556.0,
"step": 157
},
{
"entropy": 0.5339889526367188,
"epoch": 1.8160919540229885,
"grad_norm": 7.737654770007032,
"learning_rate": 4.196273528892831e-06,
"loss": 0.161,
"mean_token_accuracy": 0.9466145865153521,
"num_tokens": 131336884.0,
"step": 158
},
{
"entropy": 0.5406951904296875,
"epoch": 1.8275862068965516,
"grad_norm": 14.436453211359808,
"learning_rate": 4.18458394979292e-06,
"loss": 0.1516,
"mean_token_accuracy": 0.9388020869810134,
"num_tokens": 132170053.0,
"step": 159
},
{
"entropy": 0.5588531494140625,
"epoch": 1.839080459770115,
"grad_norm": 6.146992863163749,
"learning_rate": 4.172826515897146e-06,
"loss": 0.1441,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 132962150.0,
"step": 160
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5524063110351562,
|
||
|
|
"epoch": 1.8505747126436782,
|
||
|
|
"grad_norm": 17.04399335228688,
|
||
|
|
"learning_rate": 4.161001700793231e-06,
|
||
|
|
"loss": 0.1792,
|
||
|
|
"mean_token_accuracy": 0.9218750046566129,
|
||
|
|
"num_tokens": 133791084.0,
|
||
|
|
"step": 161
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.55145263671875,
|
||
|
|
"epoch": 1.8620689655172413,
|
||
|
|
"grad_norm": 7.828687166002703,
|
||
|
|
"learning_rate": 4.149109980783004e-06,
|
||
|
|
"loss": 0.1377,
|
||
|
|
"mean_token_accuracy": 0.9544270860496908,
|
||
|
|
"num_tokens": 134601699.0,
|
||
|
|
"step": 162
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.541046142578125,
|
||
|
|
"epoch": 1.8735632183908046,
|
||
|
|
"grad_norm": 15.274961569812856,
|
||
|
|
"learning_rate": 4.137151834863213e-06,
|
||
|
|
"loss": 0.165,
|
||
|
|
"mean_token_accuracy": 0.9361979204695672,
|
||
|
|
"num_tokens": 135435563.0,
|
||
|
|
"step": 163
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5604324340820312,
|
||
|
|
"epoch": 1.8850574712643677,
|
||
|
|
"grad_norm": 10.201587419105804,
|
||
|
|
"learning_rate": 4.125127744706232e-06,
|
||
|
|
"loss": 0.1241,
|
||
|
|
"mean_token_accuracy": 0.955729169305414,
|
||
|
|
"num_tokens": 136227698.0,
|
||
|
|
"step": 164
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5586700439453125,
|
||
|
|
"epoch": 1.896551724137931,
|
||
|
|
"grad_norm": 11.218767501992696,
|
||
|
|
"learning_rate": 4.113038194640658e-06,
|
||
|
|
"loss": 0.1361,
|
||
|
|
"mean_token_accuracy": 0.9492187530267984,
|
||
|
|
"num_tokens": 137028072.0,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5378341674804688,
|
||
|
|
"epoch": 1.9080459770114944,
|
||
|
|
"grad_norm": 10.759553109195826,
|
||
|
|
"learning_rate": 4.100883671631806e-06,
|
||
|
|
"loss": 0.1467,
|
||
|
|
"mean_token_accuracy": 0.945312503259629,
|
||
|
|
"num_tokens": 137879487.0,
|
||
|
|
"step": 166
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.566436767578125,
|
||
|
|
"epoch": 1.9195402298850575,
|
||
|
|
"grad_norm": 12.342573902869242,
|
||
|
|
"learning_rate": 4.088664665262091e-06,
|
||
|
|
"loss": 0.1545,
|
||
|
|
"mean_token_accuracy": 0.9492187530267984,
|
||
|
|
"num_tokens": 138645762.0,
|
||
|
|
"step": 167
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5373306274414062,
|
||
|
|
"epoch": 1.9310344827586206,
|
||
|
|
"grad_norm": 14.908844134109147,
|
||
|
|
"learning_rate": 4.076381667711306e-06,
|
||
|
|
"loss": 0.1839,
|
||
|
|
"mean_token_accuracy": 0.9401041702367365,
|
||
|
|
"num_tokens": 139495511.0,
|
||
|
|
"step": 168
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5425796508789062,
|
||
|
|
"epoch": 1.9425287356321839,
|
||
|
|
"grad_norm": 4.4442828301660064,
|
||
|
|
"learning_rate": 4.064035173736804e-06,
|
||
|
|
"loss": 0.1113,
|
||
|
|
"mean_token_accuracy": 0.9648437520954758,
|
||
|
|
"num_tokens": 140341321.0,
|
||
|
|
"step": 169
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5580520629882812,
|
||
|
|
"epoch": 1.9540229885057472,
|
||
|
|
"grad_norm": 8.345686227442089,
|
||
|
|
"learning_rate": 4.05162568065356e-06,
|
||
|
|
"loss": 0.1313,
|
||
|
|
"mean_token_accuracy": 0.9531250027939677,
|
||
|
|
"num_tokens": 141135938.0,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5560073852539062,
|
||
|
|
"epoch": 1.9655172413793105,
|
||
|
|
"grad_norm": 5.335070929960651,
|
||
|
|
"learning_rate": 4.039153688314146e-06,
|
||
|
|
"loss": 0.1195,
|
||
|
|
"mean_token_accuracy": 0.9635416688397527,
|
||
|
|
"num_tokens": 141960068.0,
|
||
|
|
"step": 171
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5646591186523438,
|
||
|
|
"epoch": 1.9770114942528736,
|
||
|
|
"grad_norm": 8.240841304443089,
|
||
|
|
"learning_rate": 4.0266196990885955e-06,
|
||
|
|
"loss": 0.1065,
|
||
|
|
"mean_token_accuracy": 0.9622395855840296,
|
||
|
|
"num_tokens": 142742990.0,
|
||
|
|
"step": 172
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5467605590820312,
|
||
|
|
"epoch": 1.9885057471264367,
|
||
|
|
"grad_norm": 10.082258712427155,
|
||
|
|
"learning_rate": 4.014024217844167e-06,
|
||
|
|
"loss": 0.1196,
|
||
|
|
"mean_token_accuracy": 0.9505208362825215,
|
||
|
|
"num_tokens": 143557918.0,
|
||
|
|
"step": 173
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5526123046875,
|
||
|
|
"epoch": 2.0,
|
||
|
|
"grad_norm": 7.158314474332936,
|
||
|
|
"learning_rate": 4.001367751925008e-06,
|
||
|
|
"loss": 0.1259,
|
||
|
|
"mean_token_accuracy": 0.955729169305414,
|
||
|
|
"num_tokens": 144380626.0,
|
||
|
|
"step": 174
|
||
|
|
},
|
||
|
|
{
"entropy": 0.5583114624023438,
"epoch": 2.0114942528735633,
"grad_norm": 8.124921778124246,
"learning_rate": 3.98865081113172e-06,
"loss": 0.1174,
"mean_token_accuracy": 0.9570312525611371,
"num_tokens": 145190047.0,
"step": 175
},
{
"entropy": 0.5553131103515625,
"epoch": 2.0229885057471266,
"grad_norm": 5.03898978737638,
"learning_rate": 3.9758739077008256e-06,
"loss": 0.0908,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 146000722.0,
"step": 176
},
{
"entropy": 0.5393218994140625,
"epoch": 2.0344827586206895,
"grad_norm": 5.644079988912752,
"learning_rate": 3.96303755628413e-06,
"loss": 0.0908,
"mean_token_accuracy": 0.9700520851183683,
"num_tokens": 146826351.0,
"step": 177
},
{
"entropy": 0.5538177490234375,
"epoch": 2.045977011494253,
"grad_norm": 7.201807862926888,
"learning_rate": 3.950142273927996e-06,
"loss": 0.0837,
"mean_token_accuracy": 0.977864584652707,
"num_tokens": 147603914.0,
"step": 178
},
{
"entropy": 0.5235366821289062,
"epoch": 2.057471264367816,
"grad_norm": 4.912652652723082,
"learning_rate": 3.937188580052518e-06,
"loss": 0.08,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 148447329.0,
"step": 179
},
{
"entropy": 0.5220870971679688,
"epoch": 2.0689655172413794,
"grad_norm": 10.473323494894288,
"learning_rate": 3.924176996430597e-06,
"loss": 0.1432,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 149295756.0,
"step": 180
},
{
"entropy": 0.526580810546875,
"epoch": 2.0804597701149423,
"grad_norm": 8.04200314568136,
"learning_rate": 3.911108047166924e-06,
"loss": 0.0891,
"mean_token_accuracy": 0.9687500018626451,
"num_tokens": 150134016.0,
"step": 181
},
{
"entropy": 0.5185546875,
"epoch": 2.0919540229885056,
"grad_norm": 4.355747854192094,
"learning_rate": 3.897982258676867e-06,
"loss": 0.0844,
"mean_token_accuracy": 0.9700520851183683,
"num_tokens": 150973641.0,
"step": 182
},
{
"entropy": 0.533843994140625,
"epoch": 2.103448275862069,
"grad_norm": 7.793915871437659,
"learning_rate": 3.8848001596652765e-06,
"loss": 0.0865,
"mean_token_accuracy": 0.9700520851183683,
"num_tokens": 151784661.0,
"step": 183
},
{
"entropy": 0.5393905639648438,
"epoch": 2.1149425287356323,
"grad_norm": 4.469170022546009,
"learning_rate": 3.8715622811051754e-06,
"loss": 0.0927,
"mean_token_accuracy": 0.9648437520954758,
"num_tokens": 152598583.0,
"step": 184
},
{
"entropy": 0.527130126953125,
"epoch": 2.1264367816091956,
"grad_norm": 3.9563194078415385,
"learning_rate": 3.858269156216383e-06,
"loss": 0.082,
"mean_token_accuracy": 0.9700520851183683,
"num_tokens": 153415586.0,
"step": 185
},
{
"entropy": 0.524139404296875,
"epoch": 2.1379310344827585,
"grad_norm": 4.965128765266531,
"learning_rate": 3.844921320444031e-06,
"loss": 0.0964,
"mean_token_accuracy": 0.967447918606922,
"num_tokens": 154257216.0,
"step": 186
},
{
"entropy": 0.5194931030273438,
"epoch": 2.1494252873563218,
"grad_norm": 12.18058735895402,
"learning_rate": 3.8315193114369995e-06,
"loss": 0.0965,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 155099957.0,
"step": 187
},
{
"entropy": 0.50604248046875,
"epoch": 2.160919540229885,
"grad_norm": 5.512495898345173,
"learning_rate": 3.8180636690262565e-06,
"loss": 0.0988,
"mean_token_accuracy": 0.9648437520954758,
"num_tokens": 155957358.0,
"step": 188
},
{
"entropy": 0.5256881713867188,
"epoch": 2.1724137931034484,
"grad_norm": 11.30593507768243,
"learning_rate": 3.804554935203115e-06,
"loss": 0.1068,
"mean_token_accuracy": 0.9609375023283064,
"num_tokens": 156762306.0,
"step": 189
},
{
"entropy": 0.5245895385742188,
"epoch": 2.1839080459770113,
"grad_norm": 4.544672386500914,
"learning_rate": 3.7909936540974052e-06,
"loss": 0.0734,
"mean_token_accuracy": 0.977864584652707,
"num_tokens": 157592554.0,
"step": 190
},
{
"entropy": 0.5061416625976562,
"epoch": 2.1954022988505746,
"grad_norm": 3.941196817466825,
"learning_rate": 3.777380371955552e-06,
"loss": 0.0772,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 158445907.0,
"step": 191
},
{
"entropy": 0.515838623046875,
"epoch": 2.206896551724138,
"grad_norm": 3.232650955550973,
"learning_rate": 3.7637156371185744e-06,
"loss": 0.0628,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 159250314.0,
"step": 192
},
{
"entropy": 0.5301513671875,
"epoch": 2.218390804597701,
"grad_norm": 5.776572070005293,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0821,
"mean_token_accuracy": 0.9726562516298145,
"num_tokens": 160039518.0,
"step": 193
},
{
"entropy": 0.5020217895507812,
"epoch": 2.2298850574712645,
"grad_norm": 8.011873369364945,
"learning_rate": 3.7362340130636926e-06,
"loss": 0.0858,
"mean_token_accuracy": 0.9726562516298145,
"num_tokens": 160893468.0,
"step": 194
},
{
"entropy": 0.5221786499023438,
"epoch": 2.2413793103448274,
"grad_norm": 4.253581057609115,
"learning_rate": 3.7224182308015977e-06,
"loss": 0.078,
"mean_token_accuracy": 0.977864584652707,
"num_tokens": 161694335.0,
"step": 195
},
{
"entropy": 0.5163421630859375,
"epoch": 2.2528735632183907,
"grad_norm": 11.977620750971552,
"learning_rate": 3.7085532097114098e-06,
"loss": 0.1257,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 162532877.0,
"step": 196
},
{
"entropy": 0.5193557739257812,
"epoch": 2.264367816091954,
"grad_norm": 4.100816778244049,
"learning_rate": 3.6946395082741582e-06,
"loss": 0.0741,
"mean_token_accuracy": 0.977864584652707,
"num_tokens": 163353544.0,
"step": 197
},
{
"entropy": 0.5131072998046875,
"epoch": 2.2758620689655173,
"grad_norm": 14.336855059492699,
"learning_rate": 3.6806776869317074e-06,
"loss": 0.1028,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 164189466.0,
"step": 198
},
{
"entropy": 0.522857666015625,
"epoch": 2.2873563218390807,
"grad_norm": 12.863083017991736,
"learning_rate": 3.6666683080641846e-06,
"loss": 0.1047,
"mean_token_accuracy": 0.945312503259629,
"num_tokens": 165024121.0,
"step": 199
},
{
"entropy": 0.5307998657226562,
"epoch": 2.2988505747126435,
"grad_norm": 5.384315613913392,
"learning_rate": 3.6526119359673283e-06,
"loss": 0.0825,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 165858531.0,
"step": 200
},
{
"entropy": 0.5401611328125,
"epoch": 2.310344827586207,
"grad_norm": 6.978944080668443,
"learning_rate": 3.6385091368297582e-06,
"loss": 0.0949,
"mean_token_accuracy": 0.9583333358168602,
"num_tokens": 166672880.0,
"step": 201
},
{
"entropy": 0.5244674682617188,
"epoch": 2.32183908045977,
"grad_norm": 8.586521251755975,
"learning_rate": 3.624360478710165e-06,
"loss": 0.0979,
"mean_token_accuracy": 0.9531250027939677,
"num_tokens": 167513008.0,
"step": 202
},
{
"entropy": 0.5154495239257812,
"epoch": 2.3333333333333335,
"grad_norm": 10.059252916770694,
"learning_rate": 3.6101665315144357e-06,
"loss": 0.1087,
"mean_token_accuracy": 0.9518229195382446,
"num_tokens": 168386249.0,
"step": 203
},
{
"entropy": 0.5272903442382812,
"epoch": 2.344827586206897,
"grad_norm": 3.7220569133269326,
"learning_rate": 3.595927866972694e-06,
"loss": 0.0893,
"mean_token_accuracy": 0.9661458353511989,
"num_tokens": 169217721.0,
"step": 204
},
{
"entropy": 0.5361862182617188,
"epoch": 2.3563218390804597,
"grad_norm": 8.14296764792559,
"learning_rate": 3.581645058616271e-06,
"loss": 0.0914,
"mean_token_accuracy": 0.9687500018626451,
"num_tokens": 170033814.0,
"step": 205
},
{
"entropy": 0.5298843383789062,
"epoch": 2.367816091954023,
"grad_norm": 2.896667570609037,
"learning_rate": 3.5673186817546047e-06,
"loss": 0.0564,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 170852345.0,
"step": 206
},
{
"entropy": 0.5321426391601562,
"epoch": 2.3793103448275863,
"grad_norm": 6.510425360047998,
"learning_rate": 3.552949313452067e-06,
"loss": 0.0669,
"mean_token_accuracy": 0.9752604181412607,
"num_tokens": 171671019.0,
"step": 207
},
{
"entropy": 0.517181396484375,
"epoch": 2.3908045977011496,
"grad_norm": 4.429647054928265,
"learning_rate": 3.5385375325047167e-06,
"loss": 0.0555,
"mean_token_accuracy": 0.9830729176755995,
"num_tokens": 172531805.0,
"step": 208
},
{
"entropy": 0.5280838012695312,
"epoch": 2.4022988505747125,
"grad_norm": 3.9428408925354477,
"learning_rate": 3.5240839194169885e-06,
"loss": 0.0676,
"mean_token_accuracy": 0.9791666679084301,
"num_tokens": 173363059.0,
"step": 209
},
{
"entropy": 0.5230560302734375,
"epoch": 2.413793103448276,
"grad_norm": 6.031126783984352,
"learning_rate": 3.5095890563783124e-06,
"loss": 0.0431,
"mean_token_accuracy": 0.9856770841870457,
"num_tokens": 174211868.0,
"step": 210
},
{
"entropy": 0.52056884765625,
"epoch": 2.425287356321839,
"grad_norm": 4.468991885868761,
"learning_rate": 3.4950535272396564e-06,
"loss": 0.0619,
"mean_token_accuracy": 0.9817708344198763,
"num_tokens": 175030480.0,
"step": 211
},
{
"entropy": 0.529876708984375,
"epoch": 2.4367816091954024,
"grad_norm": 9.239483704756537,
"learning_rate": 3.480477917490014e-06,
"loss": 0.0716,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 175843572.0,
"step": 212
},
{
"entropy": 0.5277099609375,
"epoch": 2.4482758620689653,
"grad_norm": 5.300163295816919,
"learning_rate": 3.4658628142328215e-06,
"loss": 0.0756,
"mean_token_accuracy": 0.9752604181412607,
"num_tokens": 176656377.0,
"step": 213
},
{
"entropy": 0.523345947265625,
"epoch": 2.4597701149425286,
"grad_norm": 4.579054112244679,
"learning_rate": 3.4512088061623077e-06,
"loss": 0.0509,
"mean_token_accuracy": 0.9830729176755995,
"num_tokens": 177483746.0,
"step": 214
},
{
"entropy": 0.523223876953125,
"epoch": 2.471264367816092,
"grad_norm": 5.040930541242033,
"learning_rate": 3.436516483539781e-06,
"loss": 0.0551,
"mean_token_accuracy": 0.9830729176755995,
"num_tokens": 178330666.0,
"step": 215
},
{
"entropy": 0.531158447265625,
"epoch": 2.4827586206896552,
"grad_norm": 8.739883447359308,
"learning_rate": 3.4217864381698523e-06,
"loss": 0.0719,
"mean_token_accuracy": 0.9739583348855376,
"num_tokens": 179149396.0,
"step": 216
},
{
"entropy": 0.5261764526367188,
"epoch": 2.4942528735632186,
"grad_norm": 11.494350665933359,
"learning_rate": 3.4070192633766025e-06,
"loss": 0.0428,
"mean_token_accuracy": 0.989583333954215,
"num_tokens": 179974795.0,
"step": 217
},
{
"entropy": 0.5234909057617188,
"epoch": 2.5057471264367814,
"grad_norm": 6.838876203026226,
"learning_rate": 3.39221555397968e-06,
"loss": 0.0796,
"mean_token_accuracy": 0.9739583348855376,
"num_tokens": 180802086.0,
"step": 218
},
{
"entropy": 0.5234756469726562,
"epoch": 2.5172413793103448,
"grad_norm": 4.455755339288859,
"learning_rate": 3.37737590627034e-06,
"loss": 0.0805,
"mean_token_accuracy": 0.9713541683740914,
"num_tokens": 181658273.0,
"step": 219
},
{
"entropy": 0.5229568481445312,
"epoch": 2.528735632183908,
"grad_norm": 3.4578598266072866,
"learning_rate": 3.362500917987427e-06,
"loss": 0.0415,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 182474385.0,
"step": 220
},
{
"entropy": 0.5162353515625,
"epoch": 2.5402298850574714,
"grad_norm": 3.3923382783923386,
"learning_rate": 3.3475911882933014e-06,
"loss": 0.045,
"mean_token_accuracy": 0.9908854172099382,
"num_tokens": 183315758.0,
"step": 221
},
{
"entropy": 0.5235671997070312,
"epoch": 2.5517241379310347,
"grad_norm": 4.903114742385986,
"learning_rate": 3.332647317749702e-06,
"loss": 0.0466,
"mean_token_accuracy": 0.9830729176755995,
"num_tokens": 184143585.0,
"step": 222
},
{
"entropy": 0.5148849487304688,
"epoch": 2.5632183908045976,
"grad_norm": 4.864354866386424,
"learning_rate": 3.3176699082935546e-06,
"loss": 0.0461,
"mean_token_accuracy": 0.9843750009313226,
"num_tokens": 184985732.0,
"step": 223
},
{
"entropy": 0.5059356689453125,
"epoch": 2.574712643678161,
"grad_norm": 9.984487721834693,
"learning_rate": 3.3026595632127274e-06,
"loss": 0.0708,
"mean_token_accuracy": 0.9765625013969839,
"num_tokens": 185828814.0,
"step": 224
},
{
"entropy": 0.5138702392578125,
"epoch": 2.586206896551724,
"grad_norm": 6.845004434710824,
"learning_rate": 3.2876168871217322e-06,
"loss": 0.0655,
"mean_token_accuracy": 0.9804687511641532,
"num_tokens": 186649376.0,
"step": 225
},
{
"entropy": 0.493011474609375,
"epoch": 2.5977011494252875,
"grad_norm": 22.95951100464713,
"learning_rate": 3.272542485937369e-06,
"loss": 0.1239,
"mean_token_accuracy": 0.955729169305414,
"num_tokens": 187517356.0,
"step": 226
},
{
"entropy": 0.5219573974609375,
"epoch": 2.609195402298851,
"grad_norm": 14.8549297549626,
"learning_rate": 3.2574369668543187e-06,
"loss": 0.1119,
"mean_token_accuracy": 0.9635416688397527,
"num_tokens": 188347541.0,
"step": 227
},
{
"entropy": 0.5112228393554688,
"epoch": 2.6206896551724137,
"grad_norm": 4.479017898039443,
"learning_rate": 3.2423009383206876e-06,
"loss": 0.0627,
"mean_token_accuracy": 0.9830729176755995,
"num_tokens": 189198859.0,
"step": 228
},
{
"entropy": 0.511871337890625,
"epoch": 2.632183908045977,
"grad_norm": 7.490721295049219,
"learning_rate": 3.227135010013498e-06,
"loss": 0.0705,
"mean_token_accuracy": 0.9739583348855376,
"num_tokens": 190033894.0,
"step": 229
},
{
"entropy": 0.514923095703125,
"epoch": 2.6436781609195403,
"grad_norm": 5.050493531281534,
"learning_rate": 3.211939792814131e-06,
"loss": 0.0602,
"mean_token_accuracy": 0.977864584652707,
"num_tokens": 190869827.0,
"step": 230
},
{
|
||
|
|
"entropy": 0.5310745239257812,
|
||
|
|
"epoch": 2.655172413793103,
|
||
|
|
"grad_norm": 4.861042871342369,
|
||
|
|
"learning_rate": 3.19671589878372e-06,
|
||
|
|
"loss": 0.0393,
|
||
|
|
"mean_token_accuracy": 0.9882812506984919,
|
||
|
|
"num_tokens": 191679736.0,
|
||
|
|
"step": 231
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5336990356445312,
|
||
|
|
"epoch": 2.6666666666666665,
|
||
|
|
"grad_norm": 6.075779640917305,
|
||
|
|
"learning_rate": 3.1814639411384953e-06,
|
||
|
|
"loss": 0.0594,
|
||
|
|
"mean_token_accuracy": 0.9830729176755995,
|
||
|
|
"num_tokens": 192490031.0,
|
||
|
|
"step": 232
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5216522216796875,
|
||
|
|
"epoch": 2.67816091954023,
|
||
|
|
"grad_norm": 3.9095981775216457,
|
||
|
|
"learning_rate": 3.1661845342250874e-06,
|
||
|
|
"loss": 0.0538,
|
||
|
|
"mean_token_accuracy": 0.9830729176755995,
|
||
|
|
"num_tokens": 193317563.0,
|
||
|
|
"step": 233
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5286178588867188,
|
||
|
|
"epoch": 2.689655172413793,
|
||
|
|
"grad_norm": 4.097740161891809,
|
||
|
|
"learning_rate": 3.1508782934957804e-06,
|
||
|
|
"loss": 0.0689,
|
||
|
|
"mean_token_accuracy": 0.9765625013969839,
|
||
|
|
"num_tokens": 194121291.0,
|
||
|
|
"step": 234
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5184707641601562,
|
||
|
|
"epoch": 2.7011494252873565,
|
||
|
|
"grad_norm": 2.965792573695505,
|
||
|
|
"learning_rate": 3.1355458354837183e-06,
|
||
|
|
"loss": 0.0435,
|
||
|
|
"mean_token_accuracy": 0.9869791674427688,
|
||
|
|
"num_tokens": 194990367.0,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5118179321289062,
|
||
|
|
"epoch": 2.7126436781609193,
|
||
|
|
"grad_norm": 3.9101911113146133,
|
||
|
|
"learning_rate": 3.1201877777780724e-06,
|
||
|
|
"loss": 0.0577,
|
||
|
|
"mean_token_accuracy": 0.9817708344198763,
|
||
|
|
"num_tokens": 195834396.0,
|
||
|
|
"step": 236
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.518096923828125,
|
||
|
|
"epoch": 2.7241379310344827,
|
||
|
|
"grad_norm": 4.9106691954268715,
|
||
|
|
"learning_rate": 3.1048047389991693e-06,
|
||
|
|
"loss": 0.0442,
|
||
|
|
"mean_token_accuracy": 0.9869791674427688,
|
||
|
|
"num_tokens": 196680761.0,
|
||
|
|
"step": 237
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5065536499023438,
|
||
|
|
"epoch": 2.735632183908046,
|
||
|
|
"grad_norm": 3.9425883682767138,
|
||
|
|
"learning_rate": 3.089397338773569e-06,
|
||
|
|
"loss": 0.0354,
|
||
|
|
"mean_token_accuracy": 0.9882812506984919,
|
||
|
|
"num_tokens": 197540695.0,
|
||
|
|
"step": 238
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5074462890625,
|
||
|
|
"epoch": 2.7471264367816093,
|
||
|
|
"grad_norm": 3.8639716797812973,
|
||
|
|
"learning_rate": 3.0739661977091027e-06,
|
||
|
|
"loss": 0.03,
|
||
|
|
"mean_token_accuracy": 0.9856770841870457,
|
||
|
|
"num_tokens": 198381995.0,
|
||
|
|
"step": 239
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5099868774414062,
|
||
|
|
"epoch": 2.7586206896551726,
|
||
|
|
"grad_norm": 5.774630120152915,
|
||
|
|
"learning_rate": 3.0585119373698858e-06,
|
||
|
|
"loss": 0.034,
|
||
|
|
"mean_token_accuracy": 0.989583333954215,
|
||
|
|
"num_tokens": 199222309.0,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5069580078125,
|
||
|
|
"epoch": 2.7701149425287355,
|
||
|
|
"grad_norm": 3.1153325710771202,
|
||
|
|
"learning_rate": 3.04303518025127e-06,
|
||
|
|
"loss": 0.032,
|
||
|
|
"mean_token_accuracy": 0.989583333954215,
|
||
|
|
"num_tokens": 200069860.0,
|
||
|
|
"step": 241
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5121688842773438,
|
||
|
|
"epoch": 2.781609195402299,
|
||
|
|
"grad_norm": 4.199338548501109,
|
||
|
|
"learning_rate": 3.0275365497547747e-06,
|
||
|
|
"loss": 0.0341,
|
||
|
|
"mean_token_accuracy": 0.989583333954215,
|
||
|
|
"num_tokens": 200911603.0,
|
||
|
|
"step": 242
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5169677734375,
|
||
|
|
"epoch": 2.793103448275862,
|
||
|
|
"grad_norm": 5.348906239774751,
|
||
|
|
"learning_rate": 3.012016670162977e-06,
|
||
|
|
"loss": 0.044,
|
||
|
|
"mean_token_accuracy": 0.9882812506984919,
|
||
|
|
"num_tokens": 201716379.0,
|
||
|
|
"step": 243
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5079116821289062,
|
||
|
|
"epoch": 2.8045977011494254,
|
||
|
|
"grad_norm": 8.098700249722082,
|
||
|
|
"learning_rate": 2.9964761666143638e-06,
|
||
|
|
"loss": 0.0352,
|
||
|
|
"mean_token_accuracy": 0.9869791674427688,
|
||
|
|
"num_tokens": 202537011.0,
|
||
|
|
"step": 244
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5113906860351562,
|
||
|
|
"epoch": 2.8160919540229887,
|
||
|
|
"grad_norm": 6.415054459447643,
|
||
|
|
"learning_rate": 2.980915665078153e-06,
|
||
|
|
"loss": 0.0494,
|
||
|
|
"mean_token_accuracy": 0.9817708344198763,
|
||
|
|
"num_tokens": 203368643.0,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.52069091796875,
|
||
|
|
"epoch": 2.8275862068965516,
|
||
|
|
"grad_norm": 4.0837268974382885,
|
||
|
|
"learning_rate": 2.9653357923290753e-06,
|
||
|
|
"loss": 0.0264,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 204173970.0,
|
||
|
|
"step": 246
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49139404296875,
|
||
|
|
"epoch": 2.839080459770115,
|
||
|
|
"grad_norm": 11.87980766761474,
|
||
|
|
"learning_rate": 2.949737175922135e-06,
|
||
|
|
"loss": 0.0641,
|
||
|
|
"mean_token_accuracy": 0.9739583348855376,
|
||
|
|
"num_tokens": 205069359.0,
|
||
|
|
"step": 247
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5181503295898438,
|
||
|
|
"epoch": 2.8505747126436782,
|
||
|
|
"grad_norm": 5.55092789641183,
|
||
|
|
"learning_rate": 2.9341204441673267e-06,
|
||
|
|
"loss": 0.0569,
|
||
|
|
"mean_token_accuracy": 0.9830729176755995,
|
||
|
|
"num_tokens": 205863015.0,
|
||
|
|
"step": 248
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5153961181640625,
|
||
|
|
"epoch": 2.862068965517241,
|
||
|
|
"grad_norm": 11.249207750194575,
|
||
|
|
"learning_rate": 2.9184862261043272e-06,
|
||
|
|
"loss": 0.0652,
|
||
|
|
"mean_token_accuracy": 0.977864584652707,
|
||
|
|
"num_tokens": 206693116.0,
|
||
|
|
"step": 249
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5162200927734375,
|
||
|
|
"epoch": 2.873563218390805,
|
||
|
|
"grad_norm": 11.262000025655086,
|
||
|
|
"learning_rate": 2.902835151477161e-06,
|
||
|
|
"loss": 0.0772,
|
||
|
|
"mean_token_accuracy": 0.9700520851183683,
|
||
|
|
"num_tokens": 207535238.0,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49965667724609375,
|
||
|
|
"epoch": 2.8850574712643677,
|
||
|
|
"grad_norm": 5.331206524136488,
|
||
|
|
"learning_rate": 2.887167850708831e-06,
|
||
|
|
"loss": 0.0595,
|
||
|
|
"mean_token_accuracy": 0.9817708344198763,
|
||
|
|
"num_tokens": 208414225.0,
|
||
|
|
"step": 251
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.509613037109375,
|
||
|
|
"epoch": 2.896551724137931,
|
||
|
|
"grad_norm": 10.395838999195405,
|
||
|
|
"learning_rate": 2.8714849548759293e-06,
|
||
|
|
"loss": 0.0749,
|
||
|
|
"mean_token_accuracy": 0.9739583348855376,
|
||
|
|
"num_tokens": 209267716.0,
|
||
|
|
"step": 252
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5212478637695312,
|
||
|
|
"epoch": 2.9080459770114944,
|
||
|
|
"grad_norm": 3.4925992718532903,
|
||
|
|
"learning_rate": 2.8557870956832135e-06,
|
||
|
|
"loss": 0.0441,
|
||
|
|
"mean_token_accuracy": 0.9882812506984919,
|
||
|
|
"num_tokens": 210079116.0,
|
||
|
|
"step": 253
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.52960205078125,
|
||
|
|
"epoch": 2.9195402298850572,
|
||
|
|
"grad_norm": 3.0763445569145795,
|
||
|
|
"learning_rate": 2.840074905438161e-06,
|
||
|
|
"loss": 0.0356,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 210868372.0,
|
||
|
|
"step": 254
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.509857177734375,
|
||
|
|
"epoch": 2.9310344827586206,
|
||
|
|
"grad_norm": 6.085180472974132,
|
||
|
|
"learning_rate": 2.8243490170255046e-06,
|
||
|
|
"loss": 0.0374,
|
||
|
|
"mean_token_accuracy": 0.9843750009313226,
|
||
|
|
"num_tokens": 211719378.0,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.52435302734375,
|
||
|
|
"epoch": 2.942528735632184,
|
||
|
|
"grad_norm": 4.342033965891391,
|
||
|
|
"learning_rate": 2.808610063881737e-06,
|
||
|
|
"loss": 0.0367,
|
||
|
|
"mean_token_accuracy": 0.9882812506984919,
|
||
|
|
"num_tokens": 212504744.0,
|
||
|
|
"step": 256
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5114364624023438,
|
||
|
|
"epoch": 2.954022988505747,
|
||
|
|
"grad_norm": 4.53595707345555,
|
||
|
|
"learning_rate": 2.792858679969596e-06,
|
||
|
|
"loss": 0.041,
|
||
|
|
"mean_token_accuracy": 0.9869791674427688,
|
||
|
|
"num_tokens": 213336715.0,
|
||
|
|
"step": 257
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5150146484375,
|
||
|
|
"epoch": 2.9655172413793105,
|
||
|
|
"grad_norm": 3.532952348106941,
|
||
|
|
"learning_rate": 2.7770954997525277e-06,
|
||
|
|
"loss": 0.0301,
|
||
|
|
"mean_token_accuracy": 0.9908854172099382,
|
||
|
|
"num_tokens": 214157360.0,
|
||
|
|
"step": 258
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5124893188476562,
|
||
|
|
"epoch": 2.9770114942528734,
|
||
|
|
"grad_norm": 5.374052711219377,
|
||
|
|
"learning_rate": 2.761321158169134e-06,
|
||
|
|
"loss": 0.0333,
|
||
|
|
"mean_token_accuracy": 0.9869791674427688,
|
||
|
|
"num_tokens": 214981518.0,
|
||
|
|
"step": 259
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5047225952148438,
|
||
|
|
"epoch": 2.9885057471264367,
|
||
|
|
"grad_norm": 6.297842060515772,
|
||
|
|
"learning_rate": 2.745536290607593e-06,
|
||
|
|
"loss": 0.0416,
|
||
|
|
"mean_token_accuracy": 0.9856770841870457,
|
||
|
|
"num_tokens": 215835954.0,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5225372314453125,
|
||
|
|
"epoch": 3.0,
|
||
|
|
"grad_norm": 5.830015018464227,
|
||
|
|
"learning_rate": 2.729741532880069e-06,
|
||
|
|
"loss": 0.0578,
|
||
|
|
"mean_token_accuracy": 0.9856770841870457,
|
||
|
|
"num_tokens": 216642134.0,
|
||
|
|
"step": 261
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5021514892578125,
|
||
|
|
"epoch": 3.0114942528735633,
|
||
|
|
"grad_norm": 2.477209280475651,
|
||
|
|
"learning_rate": 2.7139375211971e-06,
|
||
|
|
"loss": 0.0143,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 217503099.0,
|
||
|
|
"step": 262
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5126724243164062,
|
||
|
|
"epoch": 3.0229885057471266,
|
||
|
|
"grad_norm": 3.8701219743611466,
|
||
|
|
"learning_rate": 2.6981248921419713e-06,
|
||
|
|
"loss": 0.0228,
|
||
|
|
"mean_token_accuracy": 0.9921875004656613,
|
||
|
|
"num_tokens": 218333703.0,
|
||
|
|
"step": 263
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5178909301757812,
|
||
|
|
"epoch": 3.0344827586206895,
|
||
|
|
"grad_norm": 2.7967744739077642,
|
||
|
|
"learning_rate": 2.682304282645077e-06,
|
||
|
|
"loss": 0.0163,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 219141954.0,
|
||
|
|
"step": 264
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5064697265625,
|
||
|
|
"epoch": 3.045977011494253,
|
||
|
|
"grad_norm": 2.8493437564285675,
|
||
|
|
"learning_rate": 2.66647632995826e-06,
|
||
|
|
"loss": 0.0153,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 219981675.0,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5040740966796875,
|
||
|
|
"epoch": 3.057471264367816,
|
||
|
|
"grad_norm": 3.39106083775523,
|
||
|
|
"learning_rate": 2.6506416716291466e-06,
|
||
|
|
"loss": 0.0182,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 220814651.0,
|
||
|
|
"step": 266
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5218353271484375,
|
||
|
|
"epoch": 3.0689655172413794,
|
||
|
|
"grad_norm": 3.245992716985913,
|
||
|
|
"learning_rate": 2.634800945475465e-06,
|
||
|
|
"loss": 0.0359,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 221625766.0,
|
||
|
|
"step": 267
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.515411376953125,
|
||
|
|
"epoch": 3.0804597701149423,
|
||
|
|
"grad_norm": 3.4758561756981003,
|
||
|
|
"learning_rate": 2.6189547895593565e-06,
|
||
|
|
"loss": 0.0272,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 222429577.0,
|
||
|
|
"step": 268
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5009841918945312,
|
||
|
|
"epoch": 3.0919540229885056,
|
||
|
|
"grad_norm": 5.9657298113817445,
|
||
|
|
"learning_rate": 2.6031038421616684e-06,
|
||
|
|
"loss": 0.0216,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 223278965.0,
|
||
|
|
"step": 269
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5102462768554688,
|
||
|
|
"epoch": 3.103448275862069,
|
||
|
|
"grad_norm": 4.077413121693344,
|
||
|
|
"learning_rate": 2.587248741756253e-06,
|
||
|
|
"loss": 0.021,
|
||
|
|
"mean_token_accuracy": 0.9921875004656613,
|
||
|
|
"num_tokens": 224122111.0,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5262985229492188,
|
||
|
|
"epoch": 3.1149425287356323,
|
||
|
|
"grad_norm": 4.99945914752149,
|
||
|
|
"learning_rate": 2.5713901269842405e-06,
|
||
|
|
"loss": 0.0286,
|
||
|
|
"mean_token_accuracy": 0.989583333954215,
|
||
|
|
"num_tokens": 224930403.0,
|
||
|
|
"step": 271
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4971160888671875,
|
||
|
|
"epoch": 3.1264367816091956,
|
||
|
|
"grad_norm": 4.210562762990907,
|
||
|
|
"learning_rate": 2.555528636628324e-06,
|
||
|
|
"loss": 0.0334,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 225800782.0,
|
||
|
|
"step": 272
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5353775024414062,
|
||
|
|
"epoch": 3.1379310344827585,
|
||
|
|
"grad_norm": 3.3827078358399234,
|
||
|
|
"learning_rate": 2.53966490958702e-06,
|
||
|
|
"loss": 0.0173,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 226565726.0,
|
||
|
|
"step": 273
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5124588012695312,
|
||
|
|
"epoch": 3.1494252873563218,
|
||
|
|
"grad_norm": 3.6515337585771617,
|
||
|
|
"learning_rate": 2.5237995848489422e-06,
|
||
|
|
"loss": 0.0204,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 227384084.0,
|
||
|
|
"step": 274
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5161056518554688,
|
||
|
|
"epoch": 3.160919540229885,
|
||
|
|
"grad_norm": 2.5107176292809523,
|
||
|
|
"learning_rate": 2.507933301467056e-06,
|
||
|
|
"loss": 0.0118,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 228221106.0,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5079498291015625,
|
||
|
|
"epoch": 3.1724137931034484,
|
||
|
|
"grad_norm": 4.072490672739514,
|
||
|
|
"learning_rate": 2.4920666985329446e-06,
|
||
|
|
"loss": 0.0231,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 229063105.0,
|
||
|
|
"step": 276
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5137939453125,
|
||
|
|
"epoch": 3.1839080459770113,
|
||
|
|
"grad_norm": 4.772256728134127,
|
||
|
|
"learning_rate": 2.4762004151510586e-06,
|
||
|
|
"loss": 0.0272,
|
||
|
|
"mean_token_accuracy": 0.989583333954215,
|
||
|
|
"num_tokens": 229887163.0,
|
||
|
|
"step": 277
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5074691772460938,
|
||
|
|
"epoch": 3.1954022988505746,
|
||
|
|
"grad_norm": 2.5146705491674632,
|
||
|
|
"learning_rate": 2.4603350904129802e-06,
|
||
|
|
"loss": 0.0165,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 230714460.0,
|
||
|
|
"step": 278
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5012588500976562,
|
||
|
|
"epoch": 3.206896551724138,
|
||
|
|
"grad_norm": 3.4523116738901187,
|
||
|
|
"learning_rate": 2.4444713633716764e-06,
|
||
|
|
"loss": 0.0214,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 231546214.0,
|
||
|
|
"step": 279
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5137557983398438,
|
||
|
|
"epoch": 3.218390804597701,
|
||
|
|
"grad_norm": 4.103312747798813,
|
||
|
|
"learning_rate": 2.42860987301576e-06,
|
||
|
|
"loss": 0.0147,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 232367417.0,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5189132690429688,
|
||
|
|
"epoch": 3.2298850574712645,
|
||
|
|
"grad_norm": 3.0711187257517323,
|
||
|
|
"learning_rate": 2.4127512582437486e-06,
|
||
|
|
"loss": 0.0197,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 233171390.0,
|
||
|
|
"step": 281
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4828948974609375,
|
||
|
|
"epoch": 3.2413793103448274,
|
||
|
|
"grad_norm": 4.593361566496384,
|
||
|
|
"learning_rate": 2.3968961578383324e-06,
|
||
|
|
"loss": 0.0242,
|
||
|
|
"mean_token_accuracy": 0.9921875004656613,
|
||
|
|
"num_tokens": 234048614.0,
|
||
|
|
"step": 282
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49707794189453125,
|
||
|
|
"epoch": 3.2528735632183907,
|
||
|
|
"grad_norm": 4.369720789995865,
|
||
|
|
"learning_rate": 2.3810452104406444e-06,
|
||
|
|
"loss": 0.0181,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 234880964.0,
|
||
|
|
"step": 283
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49500274658203125,
|
||
|
|
"epoch": 3.264367816091954,
|
||
|
|
"grad_norm": 4.068902757291551,
|
||
|
|
"learning_rate": 2.3651990545245357e-06,
|
||
|
|
"loss": 0.0188,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 235742677.0,
|
||
|
|
"step": 284
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5010452270507812,
|
||
|
|
"epoch": 3.2758620689655173,
|
||
|
|
"grad_norm": 4.533741620441784,
|
||
|
|
"learning_rate": 2.3493583283708542e-06,
|
||
|
|
"loss": 0.0197,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 236562341.0,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5067520141601562,
|
||
|
|
"epoch": 3.2873563218390807,
|
||
|
|
"grad_norm": 7.309205026590191,
|
||
|
|
"learning_rate": 2.3335236700417404e-06,
|
||
|
|
"loss": 0.0294,
|
||
|
|
"mean_token_accuracy": 0.9921875004656613,
|
||
|
|
"num_tokens": 237376240.0,
|
||
|
|
"step": 286
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49813079833984375,
|
||
|
|
"epoch": 3.2988505747126435,
|
||
|
|
"grad_norm": 6.265890191493295,
|
||
|
|
"learning_rate": 2.3176957173549236e-06,
|
||
|
|
"loss": 0.0179,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 238205161.0,
|
||
|
|
"step": 287
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5123062133789062,
|
||
|
|
"epoch": 3.310344827586207,
|
||
|
|
"grad_norm": 5.357876728974506,
|
||
|
|
"learning_rate": 2.3018751078580287e-06,
|
||
|
|
"loss": 0.0301,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 239018120.0,
|
||
|
|
"step": 288
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.50250244140625,
|
||
|
|
"epoch": 3.32183908045977,
|
||
|
|
"grad_norm": 5.842965328064792,
|
||
|
|
"learning_rate": 2.2860624788029013e-06,
|
||
|
|
"loss": 0.0192,
|
||
|
|
"mean_token_accuracy": 0.9908854172099382,
|
||
|
|
"num_tokens": 239830755.0,
|
||
|
|
"step": 289
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48967742919921875,
|
||
|
|
"epoch": 3.3333333333333335,
|
||
|
|
"grad_norm": 6.615246255894702,
|
||
|
|
"learning_rate": 2.2702584671199317e-06,
|
||
|
|
"loss": 0.0152,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 240669859.0,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49662017822265625,
|
||
|
|
"epoch": 3.344827586206897,
|
||
|
|
"grad_norm": 2.6840632671541362,
|
||
|
|
"learning_rate": 2.2544637093924072e-06,
|
||
|
|
"loss": 0.0181,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 241488468.0,
|
||
|
|
"step": 291
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49695587158203125,
|
||
|
|
"epoch": 3.3563218390804597,
|
||
|
|
"grad_norm": 6.090789224137067,
|
||
|
|
"learning_rate": 2.238678841830867e-06,
|
||
|
|
"loss": 0.0184,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 242306679.0,
|
||
|
|
"step": 292
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5002517700195312,
|
||
|
|
"epoch": 3.367816091954023,
|
||
|
|
"grad_norm": 9.067054425714629,
|
||
|
|
"learning_rate": 2.2229045002474727e-06,
|
||
|
|
"loss": 0.028,
|
||
|
|
"mean_token_accuracy": 0.9869791674427688,
|
||
|
|
"num_tokens": 243130959.0,
|
||
|
|
"step": 293
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4901580810546875,
|
||
|
|
"epoch": 3.3793103448275863,
|
||
|
|
"grad_norm": 6.6957071542612425,
|
||
|
|
"learning_rate": 2.2071413200304046e-06,
|
||
|
|
"loss": 0.021,
|
||
|
|
"mean_token_accuracy": 0.9921875004656613,
|
||
|
|
"num_tokens": 243963959.0,
|
||
|
|
"step": 294
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49200439453125,
|
||
|
|
"epoch": 3.3908045977011496,
|
||
|
|
"grad_norm": 5.790677099390474,
|
||
|
|
"learning_rate": 2.1913899361182634e-06,
|
||
|
|
"loss": 0.0217,
|
||
|
|
"mean_token_accuracy": 0.989583333954215,
|
||
|
|
"num_tokens": 244789315.0,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48865509033203125,
|
||
|
|
"epoch": 3.4022988505747125,
|
||
|
|
"grad_norm": 6.64954441446596,
|
||
|
|
"learning_rate": 2.1756509829744958e-06,
|
||
|
|
"loss": 0.0309,
|
||
|
|
"mean_token_accuracy": 0.989583333954215,
|
||
|
|
"num_tokens": 245647468.0,
|
||
|
|
"step": 296
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49068450927734375,
|
||
|
|
"epoch": 3.413793103448276,
|
||
|
|
"grad_norm": 5.000268942128174,
|
||
|
|
"learning_rate": 2.1599250945618404e-06,
|
||
|
|
"loss": 0.031,
|
||
|
|
"mean_token_accuracy": 0.989583333954215,
|
||
|
|
"num_tokens": 246506392.0,
|
||
|
|
"step": 297
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5043487548828125,
|
||
|
|
"epoch": 3.425287356321839,
|
||
|
|
"grad_norm": 6.690882034959598,
|
||
|
|
"learning_rate": 2.1442129043167877e-06,
|
||
|
|
"loss": 0.0247,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 247317246.0,
|
||
|
|
"step": 298
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48810577392578125,
|
||
|
|
"epoch": 3.4367816091954024,
|
||
|
|
"grad_norm": 7.4751141517356166,
|
||
|
|
"learning_rate": 2.128515045124071e-06,
|
||
|
|
"loss": 0.0303,
|
||
|
|
"mean_token_accuracy": 0.9869791674427688,
|
||
|
|
"num_tokens": 248176152.0,
|
||
|
|
"step": 299
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48333740234375,
|
||
|
|
"epoch": 3.4482758620689653,
|
||
|
|
"grad_norm": 2.8897721353550296,
|
||
|
|
"learning_rate": 2.1128321492911697e-06,
|
||
|
|
"loss": 0.0092,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 249019061.0,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48685455322265625,
|
||
|
|
"epoch": 3.4597701149425286,
|
||
|
|
"grad_norm": 3.6616455523506777,
|
||
|
|
"learning_rate": 2.0971648485228404e-06,
|
||
|
|
"loss": 0.0231,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 249879936.0,
|
||
|
|
"step": 301
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4935455322265625,
|
||
|
|
"epoch": 3.471264367816092,
|
||
|
|
"grad_norm": 2.0624568242402432,
|
||
|
|
"learning_rate": 2.0815137738956736e-06,
|
||
|
|
"loss": 0.015,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 250720828.0,
|
||
|
|
"step": 302
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.485931396484375,
|
||
|
|
"epoch": 3.4827586206896552,
|
||
|
|
"grad_norm": 3.7427616919674827,
|
||
|
|
"learning_rate": 2.0658795558326745e-06,
|
||
|
|
"loss": 0.0199,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 251591480.0,
|
||
|
|
"step": 303
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49802398681640625,
|
||
|
|
"epoch": 3.4942528735632186,
|
||
|
|
"grad_norm": 2.669949059595247,
|
||
|
|
"learning_rate": 2.0502628240778655e-06,
|
||
|
|
"loss": 0.013,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 252425658.0,
|
||
|
|
"step": 304
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4919586181640625,
|
||
|
|
"epoch": 3.5057471264367814,
|
||
|
|
"grad_norm": 4.948406162990609,
|
||
|
|
"learning_rate": 2.034664207670925e-06,
|
||
|
|
"loss": 0.0166,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 253260881.0,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49547576904296875,
|
||
|
|
"epoch": 3.5172413793103448,
|
||
|
|
"grad_norm": 5.26095619649171,
|
||
|
|
"learning_rate": 2.019084334921849e-06,
|
||
|
|
"loss": 0.0165,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 254103428.0,
|
||
|
|
"step": 306
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49622344970703125,
|
||
|
|
"epoch": 3.528735632183908,
|
||
|
|
"grad_norm": 2.0304772167522755,
|
||
|
|
"learning_rate": 2.003523833385637e-06,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 254934799.0,
|
||
|
|
"step": 307
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49666595458984375,
|
||
|
|
"epoch": 3.5402298850574714,
|
||
|
|
"grad_norm": 3.2058148127523083,
|
||
|
|
"learning_rate": 1.987983329837024e-06,
|
||
|
|
"loss": 0.0091,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 255740355.0,
|
||
|
|
"step": 308
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.484710693359375,
|
||
|
|
"epoch": 3.5517241379310347,
|
||
|
|
"grad_norm": 5.056038254220789,
|
||
|
|
"learning_rate": 1.972463450245226e-06,
|
||
|
|
"loss": 0.0201,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 256592207.0,
|
||
|
|
"step": 309
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49091339111328125,
|
||
|
|
"epoch": 3.5632183908045976,
|
||
|
|
"grad_norm": 4.295846823386714,
|
||
|
|
"learning_rate": 1.956964819748731e-06,
|
||
|
|
"loss": 0.0186,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 257394928.0,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.484405517578125,
|
||
|
|
"epoch": 3.574712643678161,
|
||
|
|
"grad_norm": 5.136383467949302,
|
||
|
|
"learning_rate": 1.9414880626301147e-06,
|
||
|
|
"loss": 0.0114,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 258218560.0,
|
||
|
|
"step": 311
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47745513916015625,
|
||
|
|
"epoch": 3.586206896551724,
|
||
|
|
"grad_norm": 1.5987930235387136,
|
||
|
|
"learning_rate": 1.9260338022908972e-06,
|
||
|
|
"loss": 0.0051,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 259076886.0,
|
||
|
|
"step": 312
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.490386962890625,
|
||
|
|
"epoch": 3.5977011494252875,
|
||
|
|
"grad_norm": 5.630174223879903,
|
||
|
|
"learning_rate": 1.9106026612264316e-06,
|
||
|
|
"loss": 0.0157,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 259903431.0,
|
||
|
|
"step": 313
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4911651611328125,
|
||
|
|
"epoch": 3.609195402298851,
|
||
|
|
"grad_norm": 3.594997086155672,
|
||
|
|
"learning_rate": 1.895195261000831e-06,
|
||
|
|
"loss": 0.0148,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 260724089.0,
|
||
|
|
"step": 314
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4875335693359375,
|
||
|
|
"epoch": 3.6206896551724137,
|
||
|
|
"grad_norm": 7.1001251451875955,
|
||
|
|
"learning_rate": 1.8798122222219288e-06,
|
||
|
|
"loss": 0.0182,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 261570168.0,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49361419677734375,
|
||
|
|
"epoch": 3.632183908045977,
|
||
|
|
"grad_norm": 1.8488403064040235,
|
||
|
|
"learning_rate": 1.8644541645162834e-06,
|
||
|
|
"loss": 0.005,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 262377615.0,
|
||
|
|
"step": 316
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47705841064453125,
|
||
|
|
"epoch": 3.6436781609195403,
|
||
|
|
"grad_norm": 8.419399542330668,
|
||
|
|
"learning_rate": 1.84912170650422e-06,
|
||
|
|
"loss": 0.017,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 263233842.0,
|
||
|
|
"step": 317
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4798583984375,
|
||
|
|
"epoch": 3.655172413793103,
|
||
|
|
"grad_norm": 6.948749207718704,
|
||
|
|
"learning_rate": 1.833815465774913e-06,
|
||
|
|
"loss": 0.0214,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 264054307.0,
|
||
|
|
"step": 318
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48322296142578125,
|
||
|
|
"epoch": 3.6666666666666665,
|
||
|
|
"grad_norm": 4.4143868380602544,
|
||
|
|
"learning_rate": 1.818536058861506e-06,
|
||
|
|
"loss": 0.0143,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 264889320.0,
|
||
|
|
"step": 319
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48094940185546875,
|
||
|
|
"epoch": 3.67816091954023,
|
||
|
|
"grad_norm": 7.51629286227342,
|
||
|
|
"learning_rate": 1.803284101216281e-06,
|
||
|
|
"loss": 0.0205,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 265727293.0,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48848724365234375,
|
||
|
|
"epoch": 3.689655172413793,
|
||
|
|
"grad_norm": 4.390187366525572,
|
||
|
|
"learning_rate": 1.7880602071858694e-06,
|
||
|
|
"loss": 0.0248,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 266554947.0,
|
||
|
|
"step": 321
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5014801025390625,
|
||
|
|
"epoch": 3.7011494252873565,
|
||
|
|
"grad_norm": 4.765097706705711,
|
||
|
|
"learning_rate": 1.7728649899865024e-06,
|
||
|
|
"loss": 0.0076,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 267364160.0,
|
||
|
|
"step": 322
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47812652587890625,
|
||
|
|
"epoch": 3.7126436781609193,
|
||
|
|
"grad_norm": 4.601028573147119,
|
||
|
|
"learning_rate": 1.7576990616793139e-06,
|
||
|
|
"loss": 0.0124,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 268207518.0,
|
||
|
|
"step": 323
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5039901733398438,
|
||
|
|
"epoch": 3.7241379310344827,
|
||
|
|
"grad_norm": 9.345319142946543,
|
||
|
|
"learning_rate": 1.7425630331456821e-06,
|
||
|
|
"loss": 0.0297,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 268994732.0,
|
||
|
|
"step": 324
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5049362182617188,
|
||
|
|
"epoch": 3.735632183908046,
|
||
|
|
"grad_norm": 10.813891797521325,
|
||
|
|
"learning_rate": 1.7274575140626318e-06,
|
||
|
|
"loss": 0.0305,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 269789989.0,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4865875244140625,
|
||
|
|
"epoch": 3.7471264367816093,
|
||
|
|
"grad_norm": 3.732890733936591,
|
||
|
|
"learning_rate": 1.7123831128782686e-06,
|
||
|
|
"loss": 0.0166,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 270633429.0,
|
||
|
|
"step": 326
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.487823486328125,
|
||
|
|
"epoch": 3.7586206896551726,
|
||
|
|
"grad_norm": 3.952992845746983,
|
||
|
|
"learning_rate": 1.697340436787273e-06,
|
||
|
|
"loss": 0.0193,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 271480889.0,
|
||
|
|
"step": 327
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49646759033203125,
|
||
|
|
"epoch": 3.7701149425287355,
|
||
|
|
"grad_norm": 6.939199060751946,
|
||
|
|
"learning_rate": 1.6823300917064462e-06,
|
||
|
|
"loss": 0.0143,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 272305320.0,
|
||
|
|
"step": 328
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4934539794921875,
|
||
|
|
"epoch": 3.781609195402299,
|
||
|
|
"grad_norm": 5.40884515086912,
|
||
|
|
"learning_rate": 1.6673526822502982e-06,
|
||
|
|
"loss": 0.0149,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 273117988.0,
|
||
|
|
"step": 329
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48311614990234375,
|
||
|
|
"epoch": 3.793103448275862,
|
||
|
|
"grad_norm": 3.129709097075508,
|
||
|
|
"learning_rate": 1.6524088117066984e-06,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 273990464.0,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.491790771484375,
|
||
|
|
"epoch": 3.8045977011494254,
|
||
|
|
"grad_norm": 3.8874739362962645,
|
||
|
|
"learning_rate": 1.637499082012574e-06,
|
||
|
|
"loss": 0.0139,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 274810040.0,
|
||
|
|
"step": 331
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5032882690429688,
|
||
|
|
"epoch": 3.8160919540229887,
|
||
|
|
"grad_norm": 4.149015091463172,
|
||
|
|
"learning_rate": 1.6226240937296617e-06,
|
||
|
|
"loss": 0.0231,
|
||
|
|
"mean_token_accuracy": 0.9908854172099382,
|
||
|
|
"num_tokens": 275603319.0,
|
||
|
|
"step": 332
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49298858642578125,
|
||
|
|
"epoch": 3.8275862068965516,
|
||
|
|
"grad_norm": 3.6377764732920284,
|
||
|
|
"learning_rate": 1.6077844460203207e-06,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 276446052.0,
|
||
|
|
"step": 333
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5046768188476562,
|
||
|
|
"epoch": 3.839080459770115,
|
||
|
|
"grad_norm": 2.0960585306443353,
|
||
|
|
"learning_rate": 1.5929807366233979e-06,
|
||
|
|
"loss": 0.0077,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 277257667.0,
|
||
|
|
"step": 334
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5001907348632812,
|
||
|
|
"epoch": 3.8505747126436782,
|
||
|
|
"grad_norm": 1.1308830838111201,
|
||
|
|
"learning_rate": 1.5782135618301486e-06,
|
||
|
|
"loss": 0.0102,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 278073776.0,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49394989013671875,
|
||
|
|
"epoch": 3.862068965517241,
|
||
|
|
"grad_norm": 3.0083262718142056,
|
||
|
|
"learning_rate": 1.56348351646022e-06,
|
||
|
|
"loss": 0.007,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 278928254.0,
|
||
|
|
"step": 336
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49828338623046875,
|
||
|
|
"epoch": 3.873563218390805,
|
||
|
|
"grad_norm": 5.123947951886666,
|
||
|
|
"learning_rate": 1.5487911938376925e-06,
|
||
|
|
"loss": 0.0146,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 279751330.0,
|
||
|
|
"step": 337
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.489593505859375,
|
||
|
|
"epoch": 3.8850574712643677,
|
||
|
|
"grad_norm": 4.035251892279188,
|
||
|
|
"learning_rate": 1.5341371857671782e-06,
|
||
|
|
"loss": 0.0109,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 280570344.0,
|
||
|
|
"step": 338
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48815155029296875,
|
||
|
|
"epoch": 3.896551724137931,
|
||
|
|
"grad_norm": 5.476176656071363,
|
||
|
|
"learning_rate": 1.5195220825099863e-06,
|
||
|
|
"loss": 0.0167,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 281410149.0,
|
||
|
|
"step": 339
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48846435546875,
|
||
|
|
"epoch": 3.9080459770114944,
|
||
|
|
"grad_norm": 6.167355254742196,
|
||
|
|
"learning_rate": 1.5049464727603453e-06,
|
||
|
|
"loss": 0.0162,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 282246542.0,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4861907958984375,
|
||
|
|
"epoch": 3.9195402298850572,
|
||
|
|
"grad_norm": 3.2444266602755376,
|
||
|
|
"learning_rate": 1.4904109436216885e-06,
|
||
|
|
"loss": 0.0111,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 283099687.0,
|
||
|
|
"step": 341
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4877471923828125,
|
||
|
|
"epoch": 3.9310344827586206,
|
||
|
|
"grad_norm": 2.976257301458973,
|
||
|
|
"learning_rate": 1.475916080583012e-06,
|
||
|
|
"loss": 0.0087,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 283924877.0,
|
||
|
|
"step": 342
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4852752685546875,
|
||
|
|
"epoch": 3.942528735632184,
|
||
|
|
"grad_norm": 5.111111810610919,
|
||
|
|
"learning_rate": 1.4614624674952843e-06,
|
||
|
|
"loss": 0.0069,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 284789322.0,
|
||
|
|
"step": 343
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.490936279296875,
|
||
|
|
"epoch": 3.954022988505747,
|
||
|
|
"grad_norm": 8.373925102066526,
|
||
|
|
"learning_rate": 1.4470506865479337e-06,
|
||
|
|
"loss": 0.0114,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 285618482.0,
|
||
|
|
"step": 344
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49163818359375,
|
||
|
|
"epoch": 3.9655172413793105,
|
||
|
|
"grad_norm": 1.6045847963496473,
|
||
|
|
"learning_rate": 1.4326813182453959e-06,
|
||
|
|
"loss": 0.0034,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 286442938.0,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4775238037109375,
|
||
|
|
"epoch": 3.9770114942528734,
|
||
|
|
"grad_norm": 6.809758589293284,
|
||
|
|
"learning_rate": 1.4183549413837288e-06,
|
||
|
|
"loss": 0.0166,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 287300227.0,
|
||
|
|
"step": 346
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49134063720703125,
|
||
|
|
"epoch": 3.9885057471264367,
|
||
|
|
"grad_norm": 2.7969846655833166,
|
||
|
|
"learning_rate": 1.4040721330273063e-06,
|
||
|
|
"loss": 0.0179,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 288119497.0,
|
||
|
|
"step": 347
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.478973388671875,
|
||
|
|
"epoch": 4.0,
|
||
|
|
"grad_norm": 4.820339118038163,
|
||
|
|
"learning_rate": 1.3898334684855647e-06,
|
||
|
|
"loss": 0.0223,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 288974058.0,
|
||
|
|
"step": 348
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.486541748046875,
|
||
|
|
"epoch": 4.011494252873563,
|
||
|
|
"grad_norm": 2.0004363087793915,
|
||
|
|
"learning_rate": 1.375639521289836e-06,
|
||
|
|
"loss": 0.0036,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 289813648.0,
|
||
|
|
"step": 349
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49737548828125,
|
||
|
|
"epoch": 4.022988505747127,
|
||
|
|
"grad_norm": 5.139419062447743,
|
||
|
|
"learning_rate": 1.3614908631702435e-06,
|
||
|
|
"loss": 0.0061,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 290636567.0,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49163818359375,
|
||
|
|
"epoch": 4.0344827586206895,
|
||
|
|
"grad_norm": 0.788318794526319,
|
||
|
|
"learning_rate": 1.3473880640326725e-06,
|
||
|
|
"loss": 0.0026,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 291479050.0,
|
||
|
|
"step": 351
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48863983154296875,
|
||
|
|
"epoch": 4.045977011494253,
|
||
|
|
"grad_norm": 2.0230698337679174,
|
||
|
|
"learning_rate": 1.3333316919358159e-06,
|
||
|
|
"loss": 0.0037,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 292300777.0,
|
||
|
|
"step": 352
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49753570556640625,
|
||
|
|
"epoch": 4.057471264367816,
|
||
|
|
"grad_norm": 0.6796429821645044,
|
||
|
|
"learning_rate": 1.3193223130682937e-06,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 293090581.0,
|
||
|
|
"step": 353
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47222900390625,
|
||
|
|
"epoch": 4.068965517241379,
|
||
|
|
"grad_norm": 6.014376626856276,
|
||
|
|
"learning_rate": 1.3053604917258428e-06,
|
||
|
|
"loss": 0.0207,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 293972611.0,
|
||
|
|
"step": 354
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4964752197265625,
|
||
|
|
"epoch": 4.080459770114943,
|
||
|
|
"grad_norm": 0.6912483480930615,
|
||
|
|
"learning_rate": 1.2914467902885902e-06,
|
||
|
|
"loss": 0.0111,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 294794906.0,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4761962890625,
|
||
|
|
"epoch": 4.091954022988506,
|
||
|
|
"grad_norm": 3.135946056259954,
|
||
|
|
"learning_rate": 1.2775817691984032e-06,
|
||
|
|
"loss": 0.0133,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 295656117.0,
|
||
|
|
"step": 356
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4886016845703125,
|
||
|
|
"epoch": 4.103448275862069,
|
||
|
|
"grad_norm": 1.7539444927331345,
|
||
|
|
"learning_rate": 1.2637659869363085e-06,
|
||
|
|
"loss": 0.0117,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 296488812.0,
|
||
|
|
"step": 357
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.486907958984375,
|
||
|
|
"epoch": 4.114942528735632,
|
||
|
|
"grad_norm": 1.3961250048157883,
|
||
|
|
"learning_rate": 1.2500000000000007e-06,
|
||
|
|
"loss": 0.0043,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 297344919.0,
|
||
|
|
"step": 358
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47713470458984375,
|
||
|
|
"epoch": 4.126436781609195,
|
||
|
|
"grad_norm": 1.0425696217158198,
|
||
|
|
"learning_rate": 1.2362843628814267e-06,
|
||
|
|
"loss": 0.0031,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 298218199.0,
|
||
|
|
"step": 359
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4909515380859375,
|
||
|
|
"epoch": 4.137931034482759,
|
||
|
|
"grad_norm": 1.5089572088275034,
|
||
|
|
"learning_rate": 1.222619628044449e-06,
|
||
|
|
"loss": 0.0064,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 299059349.0,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5058059692382812,
|
||
|
|
"epoch": 4.149425287356322,
|
||
|
|
"grad_norm": 3.939531795615367,
|
||
|
|
"learning_rate": 1.2090063459025956e-06,
|
||
|
|
"loss": 0.0114,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 299836899.0,
|
||
|
|
"step": 361
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5086669921875,
|
||
|
|
"epoch": 4.160919540229885,
|
||
|
|
"grad_norm": 5.465372506142496,
|
||
|
|
"learning_rate": 1.1954450647968856e-06,
|
||
|
|
"loss": 0.0159,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 300629307.0,
|
||
|
|
"step": 362
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49275970458984375,
|
||
|
|
"epoch": 4.172413793103448,
|
||
|
|
"grad_norm": 1.2454373437193944,
|
||
|
|
"learning_rate": 1.181936330973744e-06,
|
||
|
|
"loss": 0.0089,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 301434572.0,
|
||
|
|
"step": 363
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48398590087890625,
|
||
|
|
"epoch": 4.183908045977011,
|
||
|
|
"grad_norm": 0.8322721421543683,
|
||
|
|
"learning_rate": 1.1684806885630003e-06,
|
||
|
|
"loss": 0.0108,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 302271254.0,
|
||
|
|
"step": 364
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49010467529296875,
|
||
|
|
"epoch": 4.195402298850575,
|
||
|
|
"grad_norm": 0.7220174084022981,
|
||
|
|
"learning_rate": 1.155078679555969e-06,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 303121365.0,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4936981201171875,
|
||
|
|
"epoch": 4.206896551724138,
|
||
|
|
"grad_norm": 1.6057246636473406,
|
||
|
|
"learning_rate": 1.1417308437836181e-06,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 303940576.0,
|
||
|
|
"step": 366
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48703765869140625,
|
||
|
|
"epoch": 4.218390804597701,
|
||
|
|
"grad_norm": 3.260885681236724,
|
||
|
|
"learning_rate": 1.1284377188948258e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 304779295.0,
|
||
|
|
"step": 367
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.486724853515625,
|
||
|
|
"epoch": 4.2298850574712645,
|
||
|
|
"grad_norm": 1.637009339887056,
|
||
|
|
"learning_rate": 1.1151998403347245e-06,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 305615268.0,
|
||
|
|
"step": 368
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48380279541015625,
|
||
|
|
"epoch": 4.241379310344827,
|
||
|
|
"grad_norm": 3.577848153648759,
|
||
|
|
"learning_rate": 1.1020177413231334e-06,
|
||
|
|
"loss": 0.0078,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 306462446.0,
|
||
|
|
"step": 369
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48940277099609375,
|
||
|
|
"epoch": 4.252873563218391,
|
||
|
|
"grad_norm": 2.527345402012226,
|
||
|
|
"learning_rate": 1.0888919528330778e-06,
|
||
|
|
"loss": 0.0067,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 307285095.0,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.498809814453125,
|
||
|
|
"epoch": 4.264367816091954,
|
||
|
|
"grad_norm": 0.7591305134467865,
|
||
|
|
"learning_rate": 1.0758230035694031e-06,
|
||
|
|
"loss": 0.0029,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 308089063.0,
|
||
|
|
"step": 371
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47843170166015625,
|
||
|
|
"epoch": 4.275862068965517,
|
||
|
|
"grad_norm": 1.3490864229676036,
|
||
|
|
"learning_rate": 1.062811419947482e-06,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 308928800.0,
|
||
|
|
"step": 372
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5036239624023438,
|
||
|
|
"epoch": 4.287356321839081,
|
||
|
|
"grad_norm": 1.4383283661296111,
|
||
|
|
"learning_rate": 1.049857726072005e-06,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 309730211.0,
|
||
|
|
"step": 373
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4854278564453125,
|
||
|
|
"epoch": 4.2988505747126435,
|
||
|
|
"grad_norm": 6.081241592780342,
|
||
|
|
"learning_rate": 1.036962443715872e-06,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 310582388.0,
|
||
|
|
"step": 374
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47245025634765625,
|
||
|
|
"epoch": 4.310344827586207,
|
||
|
|
"grad_norm": 4.195343639160105,
|
||
|
|
"learning_rate": 1.0241260922991761e-06,
|
||
|
|
"loss": 0.0109,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 311440338.0,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48504638671875,
|
||
|
|
"epoch": 4.32183908045977,
|
||
|
|
"grad_norm": 1.4487989469262141,
|
||
|
|
"learning_rate": 1.0113491888682802e-06,
|
||
|
|
"loss": 0.0036,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 312258986.0,
|
||
|
|
"step": 376
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.492950439453125,
|
||
|
|
"epoch": 4.333333333333333,
|
||
|
|
"grad_norm": 4.518532952463173,
|
||
|
|
"learning_rate": 9.986322480749926e-07,
|
||
|
|
"loss": 0.0067,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 313058956.0,
|
||
|
|
"step": 377
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4766845703125,
|
||
|
|
"epoch": 4.344827586206897,
|
||
|
|
"grad_norm": 6.1354639633433115,
|
||
|
|
"learning_rate": 9.85975782155834e-07,
|
||
|
|
"loss": 0.0233,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 313917084.0,
|
||
|
|
"step": 378
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48448944091796875,
|
||
|
|
"epoch": 4.35632183908046,
|
||
|
|
"grad_norm": 6.820421188466488,
|
||
|
|
"learning_rate": 9.733803009114045e-07,
|
||
|
|
"loss": 0.0025,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 314751178.0,
|
||
|
|
"step": 379
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5024642944335938,
|
||
|
|
"epoch": 4.3678160919540225,
|
||
|
|
"grad_norm": 3.825664202457435,
|
||
|
|
"learning_rate": 9.608463116858544e-07,
|
||
|
|
"loss": 0.006,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 315533124.0,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.489990234375,
|
||
|
|
"epoch": 4.379310344827586,
|
||
|
|
"grad_norm": 0.3606406189949594,
|
||
|
|
"learning_rate": 9.483743193464409e-07,
|
||
|
|
"loss": 0.0021,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 316351070.0,
|
||
|
|
"step": 381
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4849853515625,
|
||
|
|
"epoch": 4.390804597701149,
|
||
|
|
"grad_norm": 4.022953332605156,
|
||
|
|
"learning_rate": 9.359648262631962e-07,
|
||
|
|
"loss": 0.0076,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 317193504.0,
|
||
|
|
"step": 382
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4813385009765625,
|
||
|
|
"epoch": 4.402298850574713,
|
||
|
|
"grad_norm": 0.3387422920732581,
|
||
|
|
"learning_rate": 9.236183322886946e-07,
|
||
|
|
"loss": 0.002,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 318022047.0,
|
||
|
|
"step": 383
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48401641845703125,
|
||
|
|
"epoch": 4.413793103448276,
|
||
|
|
"grad_norm": 2.361175377691583,
|
||
|
|
"learning_rate": 9.113353347379097e-07,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 318833334.0,
|
||
|
|
"step": 384
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48177337646484375,
|
||
|
|
"epoch": 4.425287356321839,
|
||
|
|
"grad_norm": 3.210148443515568,
|
||
|
|
"learning_rate": 8.991163283681945e-07,
|
||
|
|
"loss": 0.0031,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 319670458.0,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4922943115234375,
|
||
|
|
"epoch": 4.436781609195402,
|
||
|
|
"grad_norm": 0.8534452783712402,
|
||
|
|
"learning_rate": 8.869618053593429e-07,
|
||
|
|
"loss": 0.0023,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 320463804.0,
|
||
|
|
"step": 386
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4735107421875,
|
||
|
|
"epoch": 4.448275862068965,
|
||
|
|
"grad_norm": 1.2810358599901657,
|
||
|
|
"learning_rate": 8.748722552937688e-07,
|
||
|
|
"loss": 0.0023,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 321305315.0,
|
||
|
|
"step": 387
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49753570556640625,
|
||
|
|
"epoch": 4.459770114942529,
|
||
|
|
"grad_norm": 0.3022749679325425,
|
||
|
|
"learning_rate": 8.628481651367876e-07,
|
||
|
|
"loss": 0.0018,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 322115329.0,
|
||
|
|
"step": 388
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.478118896484375,
|
||
|
|
"epoch": 4.471264367816092,
|
||
|
|
"grad_norm": 3.9075388394359165,
|
||
|
|
"learning_rate": 8.508900192169964e-07,
|
||
|
|
"loss": 0.0155,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 322943394.0,
|
||
|
|
"step": 389
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48065948486328125,
|
||
|
|
"epoch": 4.482758620689655,
|
||
|
|
"grad_norm": 2.5853524966903825,
|
||
|
|
"learning_rate": 8.389982992067688e-07,
|
||
|
|
"loss": 0.0091,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 323762317.0,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47959136962890625,
|
||
|
|
"epoch": 4.494252873563219,
|
||
|
|
"grad_norm": 0.31803299256569667,
|
||
|
|
"learning_rate": 8.271734841028553e-07,
|
||
|
|
"loss": 0.0016,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 324614783.0,
|
||
|
|
"step": 391
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46380615234375,
|
||
|
|
"epoch": 4.505747126436781,
|
||
|
|
"grad_norm": 1.4702478415777527,
|
||
|
|
"learning_rate": 8.154160502070804e-07,
|
||
|
|
"loss": 0.0023,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 325487590.0,
|
||
|
|
"step": 392
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4808197021484375,
|
||
|
|
"epoch": 4.517241379310345,
|
||
|
|
"grad_norm": 0.3626143146323085,
|
||
|
|
"learning_rate": 8.037264711071699e-07,
|
||
|
|
"loss": 0.0017,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 326309233.0,
|
||
|
|
"step": 393
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48209381103515625,
|
||
|
|
"epoch": 4.528735632183908,
|
||
|
|
"grad_norm": 4.1348271243110215,
|
||
|
|
"learning_rate": 7.921052176576643e-07,
|
||
|
|
"loss": 0.0108,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 327109254.0,
|
||
|
|
"step": 394
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4883575439453125,
|
||
|
|
"epoch": 4.540229885057471,
|
||
|
|
"grad_norm": 1.7488698968120222,
|
||
|
|
"learning_rate": 7.805527579609575e-07,
|
||
|
|
"loss": 0.0102,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 327899654.0,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4583587646484375,
|
||
|
|
"epoch": 4.551724137931035,
|
||
|
|
"grad_norm": 1.0455651427697132,
|
||
|
|
"learning_rate": 7.690695573484433e-07,
|
||
|
|
"loss": 0.0103,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 328801476.0,
|
||
|
|
"step": 396
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.481842041015625,
|
||
|
|
"epoch": 4.563218390804598,
|
||
|
|
"grad_norm": 0.7387887077265088,
|
||
|
|
"learning_rate": 7.576560783617667e-07,
|
||
|
|
"loss": 0.0021,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 329628649.0,
|
||
|
|
"step": 397
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4767913818359375,
|
||
|
|
"epoch": 4.574712643678161,
|
||
|
|
"grad_norm": 7.742789771491273,
|
||
|
|
"learning_rate": 7.463127807341966e-07,
|
||
|
|
"loss": 0.0095,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 330484216.0,
|
||
|
|
"step": 398
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47850799560546875,
|
||
|
|
"epoch": 4.586206896551724,
|
||
|
|
"grad_norm": 8.42931358748137,
|
||
|
|
"learning_rate": 7.35040121372109e-07,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 331339275.0,
|
||
|
|
"step": 399
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4886627197265625,
|
||
|
|
"epoch": 4.597701149425287,
|
||
|
|
"grad_norm": 4.673599280994888,
|
||
|
|
"learning_rate": 7.238385543365783e-07,
|
||
|
|
"loss": 0.0076,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 332156313.0,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4760284423828125,
|
||
|
|
"epoch": 4.609195402298851,
|
||
|
|
"grad_norm": 2.668169707423765,
|
||
|
|
"learning_rate": 7.127085308250914e-07,
|
||
|
|
"loss": 0.0026,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 332983166.0,
|
||
|
|
"step": 401
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49083709716796875,
|
||
|
|
"epoch": 4.620689655172414,
|
||
|
|
"grad_norm": 0.26151093544975906,
|
||
|
|
"learning_rate": 7.016504991533727e-07,
|
||
|
|
"loss": 0.0016,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 333809756.0,
|
||
|
|
"step": 402
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4631195068359375,
|
||
|
|
"epoch": 4.6321839080459775,
|
||
|
|
"grad_norm": 5.601919909459552,
|
||
|
|
"learning_rate": 6.906649047373246e-07,
|
||
|
|
"loss": 0.0081,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 334706440.0,
|
||
|
|
"step": 403
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48708343505859375,
|
||
|
|
"epoch": 4.64367816091954,
|
||
|
|
"grad_norm": 1.1889732572236726,
|
||
|
|
"learning_rate": 6.797521900750897e-07,
|
||
|
|
"loss": 0.0198,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 335534859.0,
|
||
|
|
"step": 404
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48416900634765625,
|
||
|
|
"epoch": 4.655172413793103,
|
||
|
|
"grad_norm": 5.14978044907013,
|
||
|
|
"learning_rate": 6.689127947292232e-07,
|
||
|
|
"loss": 0.0109,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 336338209.0,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.485504150390625,
|
||
|
|
"epoch": 4.666666666666667,
|
||
|
|
"grad_norm": 0.2866668178929426,
|
||
|
|
"learning_rate": 6.581471553089874e-07,
|
||
|
|
"loss": 0.0017,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 337177750.0,
|
||
|
|
"step": 406
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48833465576171875,
|
||
|
|
"epoch": 4.67816091954023,
|
||
|
|
"grad_norm": 2.47127991193972,
|
||
|
|
"learning_rate": 6.474557054527709e-07,
|
||
|
|
"loss": 0.0036,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 337984644.0,
|
||
|
|
"step": 407
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4751434326171875,
|
||
|
|
"epoch": 4.689655172413794,
|
||
|
|
"grad_norm": 0.8103649633355084,
|
||
|
|
"learning_rate": 6.368388758106134e-07,
|
||
|
|
"loss": 0.009,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 338842954.0,
|
||
|
|
"step": 408
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.478729248046875,
|
||
|
|
"epoch": 4.7011494252873565,
|
||
|
|
"grad_norm": 0.6095178762388435,
|
||
|
|
"learning_rate": 6.262970940268653e-07,
|
||
|
|
"loss": 0.0091,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 339691867.0,
|
||
|
|
"step": 409
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.490264892578125,
|
||
|
|
"epoch": 4.712643678160919,
|
||
|
|
"grad_norm": 0.29562950287042467,
|
||
|
|
"learning_rate": 6.158307847229594e-07,
|
||
|
|
"loss": 0.0019,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 340497554.0,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.496917724609375,
|
||
|
|
"epoch": 4.724137931034483,
|
||
|
|
"grad_norm": 1.389667795031155,
|
||
|
|
"learning_rate": 6.05440369480308e-07,
|
||
|
|
"loss": 0.0061,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 341293415.0,
|
||
|
|
"step": 411
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48715972900390625,
|
||
|
|
"epoch": 4.735632183908046,
|
||
|
|
"grad_norm": 1.2423943514661475,
|
||
|
|
"learning_rate": 5.951262668233232e-07,
|
||
|
|
"loss": 0.0087,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 342132670.0,
|
||
|
|
"step": 412
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49158477783203125,
|
||
|
|
"epoch": 4.747126436781609,
|
||
|
|
"grad_norm": 0.4338513235610179,
|
||
|
|
"learning_rate": 5.848888922025553e-07,
|
||
|
|
"loss": 0.0024,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 342954761.0,
|
||
|
|
"step": 413
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4922332763671875,
|
||
|
|
"epoch": 4.758620689655173,
|
||
|
|
"grad_norm": 0.37787068708743277,
|
||
|
|
"learning_rate": 5.747286579779607e-07,
|
||
|
|
"loss": 0.0023,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 343768247.0,
|
||
|
|
"step": 414
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48549652099609375,
|
||
|
|
"epoch": 4.7701149425287355,
|
||
|
|
"grad_norm": 2.3481180551859806,
|
||
|
|
"learning_rate": 5.646459734022938e-07,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 344601208.0,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49423980712890625,
|
||
|
|
"epoch": 4.781609195402299,
|
||
|
|
"grad_norm": 2.1462858882504197,
|
||
|
|
"learning_rate": 5.546412446046187e-07,
|
||
|
|
"loss": 0.0034,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 345394502.0,
|
||
|
|
"step": 416
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4767913818359375,
|
||
|
|
"epoch": 4.793103448275862,
|
||
|
|
"grad_norm": 0.34717798081468426,
|
||
|
|
"learning_rate": 5.447148745739522e-07,
|
||
|
|
"loss": 0.0022,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 346254704.0,
|
||
|
|
"step": 417
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48309326171875,
|
||
|
|
"epoch": 4.804597701149425,
|
||
|
|
"grad_norm": 2.574711249295488,
|
||
|
|
"learning_rate": 5.348672631430319e-07,
|
||
|
|
"loss": 0.0107,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 347076198.0,
|
||
|
|
"step": 418
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4826202392578125,
|
||
|
|
"epoch": 4.816091954022989,
|
||
|
|
"grad_norm": 0.29624777403809666,
|
||
|
|
"learning_rate": 5.250988069722096e-07,
|
||
|
|
"loss": 0.0019,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 347895786.0,
|
||
|
|
"step": 419
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48499298095703125,
|
||
|
|
"epoch": 4.827586206896552,
|
||
|
|
"grad_norm": 3.836909556315625,
|
||
|
|
"learning_rate": 5.154098995334769e-07,
|
||
|
|
"loss": 0.0127,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 348706168.0,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49582672119140625,
|
||
|
|
"epoch": 4.8390804597701145,
|
||
|
|
"grad_norm": 3.2563250713457106,
|
||
|
|
"learning_rate": 5.058009310946119e-07,
|
||
|
|
"loss": 0.0085,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 349502269.0,
|
||
|
|
"step": 421
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.476806640625,
|
||
|
|
"epoch": 4.850574712643678,
|
||
|
|
"grad_norm": 0.304724756838409,
|
||
|
|
"learning_rate": 4.962722887034616e-07,
|
||
|
|
"loss": 0.0019,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 350325332.0,
|
||
|
|
"step": 422
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47916412353515625,
|
||
|
|
"epoch": 4.862068965517241,
|
||
|
|
"grad_norm": 5.162633488189246,
|
||
|
|
"learning_rate": 4.868243561723535e-07,
|
||
|
|
"loss": 0.013,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 351159607.0,
|
||
|
|
"step": 423
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47540283203125,
|
||
|
|
"epoch": 4.873563218390805,
|
||
|
|
"grad_norm": 0.5206824035136977,
|
||
|
|
"learning_rate": 4.774575140626317e-07,
|
||
|
|
"loss": 0.0021,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 352006157.0,
|
||
|
|
"step": 424
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4759368896484375,
|
||
|
|
"epoch": 4.885057471264368,
|
||
|
|
"grad_norm": 0.3162820177895493,
|
||
|
|
"learning_rate": 4.681721396693303e-07,
|
||
|
|
"loss": 0.0019,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 352835790.0,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48349761962890625,
|
||
|
|
"epoch": 4.896551724137931,
|
||
|
|
"grad_norm": 3.5611052036772124,
|
||
|
|
"learning_rate": 4.589686070059762e-07,
|
||
|
|
"loss": 0.0027,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 353654373.0,
|
||
|
|
"step": 426
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4834136962890625,
|
||
|
|
"epoch": 4.908045977011494,
|
||
|
|
"grad_norm": 0.6544510464811919,
|
||
|
|
"learning_rate": 4.4984728678952234e-07,
|
||
|
|
"loss": 0.0021,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 354477439.0,
|
||
|
|
"step": 427
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.477691650390625,
|
||
|
|
"epoch": 4.919540229885057,
|
||
|
|
"grad_norm": 0.3747410441822833,
|
||
|
|
"learning_rate": 4.4080854642541833e-07,
|
||
|
|
"loss": 0.002,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 355302452.0,
|
||
|
|
"step": 428
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4850006103515625,
|
||
|
|
"epoch": 4.931034482758621,
|
||
|
|
"grad_norm": 0.5252478732331773,
|
||
|
|
"learning_rate": 4.318527499928074e-07,
|
||
|
|
"loss": 0.0019,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 356125422.0,
|
||
|
|
"step": 429
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47662353515625,
|
||
|
|
"epoch": 4.942528735632184,
|
||
|
|
"grad_norm": 0.29643930127778295,
|
||
|
|
"learning_rate": 4.229802582298634e-07,
|
||
|
|
"loss": 0.0017,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 356978310.0,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5031585693359375,
|
||
|
|
"epoch": 4.954022988505747,
|
||
|
|
"grad_norm": 0.6671572700465241,
|
||
|
|
"learning_rate": 4.141914285192619e-07,
|
||
|
|
"loss": 0.0018,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 357743388.0,
|
||
|
|
"step": 431
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46810150146484375,
|
||
|
|
"epoch": 4.9655172413793105,
|
||
|
|
"grad_norm": 0.24179886566109368,
|
||
|
|
"learning_rate": 4.0548661487378184e-07,
|
||
|
|
"loss": 0.0015,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 358578324.0,
|
||
|
|
"step": 432
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4846649169921875,
|
||
|
|
"epoch": 4.977011494252873,
|
||
|
|
"grad_norm": 0.9015919155281225,
|
||
|
|
"learning_rate": 3.9686616792204677e-07,
|
||
|
|
"loss": 0.0048,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 359402028.0,
|
||
|
|
"step": 433
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47179412841796875,
|
||
|
|
"epoch": 4.988505747126437,
|
||
|
|
"grad_norm": 1.9609784789544666,
|
||
|
|
"learning_rate": 3.8833043489440477e-07,
|
||
|
|
"loss": 0.0057,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 360259014.0,
|
||
|
|
"step": 434
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46057891845703125,
|
||
|
|
"epoch": 5.0,
|
||
|
|
"grad_norm": 3.135548029845806,
|
||
|
|
"learning_rate": 3.798797596089351e-07,
|
||
|
|
"loss": 0.0044,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 361140653.0,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4911651611328125,
|
||
|
|
"epoch": 5.011494252873563,
|
||
|
|
"grad_norm": 4.784507319668607,
|
||
|
|
"learning_rate": 3.715144824576078e-07,
|
||
|
|
"loss": 0.0031,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 361910005.0,
|
||
|
|
"step": 436
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45987701416015625,
|
||
|
|
"epoch": 5.022988505747127,
|
||
|
|
"grad_norm": 3.53439712252648,
|
||
|
|
"learning_rate": 3.632349403925664e-07,
|
||
|
|
"loss": 0.0084,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 362789389.0,
|
||
|
|
"step": 437
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48267364501953125,
|
||
|
|
"epoch": 5.0344827586206895,
|
||
|
|
"grad_norm": 0.23679774220430147,
|
||
|
|
"learning_rate": 3.5504146691255736e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 363594617.0,
|
||
|
|
"step": 438
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47402191162109375,
|
||
|
|
"epoch": 5.045977011494253,
|
||
|
|
"grad_norm": 3.7140900011253017,
|
||
|
|
"learning_rate": 3.469343920494986e-07,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 364431317.0,
|
||
|
|
"step": 439
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48137664794921875,
|
||
|
|
"epoch": 5.057471264367816,
|
||
|
|
"grad_norm": 0.24349208704506586,
|
||
|
|
"learning_rate": 3.389140423551834e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 365260126.0,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47275543212890625,
|
||
|
|
"epoch": 5.068965517241379,
|
||
|
|
"grad_norm": 2.9727752314913225,
|
||
|
|
"learning_rate": 3.3098074088812686e-07,
|
||
|
|
"loss": 0.0026,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 366112155.0,
|
||
|
|
"step": 441
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47698974609375,
|
||
|
|
"epoch": 5.080459770114943,
|
||
|
|
"grad_norm": 3.2479910943373254,
|
||
|
|
"learning_rate": 3.2313480720055747e-07,
|
||
|
|
"loss": 0.0036,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 366950263.0,
|
||
|
|
"step": 442
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4771575927734375,
|
||
|
|
"epoch": 5.091954022988506,
|
||
|
|
"grad_norm": 1.2691063578241986,
|
||
|
|
"learning_rate": 3.153765573255377e-07,
|
||
|
|
"loss": 0.0019,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 367776644.0,
|
||
|
|
"step": 443
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.483184814453125,
|
||
|
|
"epoch": 5.103448275862069,
|
||
|
|
"grad_norm": 2.8124807980483384,
|
||
|
|
"learning_rate": 3.0770630376424276e-07,
|
||
|
|
"loss": 0.0043,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 368596908.0,
|
||
|
|
"step": 444
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4910125732421875,
|
||
|
|
"epoch": 5.114942528735632,
|
||
|
|
"grad_norm": 3.891577262381854,
|
||
|
|
"learning_rate": 3.0012435547336737e-07,
|
||
|
|
"loss": 0.0026,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 369390244.0,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.475433349609375,
|
||
|
|
"epoch": 5.126436781609195,
|
||
|
|
"grad_norm": 0.9801102432206495,
|
||
|
|
"learning_rate": 2.9263101785268253e-07,
|
||
|
|
"loss": 0.0021,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 370237002.0,
|
||
|
|
"step": 446
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4729156494140625,
|
||
|
|
"epoch": 5.137931034482759,
|
||
|
|
"grad_norm": 0.46368160142667864,
|
||
|
|
"learning_rate": 2.8522659273273606e-07,
|
||
|
|
"loss": 0.0017,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 371089064.0,
|
||
|
|
"step": 447
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4679107666015625,
|
||
|
|
"epoch": 5.149425287356322,
|
||
|
|
"grad_norm": 0.4466182713447019,
|
||
|
|
"learning_rate": 2.779113783626916e-07,
|
||
|
|
"loss": 0.0018,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 371950407.0,
|
||
|
|
"step": 448
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4793701171875,
|
||
|
|
"epoch": 5.160919540229885,
|
||
|
|
"grad_norm": 0.24997338132137475,
|
||
|
|
"learning_rate": 2.7068566939831646e-07,
|
||
|
|
"loss": 0.0015,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 372769217.0,
|
||
|
|
"step": 449
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.468536376953125,
|
||
|
|
"epoch": 5.172413793103448,
|
||
|
|
"grad_norm": 5.078234022478168,
|
||
|
|
"learning_rate": 2.6354975689011576e-07,
|
||
|
|
"loss": 0.0043,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 373597313.0,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4726409912109375,
|
||
|
|
"epoch": 5.183908045977011,
|
||
|
|
"grad_norm": 0.23098046397612482,
|
||
|
|
"learning_rate": 2.5650392827160446e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 374442696.0,
|
||
|
|
"step": 451
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4760284423828125,
|
||
|
|
"epoch": 5.195402298850575,
|
||
|
|
"grad_norm": 0.2444782213662235,
|
||
|
|
"learning_rate": 2.4954846734773054e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 375274472.0,
|
||
|
|
"step": 452
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4780120849609375,
|
||
|
|
"epoch": 5.206896551724138,
|
||
|
|
"grad_norm": 0.23065508145988362,
|
||
|
|
"learning_rate": 2.4268365428344737e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 376082294.0,
|
||
|
|
"step": 453
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4742889404296875,
|
||
|
|
"epoch": 5.218390804597701,
|
||
|
|
"grad_norm": 0.22841771210591458,
|
||
|
|
"learning_rate": 2.3590976559242278e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 376913132.0,
|
||
|
|
"step": 454
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.484130859375,
|
||
|
|
"epoch": 5.2298850574712645,
|
||
|
|
"grad_norm": 0.2312717975757419,
|
||
|
|
"learning_rate": 2.29227074125907e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 377719614.0,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4854583740234375,
|
||
|
|
"epoch": 5.241379310344827,
|
||
|
|
"grad_norm": 2.7996410031164864,
|
||
|
|
"learning_rate": 2.2263584906173723e-07,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 378523136.0,
|
||
|
|
"step": 456
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47391510009765625,
|
||
|
|
"epoch": 5.252873563218391,
|
||
|
|
"grad_norm": 0.22824235020101177,
|
||
|
|
"learning_rate": 2.1613635589349756e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 379375322.0,
|
||
|
|
"step": 457
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4659881591796875,
|
||
|
|
"epoch": 5.264367816091954,
|
||
|
|
"grad_norm": 3.476542826635523,
|
||
|
|
"learning_rate": 2.0972885641982605e-07,
|
||
|
|
"loss": 0.0086,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 380214107.0,
|
||
|
|
"step": 458
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4750518798828125,
|
||
|
|
"epoch": 5.275862068965517,
|
||
|
|
"grad_norm": 4.380500664778174,
|
||
|
|
"learning_rate": 2.0341360873386673e-07,
|
||
|
|
"loss": 0.005,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 381038853.0,
|
||
|
|
"step": 459
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46482086181640625,
|
||
|
|
"epoch": 5.287356321839081,
|
||
|
|
"grad_norm": 2.015274773802452,
|
||
|
|
"learning_rate": 1.97190867212875e-07,
|
||
|
|
"loss": 0.0083,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 381914412.0,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47858428955078125,
|
||
|
|
"epoch": 5.2988505747126435,
|
||
|
|
"grad_norm": 0.22456749005464405,
|
||
|
|
"learning_rate": 1.9106088250797266e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 382734720.0,
|
||
|
|
"step": 461
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47489166259765625,
|
||
|
|
"epoch": 5.310344827586207,
|
||
|
|
"grad_norm": 0.22731724902580622,
|
||
|
|
"learning_rate": 1.8502390153404936e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 383560929.0,
|
||
|
|
"step": 462
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47196197509765625,
|
||
|
|
"epoch": 5.32183908045977,
|
||
|
|
"grad_norm": 4.740213779864106,
|
||
|
|
"learning_rate": 1.790801674598186e-07,
|
||
|
|
"loss": 0.0075,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 384389530.0,
|
||
|
|
"step": 463
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47057342529296875,
|
||
|
|
"epoch": 5.333333333333333,
|
||
|
|
"grad_norm": 0.2258431066367018,
|
||
|
|
"learning_rate": 1.732299196980225e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 385233840.0,
|
||
|
|
"step": 464
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47980499267578125,
|
||
|
|
"epoch": 5.344827586206897,
|
||
|
|
"grad_norm": 1.4221626617705267,
|
||
|
|
"learning_rate": 1.6747339389578732e-07,
|
||
|
|
"loss": 0.0044,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 386042787.0,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47692108154296875,
|
||
|
|
"epoch": 5.35632183908046,
|
||
|
|
"grad_norm": 4.521214139616767,
|
||
|
|
"learning_rate": 1.6181082192513352e-07,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 386861845.0,
|
||
|
|
"step": 466
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.482086181640625,
|
||
|
|
"epoch": 5.3678160919540225,
|
||
|
|
"grad_norm": 3.3231820358196895,
|
||
|
|
"learning_rate": 1.5624243187363442e-07,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 387661163.0,
|
||
|
|
"step": 467
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48667144775390625,
|
||
|
|
"epoch": 5.379310344827586,
|
||
|
|
"grad_norm": 0.2304870362722343,
|
||
|
|
"learning_rate": 1.507684480352292e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 388485432.0,
|
||
|
|
"step": 468
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4650726318359375,
|
||
|
|
"epoch": 5.390804597701149,
|
||
|
|
"grad_norm": 0.23256845046208066,
|
||
|
|
"learning_rate": 1.4538909090118846e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 389334143.0,
|
||
|
|
"step": 469
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48242950439453125,
|
||
|
|
"epoch": 5.402298850574713,
|
||
|
|
"grad_norm": 0.2585670975063039,
|
||
|
|
"learning_rate": 1.4010457715123355e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 390131143.0,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.492431640625,
|
||
|
|
"epoch": 5.413793103448276,
|
||
|
|
"grad_norm": 0.2528521501053522,
|
||
|
|
"learning_rate": 1.3491511964480703e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 390931756.0,
|
||
|
|
"step": 471
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4763031005859375,
|
||
|
|
"epoch": 5.425287356321839,
|
||
|
|
"grad_norm": 1.4739001845694437,
|
||
|
|
"learning_rate": 1.2982092741250145e-07,
|
||
|
|
"loss": 0.0019,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 391752298.0,
|
||
|
|
"step": 472
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4845428466796875,
|
||
|
|
"epoch": 5.436781609195402,
|
||
|
|
"grad_norm": 0.26445355496340167,
|
||
|
|
"learning_rate": 1.2482220564763669e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 392545229.0,
|
||
|
|
"step": 473
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4782867431640625,
|
||
|
|
"epoch": 5.448275862068965,
|
||
|
|
"grad_norm": 0.9542333022796412,
|
||
|
|
"learning_rate": 1.1991915569799645e-07,
|
||
|
|
"loss": 0.0097,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 393380338.0,
|
||
|
|
"step": 474
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.476104736328125,
|
||
|
|
"epoch": 5.459770114942529,
|
||
|
|
"grad_norm": 0.26357178354088867,
|
||
|
|
"learning_rate": 1.1511197505771843e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 394226931.0,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48026275634765625,
|
||
|
|
"epoch": 5.471264367816092,
|
||
|
|
"grad_norm": 0.4080153444923309,
|
||
|
|
"learning_rate": 1.1040085735933681e-07,
|
||
|
|
"loss": 0.0015,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 395032835.0,
|
||
|
|
"step": 476
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46416473388671875,
|
||
|
|
"epoch": 5.482758620689655,
|
||
|
|
"grad_norm": 0.6081932699825013,
|
||
|
|
"learning_rate": 1.0578599236598708e-07,
|
||
|
|
"loss": 0.0016,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 395872556.0,
|
||
|
|
"step": 477
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47927093505859375,
|
||
|
|
"epoch": 5.494252873563219,
|
||
|
|
"grad_norm": 0.2756351529387358,
|
||
|
|
"learning_rate": 1.0126756596375687e-07,
|
||
|
|
"loss": 0.0015,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 396695404.0,
|
||
|
|
"step": 478
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46880340576171875,
|
||
|
|
"epoch": 5.505747126436781,
|
||
|
|
"grad_norm": 1.7900528624209657,
|
||
|
|
"learning_rate": 9.684576015420277e-08,
|
||
|
|
"loss": 0.0018,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 397548893.0,
|
||
|
|
"step": 479
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47908782958984375,
|
||
|
|
"epoch": 5.517241379310345,
|
||
|
|
"grad_norm": 0.3007803080522495,
|
||
|
|
"learning_rate": 9.252075304701929e-08,
|
||
|
|
"loss": 0.0015,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 398357809.0,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.472137451171875,
|
||
|
|
"epoch": 5.528735632183908,
|
||
|
|
"grad_norm": 0.2546056245953954,
|
||
|
|
"learning_rate": 8.829271885286095e-08,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 399193393.0,
|
||
|
|
"step": 481
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4811248779296875,
|
||
|
|
"epoch": 5.540229885057471,
|
||
|
|
"grad_norm": 0.24009342643072343,
|
||
|
|
"learning_rate": 8.416182787632871e-08,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 399990785.0,
|
||
|
|
"step": 482
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4644927978515625,
|
||
|
|
"epoch": 5.551724137931035,
|
||
|
|
"grad_norm": 0.23317450891721894,
|
||
|
|
"learning_rate": 8.012824650910938e-08,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 400845157.0,
|
||
|
|
"step": 483
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47351837158203125,
|
||
|
|
"epoch": 5.563218390804598,
|
||
|
|
"grad_norm": 2.1147010615882196,
|
||
|
|
"learning_rate": 7.619213722327184e-08,
|
||
|
|
"loss": 0.0089,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 401679005.0,
|
||
|
|
"step": 484
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.468231201171875,
|
||
|
|
"epoch": 5.574712643678161,
|
||
|
|
"grad_norm": 0.24668197764885869,
|
||
|
|
"learning_rate": 7.235365856472443e-08,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 402520327.0,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4737548828125,
|
||
|
|
"epoch": 5.586206896551724,
|
||
|
|
"grad_norm": 0.24623472017693857,
|
||
|
|
"learning_rate": 6.86129651468273e-08,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 403342914.0,
|
||
|
|
"step": 486
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46429443359375,
|
||
|
|
"epoch": 5.597701149425287,
|
||
|
|
"grad_norm": 1.0601206333936144,
|
||
|
|
"learning_rate": 6.497020764416633e-08,
|
||
|
|
"loss": 0.002,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 404200783.0,
|
||
|
|
"step": 487
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46352386474609375,
|
||
|
|
"epoch": 5.609195402298851,
|
||
|
|
"grad_norm": 0.22699217170268132,
|
||
|
|
"learning_rate": 6.142553278648239e-08,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 405062916.0,
|
||
|
|
"step": 488
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.483367919921875,
|
||
|
|
"epoch": 5.620689655172414,
|
||
|
|
"grad_norm": 0.23077255806225605,
|
||
|
|
"learning_rate": 5.7979083352762146e-08,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 405902942.0,
|
||
|
|
"step": 489
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49231719970703125,
|
||
|
|
"epoch": 5.6321839080459775,
|
||
|
|
"grad_norm": 0.23144381904668673,
|
||
|
|
"learning_rate": 5.463099816548578e-08,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 406710949.0,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46439361572265625,
|
||
|
|
"epoch": 5.64367816091954,
|
||
|
|
"grad_norm": 2.3498455147538175,
|
||
|
|
"learning_rate": 5.1381412085036994e-08,
|
||
|
|
"loss": 0.0082,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 407554838.0,
|
||
|
|
"step": 491
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48107147216796875,
|
||
|
|
"epoch": 5.655172413793103,
|
||
|
|
"grad_norm": 0.22765796262677274,
|
||
|
|
"learning_rate": 4.823045600426901e-08,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 408367291.0,
|
||
|
|
"step": 492
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4683685302734375,
|
||
|
|
"epoch": 5.666666666666667,
|
||
|
|
"grad_norm": 0.23126276882370067,
|
||
|
|
"learning_rate": 4.5178256843233235e-08,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 409212322.0,
|
||
|
|
"step": 493
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.470672607421875,
|
||
|
|
"epoch": 5.67816091954023,
|
||
|
|
"grad_norm": 0.22570374435713633,
|
||
|
|
"learning_rate": 4.2224937544067254e-08,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 410059887.0,
|
||
|
|
"step": 494
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47655487060546875,
|
||
|
|
"epoch": 5.689655172413794,
|
||
|
|
"grad_norm": 0.2395343469118418,
|
||
|
|
"learning_rate": 3.9370617066040726e-08,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 410882734.0,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4618377685546875,
|
||
|
|
"epoch": 5.7011494252873565,
|
||
|
|
"grad_norm": 0.22662430438163453,
|
||
|
|
"learning_rate": 3.661541038076755e-08,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 411766798.0,
|
||
|
|
"step": 496
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4715728759765625,
|
||
|
|
"epoch": 5.712643678160919,
|
||
|
|
"grad_norm": 0.22608240200051918,
|
||
|
|
"learning_rate": 3.395942846757067e-08,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 412620860.0,
|
||
|
|
"step": 497
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4783782958984375,
|
||
|
|
"epoch": 5.724137931034483,
|
||
|
|
"grad_norm": 0.23276920955328978,
|
||
|
|
"learning_rate": 3.1402778309014284e-08,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 413434227.0,
|
||
|
|
"step": 498
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47417449951171875,
|
||
|
|
"epoch": 5.735632183908046,
|
||
|
|
"grad_norm": 0.22647336925027828,
|
||
|
|
"learning_rate": 2.8945562886593948e-08,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 414252581.0,
|
||
|
|
"step": 499
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47452545166015625,
|
||
|
|
"epoch": 5.747126436781609,
|
||
|
|
"grad_norm": 0.22490185870003843,
|
||
|
|
"learning_rate": 2.6587881176588782e-08,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 415068424.0,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48020172119140625,
|
||
|
|
"epoch": 5.758620689655173,
|
||
|
|
"grad_norm": 0.23110370834284083,
|
||
|
|
"learning_rate": 2.4329828146074096e-08,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 415902990.0,
|
||
|
|
"step": 501
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47350311279296875,
|
||
|
|
"epoch": 5.7701149425287355,
|
||
|
|
"grad_norm": 2.839859264372696,
|
||
|
|
"learning_rate": 2.2171494749097243e-08,
|
||
|
|
"loss": 0.0074,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 416757511.0,
|
||
|
|
"step": 502
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46640777587890625,
|
||
|
|
"epoch": 5.781609195402299,
|
||
|
|
"grad_norm": 1.6462241133899467,
|
||
|
|
"learning_rate": 2.011296792301165e-08,
|
||
|
|
"loss": 0.0017,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 417604658.0,
|
||
|
|
"step": 503
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46356964111328125,
|
||
|
|
"epoch": 5.793103448275862,
|
||
|
|
"grad_norm": 0.23650139037887488,
|
||
|
|
"learning_rate": 1.8154330584978785e-08,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 418461199.0,
|
||
|
|
"step": 504
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45654296875,
|
||
|
|
"epoch": 5.804597701149425,
|
||
|
|
"grad_norm": 1.0619531610176047,
|
||
|
|
"learning_rate": 1.629566162862445e-08,
|
||
|
|
"loss": 0.0017,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 419328853.0,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.473480224609375,
|
||
|
|
"epoch": 5.816091954022989,
|
||
|
|
"grad_norm": 0.22868022966207807,
|
||
|
|
"learning_rate": 1.453703592086353e-08,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 420144761.0,
|
||
|
|
"step": 506
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46999359130859375,
|
||
|
|
"epoch": 5.827586206896552,
|
||
|
|
"grad_norm": 2.547490956818682,
|
||
|
|
"learning_rate": 1.28785242988827e-08,
|
||
|
|
"loss": 0.002,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 420982312.0,
|
||
|
|
"step": 507
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46546173095703125,
|
||
|
|
"epoch": 5.8390804597701145,
|
||
|
|
"grad_norm": 6.379105642099488,
|
||
|
|
"learning_rate": 1.132019356728853e-08,
|
||
|
|
"loss": 0.0044,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 421827384.0,
|
||
|
|
"step": 508
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47126007080078125,
|
||
|
|
"epoch": 5.850574712643678,
|
||
|
|
"grad_norm": 1.8118443829201367,
|
||
|
|
"learning_rate": 9.862106495415469e-09,
|
||
|
|
"loss": 0.005,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 422668841.0,
|
||
|
|
"step": 509
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46734619140625,
|
||
|
|
"epoch": 5.862068965517241,
|
||
|
|
"grad_norm": 0.22332659835801152,
|
||
|
|
"learning_rate": 8.504321814798433e-09,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 423516715.0,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4660186767578125,
|
||
|
|
"epoch": 5.873563218390805,
|
||
|
|
"grad_norm": 0.2260995469649812,
|
||
|
|
"learning_rate": 7.246894216806355e-09,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 424340682.0,
|
||
|
|
"step": 511
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4914398193359375,
|
||
|
|
"epoch": 5.885057471264368,
|
||
|
|
"grad_norm": 0.22784964173297753,
|
||
|
|
"learning_rate": 6.089874350439507e-09,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 425132197.0,
|
||
|
|
"step": 512
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4688720703125,
|
||
|
|
"epoch": 5.896551724137931,
|
||
|
|
"grad_norm": 0.22366761094391158,
|
||
|
|
"learning_rate": 5.033308820289185e-09,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 425969675.0,
|
||
|
|
"step": 513
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.475860595703125,
|
||
|
|
"epoch": 5.908045977011494,
|
||
|
|
"grad_norm": 0.22732344127849993,
|
||
|
|
"learning_rate": 4.07724018466088e-09,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 426790240.0,
|
||
|
|
"step": 514
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.465087890625,
|
||
|
|
"epoch": 5.919540229885057,
|
||
|
|
"grad_norm": 0.26543552224647043,
|
||
|
|
"learning_rate": 3.2217069538600932e-09,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 427651071.0,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47153472900390625,
|
||
|
|
"epoch": 5.931034482758621,
|
||
|
|
"grad_norm": 0.2295916592336764,
|
||
|
|
"learning_rate": 2.4667435886402414e-09,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 428470168.0,
|
||
|
|
"step": 516
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46768951416015625,
|
||
|
|
"epoch": 5.942528735632184,
|
||
|
|
"grad_norm": 0.2277843849699895,
|
||
|
|
"learning_rate": 1.8123804988159909e-09,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 429292956.0,
|
||
|
|
"step": 517
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48101043701171875,
|
||
|
|
"epoch": 5.954022988505747,
|
||
|
|
"grad_norm": 0.22645001219989788,
|
||
|
|
"learning_rate": 1.2586440420372936e-09,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 430114842.0,
|
||
|
|
"step": 518
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.475311279296875,
|
||
|
|
"epoch": 5.9655172413793105,
|
||
|
|
"grad_norm": 0.22916157689876235,
|
||
|
|
"learning_rate": 8.0555652272718e-10,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 430955923.0,
|
||
|
|
"step": 519
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4584197998046875,
|
||
|
|
"epoch": 5.977011494252873,
|
||
|
|
"grad_norm": 0.22928233995534253,
|
||
|
|
"learning_rate": 4.5313619118553256e-10,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 431823368.0,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47629547119140625,
|
||
|
|
"epoch": 5.988505747126437,
|
||
|
|
"grad_norm": 0.23437557579169707,
|
||
|
|
"learning_rate": 2.0139724285161976e-10,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 432631185.0,
|
||
|
|
"step": 521
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47571563720703125,
|
||
|
|
"epoch": 6.0,
|
||
|
|
"grad_norm": 0.41607559776285885,
|
||
|
|
"learning_rate": 5.0349817733719165e-11,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 433455617.0,
|
||
|
|
"step": 522
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.0,
|
||
|
|
"step": 522,
|
||
|
|
"total_flos": 509990855180288.0,
|
||
|
|
"train_loss": 0.558762426631948,
|
||
|
|
"train_runtime": 70510.0264,
|
||
|
|
"train_samples_per_second": 3.489,
|
||
|
|
"train_steps_per_second": 0.007
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 1,
|
||
|
|
"max_steps": 522,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 6,
|
||
|
|
"save_steps": 44,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 509990855180288.0,
|
||
|
|
"train_batch_size": 1,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|