Model: Hyeongwon/P9-split3_only_answer_Qwen3-4B-Base_0402-01-5e-6 Source: Original Platform
5324 lines
149 KiB
JSON
5324 lines
149 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 6.0,
|
|
"eval_steps": 500,
|
|
"global_step": 528,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 0.5571823120117188,
|
|
"epoch": 0.011363636363636364,
|
|
"grad_norm": 385.21640368234523,
|
|
"learning_rate": 0.0,
|
|
"loss": 8.3268,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 822388.0,
|
|
"step": 1
|
|
},
|
|
{
|
|
"entropy": 0.5536270141601562,
|
|
"epoch": 0.022727272727272728,
|
|
"grad_norm": 384.42809469067583,
|
|
"learning_rate": 1.8518518518518518e-07,
|
|
"loss": 8.3143,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 1663780.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"entropy": 0.5437393188476562,
|
|
"epoch": 0.03409090909090909,
|
|
"grad_norm": 381.67155444176905,
|
|
"learning_rate": 3.7037037037037036e-07,
|
|
"loss": 8.3398,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 2509387.0,
|
|
"step": 3
|
|
},
|
|
{
|
|
"entropy": 0.565765380859375,
|
|
"epoch": 0.045454545454545456,
|
|
"grad_norm": 393.0428633816126,
|
|
"learning_rate": 5.555555555555555e-07,
|
|
"loss": 8.2353,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 3309476.0,
|
|
"step": 4
|
|
},
|
|
{
|
|
"entropy": 0.569061279296875,
|
|
"epoch": 0.056818181818181816,
|
|
"grad_norm": 393.9524081345757,
|
|
"learning_rate": 7.407407407407407e-07,
|
|
"loss": 8.0805,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 4091951.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 0.5697250366210938,
|
|
"epoch": 0.06818181818181818,
|
|
"grad_norm": 393.9329273505747,
|
|
"learning_rate": 9.259259259259259e-07,
|
|
"loss": 8.0282,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 4914273.0,
|
|
"step": 6
|
|
},
|
|
{
|
|
"entropy": 0.5641937255859375,
|
|
"epoch": 0.07954545454545454,
|
|
"grad_norm": 395.153279708828,
|
|
"learning_rate": 1.111111111111111e-06,
|
|
"loss": 7.4123,
|
|
"mean_token_accuracy": 0.0,
|
|
"num_tokens": 5731174.0,
|
|
"step": 7
|
|
},
|
|
{
|
|
"entropy": 0.5568161010742188,
|
|
"epoch": 0.09090909090909091,
|
|
"grad_norm": 268.0751377771076,
|
|
"learning_rate": 1.2962962962962962e-06,
|
|
"loss": 5.8473,
|
|
"mean_token_accuracy": 0.0013020833721384406,
|
|
"num_tokens": 6548593.0,
|
|
"step": 8
|
|
},
|
|
{
|
|
"entropy": 0.5587234497070312,
|
|
"epoch": 0.10227272727272728,
|
|
"grad_norm": 224.3841134938914,
|
|
"learning_rate": 1.4814814814814815e-06,
|
|
"loss": 5.5565,
|
|
"mean_token_accuracy": 0.0026041667442768812,
|
|
"num_tokens": 7368193.0,
|
|
"step": 9
|
|
},
|
|
{
|
|
"entropy": 0.5561370849609375,
|
|
"epoch": 0.11363636363636363,
|
|
"grad_norm": 189.00265530913885,
|
|
"learning_rate": 1.6666666666666667e-06,
|
|
"loss": 5.2765,
|
|
"mean_token_accuracy": 0.02343750069849193,
|
|
"num_tokens": 8218117.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 0.5667953491210938,
|
|
"epoch": 0.125,
|
|
"grad_norm": 103.65546608546263,
|
|
"learning_rate": 1.8518518518518519e-06,
|
|
"loss": 4.1131,
|
|
"mean_token_accuracy": 0.5013020982732996,
|
|
"num_tokens": 9033810.0,
|
|
"step": 11
|
|
},
|
|
{
|
|
"entropy": 0.5578842163085938,
|
|
"epoch": 0.13636363636363635,
|
|
"grad_norm": 96.59175301418695,
|
|
"learning_rate": 2.037037037037037e-06,
|
|
"loss": 4.0319,
|
|
"mean_token_accuracy": 0.5195312654832378,
|
|
"num_tokens": 9874463.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"entropy": 0.5625457763671875,
|
|
"epoch": 0.14772727272727273,
|
|
"grad_norm": 82.36322891422334,
|
|
"learning_rate": 2.222222222222222e-06,
|
|
"loss": 3.8226,
|
|
"mean_token_accuracy": 0.5182291821110994,
|
|
"num_tokens": 10695671.0,
|
|
"step": 13
|
|
},
|
|
{
|
|
"entropy": 0.5607070922851562,
|
|
"epoch": 0.1590909090909091,
|
|
"grad_norm": 74.48472875071353,
|
|
"learning_rate": 2.4074074074074075e-06,
|
|
"loss": 3.7081,
|
|
"mean_token_accuracy": 0.5078125151339918,
|
|
"num_tokens": 11502999.0,
|
|
"step": 14
|
|
},
|
|
{
|
|
"entropy": 0.544952392578125,
|
|
"epoch": 0.17045454545454544,
|
|
"grad_norm": 59.80968159915808,
|
|
"learning_rate": 2.5925925925925925e-06,
|
|
"loss": 3.269,
|
|
"mean_token_accuracy": 0.514322931994684,
|
|
"num_tokens": 12334457.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 0.5462646484375,
|
|
"epoch": 0.18181818181818182,
|
|
"grad_norm": 58.86687930413585,
|
|
"learning_rate": 2.7777777777777783e-06,
|
|
"loss": 3.1993,
|
|
"mean_token_accuracy": 0.5312500158324838,
|
|
"num_tokens": 13183738.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"entropy": 0.556304931640625,
|
|
"epoch": 0.19318181818181818,
|
|
"grad_norm": 57.620691853575465,
|
|
"learning_rate": 2.962962962962963e-06,
|
|
"loss": 3.1523,
|
|
"mean_token_accuracy": 0.5429687661817297,
|
|
"num_tokens": 14017046.0,
|
|
"step": 17
|
|
},
|
|
{
|
|
"entropy": 0.5455474853515625,
|
|
"epoch": 0.20454545454545456,
|
|
"grad_norm": 57.480386770069764,
|
|
"learning_rate": 3.1481481481481483e-06,
|
|
"loss": 3.0979,
|
|
"mean_token_accuracy": 0.5234375155996531,
|
|
"num_tokens": 14864077.0,
|
|
"step": 18
|
|
},
|
|
{
|
|
"entropy": 0.5444412231445312,
|
|
"epoch": 0.2159090909090909,
|
|
"grad_norm": 57.75046967405615,
|
|
"learning_rate": 3.3333333333333333e-06,
|
|
"loss": 3.0502,
|
|
"mean_token_accuracy": 0.5299479324603453,
|
|
"num_tokens": 15703491.0,
|
|
"step": 19
|
|
},
|
|
{
|
|
"entropy": 0.54150390625,
|
|
"epoch": 0.22727272727272727,
|
|
"grad_norm": 58.06698268429491,
|
|
"learning_rate": 3.5185185185185187e-06,
|
|
"loss": 2.9672,
|
|
"mean_token_accuracy": 0.5299479324603453,
|
|
"num_tokens": 16547715.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 0.532958984375,
|
|
"epoch": 0.23863636363636365,
|
|
"grad_norm": 57.73279218676568,
|
|
"learning_rate": 3.7037037037037037e-06,
|
|
"loss": 2.9237,
|
|
"mean_token_accuracy": 0.5481770996702835,
|
|
"num_tokens": 17398906.0,
|
|
"step": 21
|
|
},
|
|
{
|
|
"entropy": 0.5381011962890625,
|
|
"epoch": 0.25,
|
|
"grad_norm": 58.76356982867235,
|
|
"learning_rate": 3.88888888888889e-06,
|
|
"loss": 2.9056,
|
|
"mean_token_accuracy": 0.5520833497866988,
|
|
"num_tokens": 18233243.0,
|
|
"step": 22
|
|
},
|
|
{
|
|
"entropy": 0.5452651977539062,
|
|
"epoch": 0.26136363636363635,
|
|
"grad_norm": 60.813397297916524,
|
|
"learning_rate": 4.074074074074074e-06,
|
|
"loss": 2.8986,
|
|
"mean_token_accuracy": 0.5520833497866988,
|
|
"num_tokens": 19088014.0,
|
|
"step": 23
|
|
},
|
|
{
|
|
"entropy": 0.535552978515625,
|
|
"epoch": 0.2727272727272727,
|
|
"grad_norm": 58.575483868055635,
|
|
"learning_rate": 4.2592592592592596e-06,
|
|
"loss": 2.8673,
|
|
"mean_token_accuracy": 0.5442708495538682,
|
|
"num_tokens": 19910897.0,
|
|
"step": 24
|
|
},
|
|
{
|
|
"entropy": 0.551544189453125,
|
|
"epoch": 0.2840909090909091,
|
|
"grad_norm": 58.059741108606026,
|
|
"learning_rate": 4.444444444444444e-06,
|
|
"loss": 2.8368,
|
|
"mean_token_accuracy": 0.5416666828095913,
|
|
"num_tokens": 20704060.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 0.5462570190429688,
|
|
"epoch": 0.29545454545454547,
|
|
"grad_norm": 57.74145163004164,
|
|
"learning_rate": 4.62962962962963e-06,
|
|
"loss": 2.8138,
|
|
"mean_token_accuracy": 0.5559895999031141,
|
|
"num_tokens": 21502809.0,
|
|
"step": 26
|
|
},
|
|
{
|
|
"entropy": 0.535614013671875,
|
|
"epoch": 0.3068181818181818,
|
|
"grad_norm": 57.087826374867525,
|
|
"learning_rate": 4.814814814814815e-06,
|
|
"loss": 2.7772,
|
|
"mean_token_accuracy": 0.5546875165309757,
|
|
"num_tokens": 22341880.0,
|
|
"step": 27
|
|
},
|
|
{
|
|
"entropy": 0.5427932739257812,
|
|
"epoch": 0.3181818181818182,
|
|
"grad_norm": 57.19764036404579,
|
|
"learning_rate": 5e-06,
|
|
"loss": 2.7402,
|
|
"mean_token_accuracy": 0.5625000167638063,
|
|
"num_tokens": 23163457.0,
|
|
"step": 28
|
|
},
|
|
{
|
|
"entropy": 0.5347366333007812,
|
|
"epoch": 0.32954545454545453,
|
|
"grad_norm": 57.175262078414924,
|
|
"learning_rate": 4.999950848940538e-06,
|
|
"loss": 2.7043,
|
|
"mean_token_accuracy": 0.5598958500195295,
|
|
"num_tokens": 24015452.0,
|
|
"step": 29
|
|
},
|
|
{
|
|
"entropy": 0.5446624755859375,
|
|
"epoch": 0.3409090909090909,
|
|
"grad_norm": 58.34082962604848,
|
|
"learning_rate": 4.999803397694811e-06,
|
|
"loss": 2.6615,
|
|
"mean_token_accuracy": 0.5807291839737445,
|
|
"num_tokens": 24837556.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 0.5349578857421875,
|
|
"epoch": 0.3522727272727273,
|
|
"grad_norm": 59.10124128246407,
|
|
"learning_rate": 4.999557652060729e-06,
|
|
"loss": 2.6609,
|
|
"mean_token_accuracy": 0.5494791830424219,
|
|
"num_tokens": 25669799.0,
|
|
"step": 31
|
|
},
|
|
{
|
|
"entropy": 0.5413665771484375,
|
|
"epoch": 0.36363636363636365,
|
|
"grad_norm": 58.793667611594415,
|
|
"learning_rate": 4.9992136217012184e-06,
|
|
"loss": 2.6378,
|
|
"mean_token_accuracy": 0.5533854331588373,
|
|
"num_tokens": 26474626.0,
|
|
"step": 32
|
|
},
|
|
{
|
|
"entropy": 0.5285873413085938,
|
|
"epoch": 0.375,
|
|
"grad_norm": 58.352494787635415,
|
|
"learning_rate": 4.998771320143843e-06,
|
|
"loss": 2.5815,
|
|
"mean_token_accuracy": 0.5755208504851907,
|
|
"num_tokens": 27323183.0,
|
|
"step": 33
|
|
},
|
|
{
|
|
"entropy": 0.5407485961914062,
|
|
"epoch": 0.38636363636363635,
|
|
"grad_norm": 59.10192767158911,
|
|
"learning_rate": 4.998230764780277e-06,
|
|
"loss": 2.5841,
|
|
"mean_token_accuracy": 0.5559895999031141,
|
|
"num_tokens": 28160848.0,
|
|
"step": 34
|
|
},
|
|
{
|
|
"entropy": 0.5376205444335938,
|
|
"epoch": 0.3977272727272727,
|
|
"grad_norm": 58.51220654461849,
|
|
"learning_rate": 4.9975919768656125e-06,
|
|
"loss": 2.5609,
|
|
"mean_token_accuracy": 0.5520833497866988,
|
|
"num_tokens": 28984769.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 0.5457916259765625,
|
|
"epoch": 0.4090909090909091,
|
|
"grad_norm": 58.821677282544606,
|
|
"learning_rate": 4.996854981517535e-06,
|
|
"loss": 2.5264,
|
|
"mean_token_accuracy": 0.5690104336244985,
|
|
"num_tokens": 29793313.0,
|
|
"step": 36
|
|
},
|
|
{
|
|
"entropy": 0.5445785522460938,
|
|
"epoch": 0.42045454545454547,
|
|
"grad_norm": 58.75541166163536,
|
|
"learning_rate": 4.996019807715324e-06,
|
|
"loss": 2.483,
|
|
"mean_token_accuracy": 0.582031267345883,
|
|
"num_tokens": 30612686.0,
|
|
"step": 37
|
|
},
|
|
{
|
|
"entropy": 0.5450210571289062,
|
|
"epoch": 0.4318181818181818,
|
|
"grad_norm": 59.079272782778474,
|
|
"learning_rate": 4.995086488298723e-06,
|
|
"loss": 2.4537,
|
|
"mean_token_accuracy": 0.5781250172294676,
|
|
"num_tokens": 31438404.0,
|
|
"step": 38
|
|
},
|
|
{
|
|
"entropy": 0.5361175537109375,
|
|
"epoch": 0.4431818181818182,
|
|
"grad_norm": 59.24200293608633,
|
|
"learning_rate": 4.994055059966641e-06,
|
|
"loss": 2.449,
|
|
"mean_token_accuracy": 0.5651041835080832,
|
|
"num_tokens": 32287533.0,
|
|
"step": 39
|
|
},
|
|
{
|
|
"entropy": 0.5356903076171875,
|
|
"epoch": 0.45454545454545453,
|
|
"grad_norm": 59.44643683264253,
|
|
"learning_rate": 4.992925563275714e-06,
|
|
"loss": 2.4143,
|
|
"mean_token_accuracy": 0.5716146003687754,
|
|
"num_tokens": 33108278.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 0.5239410400390625,
|
|
"epoch": 0.4659090909090909,
|
|
"grad_norm": 59.81603078314444,
|
|
"learning_rate": 4.991698042638711e-06,
|
|
"loss": 2.395,
|
|
"mean_token_accuracy": 0.5742187671130523,
|
|
"num_tokens": 33976204.0,
|
|
"step": 41
|
|
},
|
|
{
|
|
"entropy": 0.5448150634765625,
|
|
"epoch": 0.4772727272727273,
|
|
"grad_norm": 59.85712972440978,
|
|
"learning_rate": 4.990372546322782e-06,
|
|
"loss": 2.3735,
|
|
"mean_token_accuracy": 0.5742187671130523,
|
|
"num_tokens": 34776355.0,
|
|
"step": 42
|
|
},
|
|
{
|
|
"entropy": 0.536407470703125,
|
|
"epoch": 0.48863636363636365,
|
|
"grad_norm": 59.778479025488444,
|
|
"learning_rate": 4.988949126447567e-06,
|
|
"loss": 2.3306,
|
|
"mean_token_accuracy": 0.5846354340901598,
|
|
"num_tokens": 35613147.0,
|
|
"step": 43
|
|
},
|
|
{
|
|
"entropy": 0.539764404296875,
|
|
"epoch": 0.5,
|
|
"grad_norm": 60.09274289295303,
|
|
"learning_rate": 4.987427838983141e-06,
|
|
"loss": 2.3192,
|
|
"mean_token_accuracy": 0.5898437672294676,
|
|
"num_tokens": 36433167.0,
|
|
"step": 44
|
|
},
|
|
{
|
|
"entropy": 0.5478591918945312,
|
|
"epoch": 0.5113636363636364,
|
|
"grad_norm": 60.55403859339502,
|
|
"learning_rate": 4.985808743747817e-06,
|
|
"loss": 2.3149,
|
|
"mean_token_accuracy": 0.6705729302484542,
|
|
"num_tokens": 37236428.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 0.5397415161132812,
|
|
"epoch": 0.5227272727272727,
|
|
"grad_norm": 62.35805884322216,
|
|
"learning_rate": 4.984091904405793e-06,
|
|
"loss": 2.2891,
|
|
"mean_token_accuracy": 0.8437500081490725,
|
|
"num_tokens": 38067267.0,
|
|
"step": 46
|
|
},
|
|
{
|
|
"entropy": 0.5498504638671875,
|
|
"epoch": 0.5340909090909091,
|
|
"grad_norm": 60.83879041574092,
|
|
"learning_rate": 4.9822773884646444e-06,
|
|
"loss": 2.2473,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 38875935.0,
|
|
"step": 47
|
|
},
|
|
{
|
|
"entropy": 0.5675277709960938,
|
|
"epoch": 0.5454545454545454,
|
|
"grad_norm": 61.64315026315899,
|
|
"learning_rate": 4.980365267272679e-06,
|
|
"loss": 2.2178,
|
|
"mean_token_accuracy": 0.9218750046566129,
|
|
"num_tokens": 39647451.0,
|
|
"step": 48
|
|
},
|
|
{
|
|
"entropy": 0.5543670654296875,
|
|
"epoch": 0.5568181818181818,
|
|
"grad_norm": 60.46219588564338,
|
|
"learning_rate": 4.97835561601612e-06,
|
|
"loss": 2.2005,
|
|
"mean_token_accuracy": 0.9088541720993817,
|
|
"num_tokens": 40443361.0,
|
|
"step": 49
|
|
},
|
|
{
|
|
"entropy": 0.5416412353515625,
|
|
"epoch": 0.5681818181818182,
|
|
"grad_norm": 61.28005392495543,
|
|
"learning_rate": 4.97624851371616e-06,
|
|
"loss": 2.1746,
|
|
"mean_token_accuracy": 0.912760421866551,
|
|
"num_tokens": 41287350.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 0.5378799438476562,
|
|
"epoch": 0.5795454545454546,
|
|
"grad_norm": 60.86193896824303,
|
|
"learning_rate": 4.974044043225846e-06,
|
|
"loss": 2.1638,
|
|
"mean_token_accuracy": 0.8971354227978736,
|
|
"num_tokens": 42124083.0,
|
|
"step": 51
|
|
},
|
|
{
|
|
"entropy": 0.5416336059570312,
|
|
"epoch": 0.5909090909090909,
|
|
"grad_norm": 60.167542832450046,
|
|
"learning_rate": 4.9717422912268265e-06,
|
|
"loss": 2.1153,
|
|
"mean_token_accuracy": 0.923177087912336,
|
|
"num_tokens": 42953412.0,
|
|
"step": 52
|
|
},
|
|
{
|
|
"entropy": 0.5258102416992188,
|
|
"epoch": 0.6022727272727273,
|
|
"grad_norm": 60.83496750149086,
|
|
"learning_rate": 4.969343348225942e-06,
|
|
"loss": 2.1027,
|
|
"mean_token_accuracy": 0.9166666716337204,
|
|
"num_tokens": 43836493.0,
|
|
"step": 53
|
|
},
|
|
{
|
|
"entropy": 0.5307540893554688,
|
|
"epoch": 0.6136363636363636,
|
|
"grad_norm": 59.703581210859504,
|
|
"learning_rate": 4.966847308551664e-06,
|
|
"loss": 2.0617,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 44713961.0,
|
|
"step": 54
|
|
},
|
|
{
|
|
"entropy": 0.5646438598632812,
|
|
"epoch": 0.625,
|
|
"grad_norm": 59.70907545001026,
|
|
"learning_rate": 4.9642542703503874e-06,
|
|
"loss": 2.0297,
|
|
"mean_token_accuracy": 0.9309895874466747,
|
|
"num_tokens": 45494752.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 0.5508499145507812,
|
|
"epoch": 0.6363636363636364,
|
|
"grad_norm": 60.33452301695544,
|
|
"learning_rate": 4.961564335582572e-06,
|
|
"loss": 2.0186,
|
|
"mean_token_accuracy": 0.9101562553551048,
|
|
"num_tokens": 46313051.0,
|
|
"step": 56
|
|
},
|
|
{
|
|
"entropy": 0.5439376831054688,
|
|
"epoch": 0.6477272727272727,
|
|
"grad_norm": 59.417420203908236,
|
|
"learning_rate": 4.958777610018734e-06,
|
|
"loss": 1.9711,
|
|
"mean_token_accuracy": 0.9361979204695672,
|
|
"num_tokens": 47131891.0,
|
|
"step": 57
|
|
},
|
|
{
|
|
"entropy": 0.535064697265625,
|
|
"epoch": 0.6590909090909091,
|
|
"grad_norm": 61.1612600556266,
|
|
"learning_rate": 4.955894203235285e-06,
|
|
"loss": 1.987,
|
|
"mean_token_accuracy": 0.9036458390764892,
|
|
"num_tokens": 47986676.0,
|
|
"step": 58
|
|
},
|
|
{
|
|
"entropy": 0.5515365600585938,
|
|
"epoch": 0.6704545454545454,
|
|
"grad_norm": 59.294618242081526,
|
|
"learning_rate": 4.952914228610221e-06,
|
|
"loss": 1.925,
|
|
"mean_token_accuracy": 0.9335937539581209,
|
|
"num_tokens": 48787244.0,
|
|
"step": 59
|
|
},
|
|
{
|
|
"entropy": 0.5373687744140625,
|
|
"epoch": 0.6818181818181818,
|
|
"grad_norm": 59.999785375745,
|
|
"learning_rate": 4.949837803318672e-06,
|
|
"loss": 1.9208,
|
|
"mean_token_accuracy": 0.9075520888436586,
|
|
"num_tokens": 49618007.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 0.5391998291015625,
|
|
"epoch": 0.6931818181818182,
|
|
"grad_norm": 59.46795359368021,
|
|
"learning_rate": 4.946665048328288e-06,
|
|
"loss": 1.9003,
|
|
"mean_token_accuracy": 0.9192708381451666,
|
|
"num_tokens": 50454992.0,
|
|
"step": 61
|
|
},
|
|
{
|
|
"entropy": 0.5372238159179688,
|
|
"epoch": 0.7045454545454546,
|
|
"grad_norm": 58.901412847118884,
|
|
"learning_rate": 4.943396088394482e-06,
|
|
"loss": 1.8635,
|
|
"mean_token_accuracy": 0.923177087912336,
|
|
"num_tokens": 51281637.0,
|
|
"step": 62
|
|
},
|
|
{
|
|
"entropy": 0.5271835327148438,
|
|
"epoch": 0.7159090909090909,
|
|
"grad_norm": 59.33608755577397,
|
|
"learning_rate": 4.940031052055532e-06,
|
|
"loss": 1.8429,
|
|
"mean_token_accuracy": 0.9218750046566129,
|
|
"num_tokens": 52144982.0,
|
|
"step": 63
|
|
},
|
|
{
|
|
"entropy": 0.5448532104492188,
|
|
"epoch": 0.7272727272727273,
|
|
"grad_norm": 58.68009408640386,
|
|
"learning_rate": 4.936570071627517e-06,
|
|
"loss": 1.8119,
|
|
"mean_token_accuracy": 0.923177087912336,
|
|
"num_tokens": 52944511.0,
|
|
"step": 64
|
|
},
|
|
{
|
|
"entropy": 0.5371246337890625,
|
|
"epoch": 0.7386363636363636,
|
|
"grad_norm": 58.322502737167724,
|
|
"learning_rate": 4.933013283199124e-06,
|
|
"loss": 1.7839,
|
|
"mean_token_accuracy": 0.9335937539581209,
|
|
"num_tokens": 53800640.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 0.5559005737304688,
|
|
"epoch": 0.75,
|
|
"grad_norm": 58.84329551172846,
|
|
"learning_rate": 4.929360826626286e-06,
|
|
"loss": 1.7529,
|
|
"mean_token_accuracy": 0.923177087912336,
|
|
"num_tokens": 54608145.0,
|
|
"step": 66
|
|
},
|
|
{
|
|
"entropy": 0.5375823974609375,
|
|
"epoch": 0.7613636363636364,
|
|
"grad_norm": 58.43819701446535,
|
|
"learning_rate": 4.925612845526691e-06,
|
|
"loss": 1.7149,
|
|
"mean_token_accuracy": 0.9309895874466747,
|
|
"num_tokens": 55447213.0,
|
|
"step": 67
|
|
},
|
|
{
|
|
"entropy": 0.53643798828125,
|
|
"epoch": 0.7727272727272727,
|
|
"grad_norm": 58.71021880546486,
|
|
"learning_rate": 4.921769487274132e-06,
|
|
"loss": 1.7003,
|
|
"mean_token_accuracy": 0.9257812544237822,
|
|
"num_tokens": 56264710.0,
|
|
"step": 68
|
|
},
|
|
{
|
|
"entropy": 0.5373077392578125,
|
|
"epoch": 0.7840909090909091,
|
|
"grad_norm": 59.089983009907726,
|
|
"learning_rate": 4.917830902992716e-06,
|
|
"loss": 1.6821,
|
|
"mean_token_accuracy": 0.9309895874466747,
|
|
"num_tokens": 57110504.0,
|
|
"step": 69
|
|
},
|
|
{
|
|
"entropy": 0.5417709350585938,
|
|
"epoch": 0.7954545454545454,
|
|
"grad_norm": 59.291978039995975,
|
|
"learning_rate": 4.913797247550912e-06,
|
|
"loss": 1.6536,
|
|
"mean_token_accuracy": 0.9257812544237822,
|
|
"num_tokens": 57947180.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 0.5328216552734375,
|
|
"epoch": 0.8068181818181818,
|
|
"grad_norm": 59.78514618831787,
|
|
"learning_rate": 4.9096686795554725e-06,
|
|
"loss": 1.6315,
|
|
"mean_token_accuracy": 0.9153645883779973,
|
|
"num_tokens": 58807727.0,
|
|
"step": 71
|
|
},
|
|
{
|
|
"entropy": 0.5300979614257812,
|
|
"epoch": 0.8181818181818182,
|
|
"grad_norm": 58.70313760166556,
|
|
"learning_rate": 4.90544536134519e-06,
|
|
"loss": 1.6079,
|
|
"mean_token_accuracy": 0.9153645883779973,
|
|
"num_tokens": 59648751.0,
|
|
"step": 72
|
|
},
|
|
{
|
|
"entropy": 0.5442657470703125,
|
|
"epoch": 0.8295454545454546,
|
|
"grad_norm": 58.89538681848992,
|
|
"learning_rate": 4.901127458984516e-06,
|
|
"loss": 1.57,
|
|
"mean_token_accuracy": 0.923177087912336,
|
|
"num_tokens": 60449685.0,
|
|
"step": 73
|
|
},
|
|
{
|
|
"entropy": 0.5391616821289062,
|
|
"epoch": 0.8409090909090909,
|
|
"grad_norm": 58.164567635067016,
|
|
"learning_rate": 4.8967151422570314e-06,
|
|
"loss": 1.5271,
|
|
"mean_token_accuracy": 0.9388020869810134,
|
|
"num_tokens": 61277350.0,
|
|
"step": 74
|
|
},
|
|
{
|
|
"entropy": 0.5380706787109375,
|
|
"epoch": 0.8522727272727273,
|
|
"grad_norm": 58.50553706931795,
|
|
"learning_rate": 4.89220858465877e-06,
|
|
"loss": 1.5198,
|
|
"mean_token_accuracy": 0.9179687548894435,
|
|
"num_tokens": 62103032.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 0.5351943969726562,
|
|
"epoch": 0.8636363636363636,
|
|
"grad_norm": 58.68232205702871,
|
|
"learning_rate": 4.887607963391394e-06,
|
|
"loss": 1.4977,
|
|
"mean_token_accuracy": 0.9036458390764892,
|
|
"num_tokens": 62933713.0,
|
|
"step": 76
|
|
},
|
|
{
|
|
"entropy": 0.5309371948242188,
|
|
"epoch": 0.875,
|
|
"grad_norm": 58.4965215154889,
|
|
"learning_rate": 4.882913459355233e-06,
|
|
"loss": 1.4475,
|
|
"mean_token_accuracy": 0.9309895874466747,
|
|
"num_tokens": 63777329.0,
|
|
"step": 77
|
|
},
|
|
{
|
|
"entropy": 0.5414886474609375,
|
|
"epoch": 0.8863636363636364,
|
|
"grad_norm": 58.945500460294106,
|
|
"learning_rate": 4.878125257142165e-06,
|
|
"loss": 1.406,
|
|
"mean_token_accuracy": 0.9440104200039059,
|
|
"num_tokens": 64599815.0,
|
|
"step": 78
|
|
},
|
|
{
|
|
"entropy": 0.5440673828125,
|
|
"epoch": 0.8977272727272727,
|
|
"grad_norm": 59.64424314623865,
|
|
"learning_rate": 4.873243545028356e-06,
|
|
"loss": 1.3944,
|
|
"mean_token_accuracy": 0.9388020869810134,
|
|
"num_tokens": 65404581.0,
|
|
"step": 79
|
|
},
|
|
{
|
|
"entropy": 0.5271148681640625,
|
|
"epoch": 0.9090909090909091,
|
|
"grad_norm": 58.8986649354466,
|
|
"learning_rate": 4.868268514966869e-06,
|
|
"loss": 1.3574,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 66249371.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 0.533050537109375,
|
|
"epoch": 0.9204545454545454,
|
|
"grad_norm": 58.86258736268561,
|
|
"learning_rate": 4.8632003625800995e-06,
|
|
"loss": 1.3254,
|
|
"mean_token_accuracy": 0.9414062534924597,
|
|
"num_tokens": 67065160.0,
|
|
"step": 81
|
|
},
|
|
{
|
|
"entropy": 0.5338363647460938,
|
|
"epoch": 0.9318181818181818,
|
|
"grad_norm": 58.54026874474276,
|
|
"learning_rate": 4.858039287152095e-06,
|
|
"loss": 1.3278,
|
|
"mean_token_accuracy": 0.9192708381451666,
|
|
"num_tokens": 67910596.0,
|
|
"step": 82
|
|
},
|
|
{
|
|
"entropy": 0.54443359375,
|
|
"epoch": 0.9431818181818182,
|
|
"grad_norm": 59.28551515148561,
|
|
"learning_rate": 4.852785491620716e-06,
|
|
"loss": 1.2784,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 68714945.0,
|
|
"step": 83
|
|
},
|
|
{
|
|
"entropy": 0.5306396484375,
|
|
"epoch": 0.9545454545454546,
|
|
"grad_norm": 58.110179443715445,
|
|
"learning_rate": 4.847439182569656e-06,
|
|
"loss": 1.2402,
|
|
"mean_token_accuracy": 0.9479166697710752,
|
|
"num_tokens": 69553156.0,
|
|
"step": 84
|
|
},
|
|
{
|
|
"entropy": 0.5545120239257812,
|
|
"epoch": 0.9659090909090909,
|
|
"grad_norm": 57.89545411687433,
|
|
"learning_rate": 4.84200057022032e-06,
|
|
"loss": 1.2466,
|
|
"mean_token_accuracy": 0.9179687548894435,
|
|
"num_tokens": 70339835.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 0.5556106567382812,
|
|
"epoch": 0.9772727272727273,
|
|
"grad_norm": 57.867886870613994,
|
|
"learning_rate": 4.836469868423552e-06,
|
|
"loss": 1.19,
|
|
"mean_token_accuracy": 0.9322916707023978,
|
|
"num_tokens": 71112665.0,
|
|
"step": 86
|
|
},
|
|
{
|
|
"entropy": 0.5272903442382812,
|
|
"epoch": 0.9886363636363636,
|
|
"grad_norm": 57.47960696717133,
|
|
"learning_rate": 4.830847294651236e-06,
|
|
"loss": 1.1546,
|
|
"mean_token_accuracy": 0.9414062534924597,
|
|
"num_tokens": 71973001.0,
|
|
"step": 87
|
|
},
|
|
{
|
|
"entropy": 0.5232162475585938,
|
|
"epoch": 1.0,
|
|
"grad_norm": 57.363764683028826,
|
|
"learning_rate": 4.825133069987737e-06,
|
|
"loss": 1.128,
|
|
"mean_token_accuracy": 0.9544270860496908,
|
|
"num_tokens": 72849046.0,
|
|
"step": 88
|
|
},
|
|
{
|
|
"entropy": 0.5514373779296875,
|
|
"epoch": 1.0113636363636365,
|
|
"grad_norm": 58.53909714804268,
|
|
"learning_rate": 4.819327419121215e-06,
|
|
"loss": 1.1251,
|
|
"mean_token_accuracy": 0.9309895874466747,
|
|
"num_tokens": 73657560.0,
|
|
"step": 89
|
|
},
|
|
{
|
|
"entropy": 0.5366439819335938,
|
|
"epoch": 1.0227272727272727,
|
|
"grad_norm": 57.5800616711963,
|
|
"learning_rate": 4.81343057033478e-06,
|
|
"loss": 1.0801,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 74476899.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 0.5482177734375,
|
|
"epoch": 1.0340909090909092,
|
|
"grad_norm": 56.91254331845363,
|
|
"learning_rate": 4.8074427554975235e-06,
|
|
"loss": 1.0439,
|
|
"mean_token_accuracy": 0.9466145865153521,
|
|
"num_tokens": 75271183.0,
|
|
"step": 91
|
|
},
|
|
{
|
|
"entropy": 0.5299148559570312,
|
|
"epoch": 1.0454545454545454,
|
|
"grad_norm": 56.66420629218523,
|
|
"learning_rate": 4.8013642100554034e-06,
|
|
"loss": 1.0211,
|
|
"mean_token_accuracy": 0.9440104200039059,
|
|
"num_tokens": 76125797.0,
|
|
"step": 92
|
|
},
|
|
{
|
|
"entropy": 0.5338363647460938,
|
|
"epoch": 1.0568181818181819,
|
|
"grad_norm": 57.11090670537613,
|
|
"learning_rate": 4.795195173021976e-06,
|
|
"loss": 0.9976,
|
|
"mean_token_accuracy": 0.9427083367481828,
|
|
"num_tokens": 76945222.0,
|
|
"step": 93
|
|
},
|
|
{
|
|
"entropy": 0.5279388427734375,
|
|
"epoch": 1.0681818181818181,
|
|
"grad_norm": 56.299495025749735,
|
|
"learning_rate": 4.7889358869690065e-06,
|
|
"loss": 0.9768,
|
|
"mean_token_accuracy": 0.9440104200039059,
|
|
"num_tokens": 77779661.0,
|
|
"step": 94
|
|
},
|
|
{
|
|
"entropy": 0.535430908203125,
|
|
"epoch": 1.0795454545454546,
|
|
"grad_norm": 56.34968738969898,
|
|
"learning_rate": 4.782586598016928e-06,
|
|
"loss": 0.9661,
|
|
"mean_token_accuracy": 0.9361979204695672,
|
|
"num_tokens": 78597732.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 0.5536880493164062,
|
|
"epoch": 1.0909090909090908,
|
|
"grad_norm": 55.85444993026245,
|
|
"learning_rate": 4.776147555825164e-06,
|
|
"loss": 0.9308,
|
|
"mean_token_accuracy": 0.9414062534924597,
|
|
"num_tokens": 79368044.0,
|
|
"step": 96
|
|
},
|
|
{
|
|
"entropy": 0.53662109375,
|
|
"epoch": 1.1022727272727273,
|
|
"grad_norm": 55.50053901427309,
|
|
"learning_rate": 4.769619013582309e-06,
|
|
"loss": 0.8934,
|
|
"mean_token_accuracy": 0.955729169305414,
|
|
"num_tokens": 80185996.0,
|
|
"step": 97
|
|
},
|
|
{
|
|
"entropy": 0.5202407836914062,
|
|
"epoch": 1.1136363636363635,
|
|
"grad_norm": 55.28328962448184,
|
|
"learning_rate": 4.7630012279961805e-06,
|
|
"loss": 0.886,
|
|
"mean_token_accuracy": 0.9361979204695672,
|
|
"num_tokens": 81053015.0,
|
|
"step": 98
|
|
},
|
|
{
|
|
"entropy": 0.5313262939453125,
|
|
"epoch": 1.125,
|
|
"grad_norm": 54.8320410044771,
|
|
"learning_rate": 4.7562944592837145e-06,
|
|
"loss": 0.8601,
|
|
"mean_token_accuracy": 0.9479166697710752,
|
|
"num_tokens": 81874229.0,
|
|
"step": 99
|
|
},
|
|
{
|
|
"entropy": 0.52691650390625,
|
|
"epoch": 1.1363636363636362,
|
|
"grad_norm": 54.964844470984715,
|
|
"learning_rate": 4.749498971160742e-06,
|
|
"loss": 0.8504,
|
|
"mean_token_accuracy": 0.9361979204695672,
|
|
"num_tokens": 82712812.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 0.524139404296875,
|
|
"epoch": 1.1477272727272727,
|
|
"grad_norm": 54.2272069397531,
|
|
"learning_rate": 4.742615030831615e-06,
|
|
"loss": 0.8163,
|
|
"mean_token_accuracy": 0.9492187530267984,
|
|
"num_tokens": 83530069.0,
|
|
"step": 101
|
|
},
|
|
{
|
|
"entropy": 0.5243072509765625,
|
|
"epoch": 1.1590909090909092,
|
|
"grad_norm": 53.79974981157999,
|
|
"learning_rate": 4.735642908978704e-06,
|
|
"loss": 0.7875,
|
|
"mean_token_accuracy": 0.945312503259629,
|
|
"num_tokens": 84350790.0,
|
|
"step": 102
|
|
},
|
|
{
|
|
"entropy": 0.5246505737304688,
|
|
"epoch": 1.1704545454545454,
|
|
"grad_norm": 53.51593166643302,
|
|
"learning_rate": 4.728582879751746e-06,
|
|
"loss": 0.7576,
|
|
"mean_token_accuracy": 0.9544270860496908,
|
|
"num_tokens": 85161819.0,
|
|
"step": 103
|
|
},
|
|
{
|
|
"entropy": 0.509765625,
|
|
"epoch": 1.1818181818181819,
|
|
"grad_norm": 53.138333179917204,
|
|
"learning_rate": 4.721435220757078e-06,
|
|
"loss": 0.7282,
|
|
"mean_token_accuracy": 0.9596354190725833,
|
|
"num_tokens": 86006089.0,
|
|
"step": 104
|
|
},
|
|
{
|
|
"entropy": 0.527130126953125,
|
|
"epoch": 1.1931818181818181,
|
|
"grad_norm": 52.99230989719208,
|
|
"learning_rate": 4.714200213046707e-06,
|
|
"loss": 0.741,
|
|
"mean_token_accuracy": 0.9414062534924597,
|
|
"num_tokens": 86836190.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 0.5215072631835938,
|
|
"epoch": 1.2045454545454546,
|
|
"grad_norm": 52.31920733776493,
|
|
"learning_rate": 4.706878141107269e-06,
|
|
"loss": 0.7269,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 87654111.0,
|
|
"step": 106
|
|
},
|
|
{
|
|
"entropy": 0.5284652709960938,
|
|
"epoch": 1.2159090909090908,
|
|
"grad_norm": 51.7538829709675,
|
|
"learning_rate": 4.699469292848839e-06,
|
|
"loss": 0.6748,
|
|
"mean_token_accuracy": 0.9544270860496908,
|
|
"num_tokens": 88442734.0,
|
|
"step": 107
|
|
},
|
|
{
|
|
"entropy": 0.5117950439453125,
|
|
"epoch": 1.2272727272727273,
|
|
"grad_norm": 50.822310194810335,
|
|
"learning_rate": 4.691973959593609e-06,
|
|
"loss": 0.6566,
|
|
"mean_token_accuracy": 0.9505208362825215,
|
|
"num_tokens": 89296093.0,
|
|
"step": 108
|
|
},
|
|
{
|
|
"entropy": 0.5231399536132812,
|
|
"epoch": 1.2386363636363638,
|
|
"grad_norm": 50.17015940796914,
|
|
"learning_rate": 4.6843924360644385e-06,
|
|
"loss": 0.6354,
|
|
"mean_token_accuracy": 0.9583333358168602,
|
|
"num_tokens": 90133661.0,
|
|
"step": 109
|
|
},
|
|
{
|
|
"entropy": 0.5147552490234375,
|
|
"epoch": 1.25,
|
|
"grad_norm": 48.95194138585349,
|
|
"learning_rate": 4.676725020373255e-06,
|
|
"loss": 0.6273,
|
|
"mean_token_accuracy": 0.9414062534924597,
|
|
"num_tokens": 90980602.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 0.52349853515625,
|
|
"epoch": 1.2613636363636362,
|
|
"grad_norm": 47.39724961748752,
|
|
"learning_rate": 4.6689720140093445e-06,
|
|
"loss": 0.5975,
|
|
"mean_token_accuracy": 0.945312503259629,
|
|
"num_tokens": 91807048.0,
|
|
"step": 111
|
|
},
|
|
{
|
|
"entropy": 0.521514892578125,
|
|
"epoch": 1.2727272727272727,
|
|
"grad_norm": 46.42644291070126,
|
|
"learning_rate": 4.661133721827487e-06,
|
|
"loss": 0.5747,
|
|
"mean_token_accuracy": 0.9440104200039059,
|
|
"num_tokens": 92647140.0,
|
|
"step": 112
|
|
},
|
|
{
|
|
"entropy": 0.5202713012695312,
|
|
"epoch": 1.2840909090909092,
|
|
"grad_norm": 46.2730968549775,
|
|
"learning_rate": 4.653210452035974e-06,
|
|
"loss": 0.5663,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 93498584.0,
|
|
"step": 113
|
|
},
|
|
{
|
|
"entropy": 0.52655029296875,
|
|
"epoch": 1.2954545454545454,
|
|
"grad_norm": 46.71747285833351,
|
|
"learning_rate": 4.645202516184492e-06,
|
|
"loss": 0.5568,
|
|
"mean_token_accuracy": 0.945312503259629,
|
|
"num_tokens": 94324355.0,
|
|
"step": 114
|
|
},
|
|
{
|
|
"entropy": 0.5159225463867188,
|
|
"epoch": 1.3068181818181819,
|
|
"grad_norm": 45.84890607800894,
|
|
"learning_rate": 4.6371102291518635e-06,
|
|
"loss": 0.5298,
|
|
"mean_token_accuracy": 0.9427083367481828,
|
|
"num_tokens": 95183202.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 0.5389862060546875,
|
|
"epoch": 1.3181818181818181,
|
|
"grad_norm": 42.59073288918198,
|
|
"learning_rate": 4.628933909133674e-06,
|
|
"loss": 0.5097,
|
|
"mean_token_accuracy": 0.945312503259629,
|
|
"num_tokens": 95998136.0,
|
|
"step": 116
|
|
},
|
|
{
|
|
"entropy": 0.529327392578125,
|
|
"epoch": 1.3295454545454546,
|
|
"grad_norm": 41.670866693165614,
|
|
"learning_rate": 4.620673877629757e-06,
|
|
"loss": 0.4863,
|
|
"mean_token_accuracy": 0.9466145865153521,
|
|
"num_tokens": 96842943.0,
|
|
"step": 117
|
|
},
|
|
{
|
|
"entropy": 0.5238037109375,
|
|
"epoch": 1.3409090909090908,
|
|
"grad_norm": 40.26261711766899,
|
|
"learning_rate": 4.612330459431552e-06,
|
|
"loss": 0.4633,
|
|
"mean_token_accuracy": 0.9596354190725833,
|
|
"num_tokens": 97710263.0,
|
|
"step": 118
|
|
},
|
|
{
|
|
"entropy": 0.5423736572265625,
|
|
"epoch": 1.3522727272727273,
|
|
"grad_norm": 42.256122892341864,
|
|
"learning_rate": 4.603903982609334e-06,
|
|
"loss": 0.4653,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 98497301.0,
|
|
"step": 119
|
|
},
|
|
{
|
|
"entropy": 0.5260009765625,
|
|
"epoch": 1.3636363636363638,
|
|
"grad_norm": 43.312904169159644,
|
|
"learning_rate": 4.595394778499314e-06,
|
|
"loss": 0.5063,
|
|
"mean_token_accuracy": 0.9140625051222742,
|
|
"num_tokens": 99332885.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 0.5403060913085938,
|
|
"epoch": 1.375,
|
|
"grad_norm": 35.515421668024835,
|
|
"learning_rate": 4.586803181690609e-06,
|
|
"loss": 0.4049,
|
|
"mean_token_accuracy": 0.9596354190725833,
|
|
"num_tokens": 100153118.0,
|
|
"step": 121
|
|
},
|
|
{
|
|
"entropy": 0.530426025390625,
|
|
"epoch": 1.3863636363636362,
|
|
"grad_norm": 39.33381625799498,
|
|
"learning_rate": 4.5781295300120885e-06,
|
|
"loss": 0.4432,
|
|
"mean_token_accuracy": 0.9192708381451666,
|
|
"num_tokens": 101017186.0,
|
|
"step": 122
|
|
},
|
|
{
|
|
"entropy": 0.54278564453125,
|
|
"epoch": 1.3977272727272727,
|
|
"grad_norm": 33.567114056620284,
|
|
"learning_rate": 4.569374164519088e-06,
|
|
"loss": 0.3836,
|
|
"mean_token_accuracy": 0.9479166697710752,
|
|
"num_tokens": 101837538.0,
|
|
"step": 123
|
|
},
|
|
{
|
|
"entropy": 0.5357742309570312,
|
|
"epoch": 1.4090909090909092,
|
|
"grad_norm": 34.647421460830614,
|
|
"learning_rate": 4.560537429479998e-06,
|
|
"loss": 0.4015,
|
|
"mean_token_accuracy": 0.9322916707023978,
|
|
"num_tokens": 102674523.0,
|
|
"step": 124
|
|
},
|
|
{
|
|
"entropy": 0.5452957153320312,
|
|
"epoch": 1.4204545454545454,
|
|
"grad_norm": 31.848760000487612,
|
|
"learning_rate": 4.5516196723627325e-06,
|
|
"loss": 0.3631,
|
|
"mean_token_accuracy": 0.9440104200039059,
|
|
"num_tokens": 103498410.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 0.5371170043945312,
|
|
"epoch": 1.4318181818181819,
|
|
"grad_norm": 32.82915500567515,
|
|
"learning_rate": 4.542621243821058e-06,
|
|
"loss": 0.3459,
|
|
"mean_token_accuracy": 0.945312503259629,
|
|
"num_tokens": 104317389.0,
|
|
"step": 126
|
|
},
|
|
{
|
|
"entropy": 0.5457000732421875,
|
|
"epoch": 1.4431818181818181,
|
|
"grad_norm": 30.802170293048523,
|
|
"learning_rate": 4.533542497680811e-06,
|
|
"loss": 0.3474,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 105130635.0,
|
|
"step": 127
|
|
},
|
|
{
|
|
"entropy": 0.5292892456054688,
|
|
"epoch": 1.4545454545454546,
|
|
"grad_norm": 28.598798267034123,
|
|
"learning_rate": 4.524383790925987e-06,
|
|
"loss": 0.2939,
|
|
"mean_token_accuracy": 0.9635416688397527,
|
|
"num_tokens": 105967667.0,
|
|
"step": 128
|
|
},
|
|
{
|
|
"entropy": 0.541412353515625,
|
|
"epoch": 1.4659090909090908,
|
|
"grad_norm": 31.427843756705663,
|
|
"learning_rate": 4.515145483684696e-06,
|
|
"loss": 0.3418,
|
|
"mean_token_accuracy": 0.9414062534924597,
|
|
"num_tokens": 106764890.0,
|
|
"step": 129
|
|
},
|
|
{
|
|
"entropy": 0.540252685546875,
|
|
"epoch": 1.4772727272727273,
|
|
"grad_norm": 26.796560576120022,
|
|
"learning_rate": 4.505827939215009e-06,
|
|
"loss": 0.2719,
|
|
"mean_token_accuracy": 0.967447918606922,
|
|
"num_tokens": 107567415.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 0.5262680053710938,
|
|
"epoch": 1.4886363636363638,
|
|
"grad_norm": 28.763274799154782,
|
|
"learning_rate": 4.496431523890673e-06,
|
|
"loss": 0.3127,
|
|
"mean_token_accuracy": 0.9309895874466747,
|
|
"num_tokens": 108411240.0,
|
|
"step": 131
|
|
},
|
|
{
|
|
"entropy": 0.5342483520507812,
|
|
"epoch": 1.5,
|
|
"grad_norm": 25.48018544383524,
|
|
"learning_rate": 4.486956607186702e-06,
|
|
"loss": 0.2803,
|
|
"mean_token_accuracy": 0.9414062534924597,
|
|
"num_tokens": 109210428.0,
|
|
"step": 132
|
|
},
|
|
{
|
|
"entropy": 0.5379180908203125,
|
|
"epoch": 1.5113636363636362,
|
|
"grad_norm": 24.901297620374315,
|
|
"learning_rate": 4.477403561664852e-06,
|
|
"loss": 0.2872,
|
|
"mean_token_accuracy": 0.9401041702367365,
|
|
"num_tokens": 110045310.0,
|
|
"step": 133
|
|
},
|
|
{
|
|
"entropy": 0.5262680053710938,
|
|
"epoch": 1.5227272727272727,
|
|
"grad_norm": 22.091806427948622,
|
|
"learning_rate": 4.467772762958968e-06,
|
|
"loss": 0.2496,
|
|
"mean_token_accuracy": 0.9583333358168602,
|
|
"num_tokens": 110882589.0,
|
|
"step": 134
|
|
},
|
|
{
|
|
"entropy": 0.5295639038085938,
|
|
"epoch": 1.5340909090909092,
|
|
"grad_norm": 23.292778405119115,
|
|
"learning_rate": 4.458064589760221e-06,
|
|
"loss": 0.2408,
|
|
"mean_token_accuracy": 0.9492187530267984,
|
|
"num_tokens": 111705631.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 0.5286483764648438,
|
|
"epoch": 1.5454545454545454,
|
|
"grad_norm": 21.213889535771415,
|
|
"learning_rate": 4.448279423802207e-06,
|
|
"loss": 0.2283,
|
|
"mean_token_accuracy": 0.9518229195382446,
|
|
"num_tokens": 112539225.0,
|
|
"step": 136
|
|
},
|
|
{
|
|
"entropy": 0.5333480834960938,
|
|
"epoch": 1.5568181818181817,
|
|
"grad_norm": 19.684812094647675,
|
|
"learning_rate": 4.438417649845946e-06,
|
|
"loss": 0.2291,
|
|
"mean_token_accuracy": 0.9570312525611371,
|
|
"num_tokens": 113362874.0,
|
|
"step": 137
|
|
},
|
|
{
|
|
"entropy": 0.5320663452148438,
|
|
"epoch": 1.5681818181818183,
|
|
"grad_norm": 18.51754571632508,
|
|
"learning_rate": 4.428479655664748e-06,
|
|
"loss": 0.1981,
|
|
"mean_token_accuracy": 0.9596354190725833,
|
|
"num_tokens": 114182515.0,
|
|
"step": 138
|
|
},
|
|
{
|
|
"entropy": 0.5383987426757812,
|
|
"epoch": 1.5795454545454546,
|
|
"grad_norm": 17.723585060157205,
|
|
"learning_rate": 4.4184658320289675e-06,
|
|
"loss": 0.2078,
|
|
"mean_token_accuracy": 0.9466145865153521,
|
|
"num_tokens": 115003365.0,
|
|
"step": 139
|
|
},
|
|
{
|
|
"entropy": 0.5347518920898438,
|
|
"epoch": 1.5909090909090908,
|
|
"grad_norm": 16.78330133242101,
|
|
"learning_rate": 4.408376572690638e-06,
|
|
"loss": 0.2172,
|
|
"mean_token_accuracy": 0.9505208362825215,
|
|
"num_tokens": 115805016.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 0.5168228149414062,
|
|
"epoch": 1.6022727272727273,
|
|
"grad_norm": 15.362502774932778,
|
|
"learning_rate": 4.3982122743679875e-06,
|
|
"loss": 0.1947,
|
|
"mean_token_accuracy": 0.9622395855840296,
|
|
"num_tokens": 116648818.0,
|
|
"step": 141
|
|
},
|
|
{
|
|
"entropy": 0.5343856811523438,
|
|
"epoch": 1.6136363636363638,
|
|
"grad_norm": 18.024974195199757,
|
|
"learning_rate": 4.387973336729841e-06,
|
|
"loss": 0.2382,
|
|
"mean_token_accuracy": 0.9270833376795053,
|
|
"num_tokens": 117458175.0,
|
|
"step": 142
|
|
},
|
|
{
|
|
"entropy": 0.521270751953125,
|
|
"epoch": 1.625,
|
|
"grad_norm": 14.018027983518943,
|
|
"learning_rate": 4.377660162379904e-06,
|
|
"loss": 0.1929,
|
|
"mean_token_accuracy": 0.9583333358168602,
|
|
"num_tokens": 118312779.0,
|
|
"step": 143
|
|
},
|
|
{
|
|
"entropy": 0.5243148803710938,
|
|
"epoch": 1.6363636363636362,
|
|
"grad_norm": 14.99949121416359,
|
|
"learning_rate": 4.3672731568409344e-06,
|
|
"loss": 0.1898,
|
|
"mean_token_accuracy": 0.9583333358168602,
|
|
"num_tokens": 119140778.0,
|
|
"step": 144
|
|
},
|
|
{
|
|
"entropy": 0.5383453369140625,
|
|
"epoch": 1.6477272727272727,
|
|
"grad_norm": 13.695194386624797,
|
|
"learning_rate": 4.3568127285387925e-06,
|
|
"loss": 0.1841,
|
|
"mean_token_accuracy": 0.9518229195382446,
|
|
"num_tokens": 119957153.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 0.5297164916992188,
|
|
"epoch": 1.6590909090909092,
|
|
"grad_norm": 12.975994099921467,
|
|
"learning_rate": 4.346279288786387e-06,
|
|
"loss": 0.1681,
|
|
"mean_token_accuracy": 0.9622395855840296,
|
|
"num_tokens": 120773292.0,
|
|
"step": 146
|
|
},
|
|
{
|
|
"entropy": 0.5347518920898438,
|
|
"epoch": 1.6704545454545454,
|
|
"grad_norm": 21.362591160453682,
|
|
"learning_rate": 4.3356732517674935e-06,
|
|
"loss": 0.2171,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 121574882.0,
|
|
"step": 147
|
|
},
|
|
{
|
|
"entropy": 0.5269546508789062,
|
|
"epoch": 1.6818181818181817,
|
|
"grad_norm": 19.616475421681326,
|
|
"learning_rate": 4.32499503452048e-06,
|
|
"loss": 0.1828,
|
|
"mean_token_accuracy": 0.9414062534924597,
|
|
"num_tokens": 122412333.0,
|
|
"step": 148
|
|
},
|
|
{
|
|
"entropy": 0.5350723266601562,
|
|
"epoch": 1.6931818181818183,
|
|
"grad_norm": 10.633055149272264,
|
|
"learning_rate": 4.314245056921899e-06,
|
|
"loss": 0.1523,
|
|
"mean_token_accuracy": 0.9570312525611371,
|
|
"num_tokens": 123241270.0,
|
|
"step": 149
|
|
},
|
|
{
|
|
"entropy": 0.5313796997070312,
|
|
"epoch": 1.7045454545454546,
|
|
"grad_norm": 11.24894351109355,
|
|
"learning_rate": 4.303423741669978e-06,
|
|
"loss": 0.1697,
|
|
"mean_token_accuracy": 0.9505208362825215,
|
|
"num_tokens": 124075099.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 0.536651611328125,
|
|
"epoch": 1.7159090909090908,
|
|
"grad_norm": 15.692069667523771,
|
|
"learning_rate": 4.292531514268008e-06,
|
|
"loss": 0.182,
|
|
"mean_token_accuracy": 0.9257812544237822,
|
|
"num_tokens": 124928980.0,
|
|
"step": 151
|
|
},
|
|
{
|
|
"entropy": 0.5392379760742188,
|
|
"epoch": 1.7272727272727273,
|
|
"grad_norm": 9.664745819966296,
|
|
"learning_rate": 4.281568803007601e-06,
|
|
"loss": 0.1791,
|
|
"mean_token_accuracy": 0.9322916707023978,
|
|
"num_tokens": 125748569.0,
|
|
"step": 152
|
|
},
|
|
{
|
|
"entropy": 0.54058837890625,
|
|
"epoch": 1.7386363636363638,
|
|
"grad_norm": 15.28963396758985,
|
|
"learning_rate": 4.270536038951855e-06,
|
|
"loss": 0.1828,
|
|
"mean_token_accuracy": 0.9361979204695672,
|
|
"num_tokens": 126583565.0,
|
|
"step": 153
|
|
},
|
|
{
|
|
"entropy": 0.5325546264648438,
|
|
"epoch": 1.75,
|
|
"grad_norm": 8.152421877410646,
|
|
"learning_rate": 4.259433655918404e-06,
|
|
"loss": 0.1505,
|
|
"mean_token_accuracy": 0.9596354190725833,
|
|
"num_tokens": 127434652.0,
|
|
"step": 154
|
|
},
|
|
{
|
|
"entropy": 0.5367050170898438,
|
|
"epoch": 1.7613636363636362,
|
|
"grad_norm": 13.314659677750605,
|
|
"learning_rate": 4.24826209046236e-06,
|
|
"loss": 0.1763,
|
|
"mean_token_accuracy": 0.945312503259629,
|
|
"num_tokens": 128265274.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 0.5238265991210938,
|
|
"epoch": 1.7727272727272727,
|
|
"grad_norm": 8.732379857922384,
|
|
"learning_rate": 4.237021781859143e-06,
|
|
"loss": 0.1685,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 129118921.0,
|
|
"step": 156
|
|
},
|
|
{
|
|
"entropy": 0.5306396484375,
|
|
"epoch": 1.7840909090909092,
|
|
"grad_norm": 13.453938351329127,
|
|
"learning_rate": 4.225713172087216e-06,
|
|
"loss": 0.1672,
|
|
"mean_token_accuracy": 0.9492187530267984,
|
|
"num_tokens": 129924616.0,
|
|
"step": 157
|
|
},
|
|
{
|
|
"entropy": 0.5287246704101562,
|
|
"epoch": 1.7954545454545454,
|
|
"grad_norm": 8.201845038506509,
|
|
"learning_rate": 4.2143367058107e-06,
|
|
"loss": 0.1545,
|
|
"mean_token_accuracy": 0.955729169305414,
|
|
"num_tokens": 130759784.0,
|
|
"step": 158
|
|
},
|
|
{
|
|
"entropy": 0.5428314208984375,
|
|
"epoch": 1.8068181818181817,
|
|
"grad_norm": 10.976889686141423,
|
|
"learning_rate": 4.202892830361892e-06,
|
|
"loss": 0.1581,
|
|
"mean_token_accuracy": 0.9283854209352285,
|
|
"num_tokens": 131523961.0,
|
|
"step": 159
|
|
},
|
|
{
|
|
"entropy": 0.5345458984375,
|
|
"epoch": 1.8181818181818183,
|
|
"grad_norm": 8.166044940510316,
|
|
"learning_rate": 4.191381995723672e-06,
|
|
"loss": 0.1324,
|
|
"mean_token_accuracy": 0.9570312525611371,
|
|
"num_tokens": 132329209.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 0.5333099365234375,
|
|
"epoch": 1.8295454545454546,
|
|
"grad_norm": 12.143258833540356,
|
|
"learning_rate": 4.179804654511816e-06,
|
|
"loss": 0.1465,
|
|
"mean_token_accuracy": 0.9479166697710752,
|
|
"num_tokens": 133146370.0,
|
|
"step": 161
|
|
},
|
|
{
|
|
"entropy": 0.5122833251953125,
|
|
"epoch": 1.8409090909090908,
|
|
"grad_norm": 6.838641869872832,
|
|
"learning_rate": 4.168161261957192e-06,
|
|
"loss": 0.1375,
|
|
"mean_token_accuracy": 0.9531250027939677,
|
|
"num_tokens": 133988967.0,
|
|
"step": 162
|
|
},
|
|
{
|
|
"entropy": 0.5211181640625,
|
|
"epoch": 1.8522727272727273,
|
|
"grad_norm": 15.025449078819449,
|
|
"learning_rate": 4.1564522758878656e-06,
|
|
"loss": 0.1562,
|
|
"mean_token_accuracy": 0.9427083367481828,
|
|
"num_tokens": 134816536.0,
|
|
"step": 163
|
|
},
|
|
{
|
|
"entropy": 0.5217361450195312,
|
|
"epoch": 1.8636363636363638,
|
|
"grad_norm": 7.58342476695868,
|
|
"learning_rate": 4.144678156711091e-06,
|
|
"loss": 0.1333,
|
|
"mean_token_accuracy": 0.9531250027939677,
|
|
"num_tokens": 135660302.0,
|
|
"step": 164
|
|
},
|
|
{
|
|
"entropy": 0.5208892822265625,
|
|
"epoch": 1.875,
|
|
"grad_norm": 12.39285081760315,
|
|
"learning_rate": 4.132839367395215e-06,
|
|
"loss": 0.144,
|
|
"mean_token_accuracy": 0.9505208362825215,
|
|
"num_tokens": 136493782.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 0.533935546875,
|
|
"epoch": 1.8863636363636362,
|
|
"grad_norm": 11.441864197889792,
|
|
"learning_rate": 4.120936373451467e-06,
|
|
"loss": 0.1625,
|
|
"mean_token_accuracy": 0.9296875041909516,
|
|
"num_tokens": 137296430.0,
|
|
"step": 166
|
|
},
|
|
{
|
|
"entropy": 0.5355453491210938,
|
|
"epoch": 1.8977272727272727,
|
|
"grad_norm": 6.843215535272569,
|
|
"learning_rate": 4.108969642915658e-06,
|
|
"loss": 0.1353,
|
|
"mean_token_accuracy": 0.9544270860496908,
|
|
"num_tokens": 138111216.0,
|
|
"step": 167
|
|
},
|
|
{
|
|
"entropy": 0.5361480712890625,
|
|
"epoch": 1.9090909090909092,
|
|
"grad_norm": 8.869401349710477,
|
|
"learning_rate": 4.096939646329775e-06,
|
|
"loss": 0.1442,
|
|
"mean_token_accuracy": 0.9348958372138441,
|
|
"num_tokens": 138921744.0,
|
|
"step": 168
|
|
},
|
|
{
|
|
"entropy": 0.5290679931640625,
|
|
"epoch": 1.9204545454545454,
|
|
"grad_norm": 4.78181815999074,
|
|
"learning_rate": 4.08484685672348e-06,
|
|
"loss": 0.1167,
|
|
"mean_token_accuracy": 0.9583333358168602,
|
|
"num_tokens": 139736919.0,
|
|
"step": 169
|
|
},
|
|
{
|
|
"entropy": 0.5113754272460938,
|
|
"epoch": 1.9318181818181817,
|
|
"grad_norm": 8.861363241151182,
|
|
"learning_rate": 4.07269174959551e-06,
|
|
"loss": 0.1207,
|
|
"mean_token_accuracy": 0.9622395855840296,
|
|
"num_tokens": 140617027.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 0.5159988403320312,
|
|
"epoch": 1.9431818181818183,
|
|
"grad_norm": 8.006569945370714,
|
|
"learning_rate": 4.06047480289498e-06,
|
|
"loss": 0.1212,
|
|
"mean_token_accuracy": 0.9583333358168602,
|
|
"num_tokens": 141443086.0,
|
|
"step": 171
|
|
},
|
|
{
|
|
"entropy": 0.51470947265625,
|
|
"epoch": 1.9545454545454546,
|
|
"grad_norm": 10.431183762698996,
|
|
"learning_rate": 4.0481964970025885e-06,
|
|
"loss": 0.1442,
|
|
"mean_token_accuracy": 0.9479166697710752,
|
|
"num_tokens": 142285456.0,
|
|
"step": 172
|
|
},
|
|
{
|
|
"entropy": 0.5068130493164062,
|
|
"epoch": 1.9659090909090908,
|
|
"grad_norm": 8.414148072333015,
|
|
"learning_rate": 4.035857314711729e-06,
|
|
"loss": 0.1396,
|
|
"mean_token_accuracy": 0.945312503259629,
|
|
"num_tokens": 143142101.0,
|
|
"step": 173
|
|
},
|
|
{
|
|
"entropy": 0.5181655883789062,
|
|
"epoch": 1.9772727272727273,
|
|
"grad_norm": 7.281260662915632,
|
|
"learning_rate": 4.023457741209509e-06,
|
|
"loss": 0.1226,
|
|
"mean_token_accuracy": 0.9544270860496908,
|
|
"num_tokens": 143972519.0,
|
|
"step": 174
|
|
},
|
|
{
|
|
"entropy": 0.52288818359375,
|
|
"epoch": 1.9886363636363638,
|
|
"grad_norm": 4.149697190907548,
|
|
"learning_rate": 4.0109982640576676e-06,
|
|
"loss": 0.1123,
|
|
"mean_token_accuracy": 0.9648437520954758,
|
|
"num_tokens": 144814806.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 0.517059326171875,
|
|
"epoch": 2.0,
|
|
"grad_norm": 7.882059155042237,
|
|
"learning_rate": 3.998479373173406e-06,
|
|
"loss": 0.1111,
|
|
"mean_token_accuracy": 0.9583333358168602,
|
|
"num_tokens": 145648585.0,
|
|
"step": 176
|
|
},
|
|
{
|
|
"entropy": 0.5061798095703125,
|
|
"epoch": 2.0113636363636362,
|
|
"grad_norm": 3.755333968867442,
|
|
"learning_rate": 3.985901560810126e-06,
|
|
"loss": 0.0993,
|
|
"mean_token_accuracy": 0.9687500018626451,
|
|
"num_tokens": 146516717.0,
|
|
"step": 177
|
|
},
|
|
{
|
|
"entropy": 0.509613037109375,
|
|
"epoch": 2.022727272727273,
|
|
"grad_norm": 7.578047639747768,
|
|
"learning_rate": 3.973265321538069e-06,
|
|
"loss": 0.1273,
|
|
"mean_token_accuracy": 0.9518229195382446,
|
|
"num_tokens": 147377138.0,
|
|
"step": 178
|
|
},
|
|
{
|
|
"entropy": 0.526214599609375,
|
|
"epoch": 2.034090909090909,
|
|
"grad_norm": 9.248690622373502,
|
|
"learning_rate": 3.960571152224872e-06,
|
|
"loss": 0.0844,
|
|
"mean_token_accuracy": 0.9700520851183683,
|
|
"num_tokens": 148164451.0,
|
|
"step": 179
|
|
},
|
|
{
|
|
"entropy": 0.51904296875,
|
|
"epoch": 2.0454545454545454,
|
|
"grad_norm": 4.995096147331217,
|
|
"learning_rate": 3.9478195520160355e-06,
|
|
"loss": 0.0826,
|
|
"mean_token_accuracy": 0.9739583348855376,
|
|
"num_tokens": 148952463.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 0.5115966796875,
|
|
"epoch": 2.0568181818181817,
|
|
"grad_norm": 9.976065288968513,
|
|
"learning_rate": 3.935011022315284e-06,
|
|
"loss": 0.1184,
|
|
"mean_token_accuracy": 0.9570312525611371,
|
|
"num_tokens": 149789754.0,
|
|
"step": 181
|
|
},
|
|
{
|
|
"entropy": 0.5167007446289062,
|
|
"epoch": 2.0681818181818183,
|
|
"grad_norm": 6.753980258357957,
|
|
"learning_rate": 3.922146066764863e-06,
|
|
"loss": 0.1101,
|
|
"mean_token_accuracy": 0.9609375023283064,
|
|
"num_tokens": 150576600.0,
|
|
"step": 182
|
|
},
|
|
{
|
|
"entropy": 0.5221023559570312,
|
|
"epoch": 2.0795454545454546,
|
|
"grad_norm": 4.233617937880569,
|
|
"learning_rate": 3.9092251912257286e-06,
|
|
"loss": 0.073,
|
|
"mean_token_accuracy": 0.9752604181412607,
|
|
"num_tokens": 151392147.0,
|
|
"step": 183
|
|
},
|
|
{
|
|
"entropy": 0.5233230590820312,
|
|
"epoch": 2.090909090909091,
|
|
"grad_norm": 6.412674458392547,
|
|
"learning_rate": 3.896248903757658e-06,
|
|
"loss": 0.0898,
|
|
"mean_token_accuracy": 0.9713541683740914,
|
|
"num_tokens": 152228107.0,
|
|
"step": 184
|
|
},
|
|
{
|
|
"entropy": 0.53240966796875,
|
|
"epoch": 2.102272727272727,
|
|
"grad_norm": 6.31471950629154,
|
|
"learning_rate": 3.883217714599273e-06,
|
|
"loss": 0.1037,
|
|
"mean_token_accuracy": 0.967447918606922,
|
|
"num_tokens": 153053726.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 0.5299835205078125,
|
|
"epoch": 2.1136363636363638,
|
|
"grad_norm": 5.524811254106886,
|
|
"learning_rate": 3.870132136147977e-06,
|
|
"loss": 0.0859,
|
|
"mean_token_accuracy": 0.9726562516298145,
|
|
"num_tokens": 153905918.0,
|
|
"step": 186
|
|
},
|
|
{
|
|
"entropy": 0.5373382568359375,
|
|
"epoch": 2.125,
|
|
"grad_norm": 5.959063680482344,
|
|
"learning_rate": 3.856992682939803e-06,
|
|
"loss": 0.0936,
|
|
"mean_token_accuracy": 0.9739583348855376,
|
|
"num_tokens": 154726918.0,
|
|
"step": 187
|
|
},
|
|
{
|
|
"entropy": 0.5167999267578125,
|
|
"epoch": 2.1363636363636362,
|
|
"grad_norm": 4.047026610074852,
|
|
"learning_rate": 3.84379987162919e-06,
|
|
"loss": 0.088,
|
|
"mean_token_accuracy": 0.9648437520954758,
|
|
"num_tokens": 155571272.0,
|
|
"step": 188
|
|
},
|
|
{
|
|
"entropy": 0.5206451416015625,
|
|
"epoch": 2.147727272727273,
|
|
"grad_norm": 14.950526464718012,
|
|
"learning_rate": 3.830554220968661e-06,
|
|
"loss": 0.117,
|
|
"mean_token_accuracy": 0.9570312525611371,
|
|
"num_tokens": 156393962.0,
|
|
"step": 189
|
|
},
|
|
{
|
|
"entropy": 0.5257492065429688,
|
|
"epoch": 2.159090909090909,
|
|
"grad_norm": 6.350323940638027,
|
|
"learning_rate": 3.817256251788425e-06,
|
|
"loss": 0.0684,
|
|
"mean_token_accuracy": 0.9804687511641532,
|
|
"num_tokens": 157224989.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 0.5198822021484375,
|
|
"epoch": 2.1704545454545454,
|
|
"grad_norm": 17.607176029933502,
|
|
"learning_rate": 3.803906486975901e-06,
|
|
"loss": 0.1604,
|
|
"mean_token_accuracy": 0.9401041702367365,
|
|
"num_tokens": 158088785.0,
|
|
"step": 191
|
|
},
|
|
{
|
|
"entropy": 0.5287246704101562,
|
|
"epoch": 2.1818181818181817,
|
|
"grad_norm": 17.168183781109484,
|
|
"learning_rate": 3.790505451455158e-06,
|
|
"loss": 0.1742,
|
|
"mean_token_accuracy": 0.9375000037252903,
|
|
"num_tokens": 158924638.0,
|
|
"step": 192
|
|
},
|
|
{
|
|
"entropy": 0.5184326171875,
|
|
"epoch": 2.1931818181818183,
|
|
"grad_norm": 6.269834392958401,
|
|
"learning_rate": 3.77705367216627e-06,
|
|
"loss": 0.0732,
|
|
"mean_token_accuracy": 0.9726562516298145,
|
|
"num_tokens": 159745234.0,
|
|
"step": 193
|
|
},
|
|
{
|
|
"entropy": 0.5220794677734375,
|
|
"epoch": 2.2045454545454546,
|
|
"grad_norm": 15.721070801554754,
|
|
"learning_rate": 3.7635516780446e-06,
|
|
"loss": 0.1183,
|
|
"mean_token_accuracy": 0.9544270860496908,
|
|
"num_tokens": 160611881.0,
|
|
"step": 194
|
|
},
|
|
{
|
|
"entropy": 0.53619384765625,
|
|
"epoch": 2.215909090909091,
|
|
"grad_norm": 18.33813662745301,
|
|
"learning_rate": 3.7500000000000005e-06,
|
|
"loss": 0.1767,
|
|
"mean_token_accuracy": 0.9322916707023978,
|
|
"num_tokens": 161426112.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 0.5299606323242188,
|
|
"epoch": 2.227272727272727,
|
|
"grad_norm": 11.189196081242297,
|
|
"learning_rate": 3.7363991708959386e-06,
|
|
"loss": 0.1316,
|
|
"mean_token_accuracy": 0.9479166697710752,
|
|
"num_tokens": 162252165.0,
|
|
"step": 196
|
|
},
|
|
{
|
|
"entropy": 0.5212249755859375,
|
|
"epoch": 2.2386363636363638,
|
|
"grad_norm": 3.775620350441528,
|
|
"learning_rate": 3.7227497255285416e-06,
|
|
"loss": 0.1001,
|
|
"mean_token_accuracy": 0.9635416688397527,
|
|
"num_tokens": 163113356.0,
|
|
"step": 197
|
|
},
|
|
{
|
|
"entropy": 0.5298538208007812,
|
|
"epoch": 2.25,
|
|
"grad_norm": 10.56752963682954,
|
|
"learning_rate": 3.709052200605572e-06,
|
|
"loss": 0.1411,
|
|
"mean_token_accuracy": 0.9492187530267984,
|
|
"num_tokens": 163956631.0,
|
|
"step": 198
|
|
},
|
|
{
|
|
"entropy": 0.5439224243164062,
|
|
"epoch": 2.2613636363636362,
|
|
"grad_norm": 9.338301436041878,
|
|
"learning_rate": 3.6953071347253167e-06,
|
|
"loss": 0.1117,
|
|
"mean_token_accuracy": 0.9583333358168602,
|
|
"num_tokens": 164735604.0,
|
|
"step": 199
|
|
},
|
|
{
|
|
"entropy": 0.530059814453125,
|
|
"epoch": 2.2727272727272725,
|
|
"grad_norm": 2.888565812031598,
|
|
"learning_rate": 3.6815150683554187e-06,
|
|
"loss": 0.0845,
|
|
"mean_token_accuracy": 0.9765625013969839,
|
|
"num_tokens": 165562321.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 0.5339889526367188,
|
|
"epoch": 2.284090909090909,
|
|
"grad_norm": 6.631223942318063,
|
|
"learning_rate": 3.6676765438116157e-06,
|
|
"loss": 0.1074,
|
|
"mean_token_accuracy": 0.9596354190725833,
|
|
"num_tokens": 166385295.0,
|
|
"step": 201
|
|
},
|
|
{
|
|
"entropy": 0.5253219604492188,
|
|
"epoch": 2.2954545454545454,
|
|
"grad_norm": 14.061780677797357,
|
|
"learning_rate": 3.6537921052364223e-06,
|
|
"loss": 0.1289,
|
|
"mean_token_accuracy": 0.9427083367481828,
|
|
"num_tokens": 167193522.0,
|
|
"step": 202
|
|
},
|
|
{
|
|
"entropy": 0.532470703125,
|
|
"epoch": 2.3068181818181817,
|
|
"grad_norm": 6.341361614121059,
|
|
"learning_rate": 3.6398622985777314e-06,
|
|
"loss": 0.0977,
|
|
"mean_token_accuracy": 0.9570312525611371,
|
|
"num_tokens": 168023123.0,
|
|
"step": 203
|
|
},
|
|
{
|
|
"entropy": 0.5181808471679688,
|
|
"epoch": 2.3181818181818183,
|
|
"grad_norm": 7.085053762621007,
|
|
"learning_rate": 3.6258876715673475e-06,
|
|
"loss": 0.1024,
|
|
"mean_token_accuracy": 0.9583333358168602,
|
|
"num_tokens": 168884198.0,
|
|
"step": 204
|
|
},
|
|
{
|
|
"entropy": 0.5181198120117188,
|
|
"epoch": 2.3295454545454546,
|
|
"grad_norm": 6.0840959915959445,
|
|
"learning_rate": 3.611868773699449e-06,
|
|
"loss": 0.0818,
|
|
"mean_token_accuracy": 0.9648437520954758,
|
|
"num_tokens": 169721369.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 0.525115966796875,
|
|
"epoch": 2.340909090909091,
|
|
"grad_norm": 3.8765193636226587,
|
|
"learning_rate": 3.597806156208982e-06,
|
|
"loss": 0.0713,
|
|
"mean_token_accuracy": 0.9765625013969839,
|
|
"num_tokens": 170586563.0,
|
|
"step": 206
|
|
},
|
|
{
|
|
"entropy": 0.5304183959960938,
|
|
"epoch": 2.3522727272727275,
|
|
"grad_norm": 8.283485896461336,
|
|
"learning_rate": 3.5837003720499853e-06,
|
|
"loss": 0.0828,
|
|
"mean_token_accuracy": 0.967447918606922,
|
|
"num_tokens": 171389861.0,
|
|
"step": 207
|
|
},
|
|
{
|
|
"entropy": 0.521881103515625,
|
|
"epoch": 2.3636363636363638,
|
|
"grad_norm": 8.549193433703433,
|
|
"learning_rate": 3.569551975873847e-06,
|
|
"loss": 0.0994,
|
|
"mean_token_accuracy": 0.9635416688397527,
|
|
"num_tokens": 172247665.0,
|
|
"step": 208
|
|
},
|
|
{
|
|
"entropy": 0.52642822265625,
|
|
"epoch": 2.375,
|
|
"grad_norm": 4.731480811392632,
|
|
"learning_rate": 3.555361524007498e-06,
|
|
"loss": 0.0764,
|
|
"mean_token_accuracy": 0.9700520851183683,
|
|
"num_tokens": 173082355.0,
|
|
"step": 209
|
|
},
|
|
{
|
|
"entropy": 0.5319061279296875,
|
|
"epoch": 2.3863636363636362,
|
|
"grad_norm": 4.2015486425538855,
|
|
"learning_rate": 3.541129574431532e-06,
|
|
"loss": 0.0615,
|
|
"mean_token_accuracy": 0.9765625013969839,
|
|
"num_tokens": 173912416.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 0.51190185546875,
|
|
"epoch": 2.3977272727272725,
|
|
"grad_norm": 2.509367450469538,
|
|
"learning_rate": 3.526856686758269e-06,
|
|
"loss": 0.0456,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 174774730.0,
|
|
"step": 211
|
|
},
|
|
{
|
|
"entropy": 0.5276336669921875,
|
|
"epoch": 2.409090909090909,
|
|
"grad_norm": 3.6294914532585163,
|
|
"learning_rate": 3.51254342220975e-06,
|
|
"loss": 0.0551,
|
|
"mean_token_accuracy": 0.977864584652707,
|
|
"num_tokens": 175600690.0,
|
|
"step": 212
|
|
},
|
|
{
|
|
"entropy": 0.51702880859375,
|
|
"epoch": 2.4204545454545454,
|
|
"grad_norm": 6.099017771755878,
|
|
"learning_rate": 3.4981903435956675e-06,
|
|
"loss": 0.0561,
|
|
"mean_token_accuracy": 0.977864584652707,
|
|
"num_tokens": 176433236.0,
|
|
"step": 213
|
|
},
|
|
{
|
|
"entropy": 0.5221633911132812,
|
|
"epoch": 2.4318181818181817,
|
|
"grad_norm": 5.444792509114168,
|
|
"learning_rate": 3.4837980152912393e-06,
|
|
"loss": 0.0638,
|
|
"mean_token_accuracy": 0.9726562516298145,
|
|
"num_tokens": 177252798.0,
|
|
"step": 214
|
|
},
|
|
{
|
|
"entropy": 0.51513671875,
|
|
"epoch": 2.4431818181818183,
|
|
"grad_norm": 6.166295424467747,
|
|
"learning_rate": 3.4693670032150117e-06,
|
|
"loss": 0.0598,
|
|
"mean_token_accuracy": 0.9791666679084301,
|
|
"num_tokens": 178087586.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 0.5172805786132812,
|
|
"epoch": 2.4545454545454546,
|
|
"grad_norm": 7.216293257274722,
|
|
"learning_rate": 3.4548978748066115e-06,
|
|
"loss": 0.059,
|
|
"mean_token_accuracy": 0.9752604181412607,
|
|
"num_tokens": 178904966.0,
|
|
"step": 216
|
|
},
|
|
{
|
|
"entropy": 0.5154190063476562,
|
|
"epoch": 2.465909090909091,
|
|
"grad_norm": 5.774270384427778,
|
|
"learning_rate": 3.440391199004431e-06,
|
|
"loss": 0.0573,
|
|
"mean_token_accuracy": 0.9804687511641532,
|
|
"num_tokens": 179732100.0,
|
|
"step": 217
|
|
},
|
|
{
|
|
"entropy": 0.5367050170898438,
|
|
"epoch": 2.4772727272727275,
|
|
"grad_norm": 5.71656956287317,
|
|
"learning_rate": 3.4258475462232586e-06,
|
|
"loss": 0.0531,
|
|
"mean_token_accuracy": 0.9804687511641532,
|
|
"num_tokens": 180499306.0,
|
|
"step": 218
|
|
},
|
|
{
|
|
"entropy": 0.5283737182617188,
|
|
"epoch": 2.4886363636363638,
|
|
"grad_norm": 5.454253375411191,
|
|
"learning_rate": 3.4112674883318477e-06,
|
|
"loss": 0.0508,
|
|
"mean_token_accuracy": 0.9804687511641532,
|
|
"num_tokens": 181302762.0,
|
|
"step": 219
|
|
},
|
|
{
|
|
"entropy": 0.5136947631835938,
|
|
"epoch": 2.5,
|
|
"grad_norm": 4.865211217099932,
|
|
"learning_rate": 3.3966515986304322e-06,
|
|
"loss": 0.0641,
|
|
"mean_token_accuracy": 0.9791666679084301,
|
|
"num_tokens": 182132901.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 0.5163040161132812,
|
|
"epoch": 2.5113636363636362,
|
|
"grad_norm": 7.9460874989938075,
|
|
"learning_rate": 3.3820004518281835e-06,
|
|
"loss": 0.0641,
|
|
"mean_token_accuracy": 0.9765625013969839,
|
|
"num_tokens": 182956676.0,
|
|
"step": 221
|
|
},
|
|
{
|
|
"entropy": 0.5112228393554688,
|
|
"epoch": 2.5227272727272725,
|
|
"grad_norm": 3.996587612081756,
|
|
"learning_rate": 3.367314624020613e-06,
|
|
"loss": 0.0414,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 183817292.0,
|
|
"step": 222
|
|
},
|
|
{
|
|
"entropy": 0.5117111206054688,
|
|
"epoch": 2.534090909090909,
|
|
"grad_norm": 9.534113890080645,
|
|
"learning_rate": 3.352594692666915e-06,
|
|
"loss": 0.0903,
|
|
"mean_token_accuracy": 0.9661458353511989,
|
|
"num_tokens": 184637423.0,
|
|
"step": 223
|
|
},
|
|
{
|
|
"entropy": 0.5112152099609375,
|
|
"epoch": 2.5454545454545454,
|
|
"grad_norm": 10.168817989484715,
|
|
"learning_rate": 3.337841236567268e-06,
|
|
"loss": 0.0771,
|
|
"mean_token_accuracy": 0.9661458353511989,
|
|
"num_tokens": 185469944.0,
|
|
"step": 224
|
|
},
|
|
{
|
|
"entropy": 0.5121002197265625,
|
|
"epoch": 2.5568181818181817,
|
|
"grad_norm": 3.9431029399036825,
|
|
"learning_rate": 3.32305483584007e-06,
|
|
"loss": 0.0611,
|
|
"mean_token_accuracy": 0.9830729176755995,
|
|
"num_tokens": 186292612.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 0.515106201171875,
|
|
"epoch": 2.5681818181818183,
|
|
"grad_norm": 4.814525321844357,
|
|
"learning_rate": 3.30823607189913e-06,
|
|
"loss": 0.064,
|
|
"mean_token_accuracy": 0.9804687511641532,
|
|
"num_tokens": 187101708.0,
|
|
"step": 226
|
|
},
|
|
{
|
|
"entropy": 0.5209121704101562,
|
|
"epoch": 2.5795454545454546,
|
|
"grad_norm": 4.496547790724185,
|
|
"learning_rate": 3.2933855274308067e-06,
|
|
"loss": 0.0629,
|
|
"mean_token_accuracy": 0.9804687511641532,
|
|
"num_tokens": 187907131.0,
|
|
"step": 227
|
|
},
|
|
{
|
|
"entropy": 0.5145339965820312,
|
|
"epoch": 2.590909090909091,
|
|
"grad_norm": 4.755114257664067,
|
|
"learning_rate": 3.278503786371095e-06,
|
|
"loss": 0.044,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 188726193.0,
|
|
"step": 228
|
|
},
|
|
{
|
|
"entropy": 0.5033798217773438,
|
|
"epoch": 2.6022727272727275,
|
|
"grad_norm": 7.053025964916729,
|
|
"learning_rate": 3.2635914338826665e-06,
|
|
"loss": 0.0641,
|
|
"mean_token_accuracy": 0.977864584652707,
|
|
"num_tokens": 189574751.0,
|
|
"step": 229
|
|
},
|
|
{
|
|
"entropy": 0.5178298950195312,
|
|
"epoch": 2.6136363636363638,
|
|
"grad_norm": 2.732036872044337,
|
|
"learning_rate": 3.2486490563318605e-06,
|
|
"loss": 0.0449,
|
|
"mean_token_accuracy": 0.9856770841870457,
|
|
"num_tokens": 190392439.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 0.5087661743164062,
|
|
"epoch": 2.625,
|
|
"grad_norm": 3.7713000538715247,
|
|
"learning_rate": 3.233677241265627e-06,
|
|
"loss": 0.0564,
|
|
"mean_token_accuracy": 0.9817708344198763,
|
|
"num_tokens": 191230801.0,
|
|
"step": 231
|
|
},
|
|
{
|
|
"entropy": 0.5154266357421875,
|
|
"epoch": 2.6363636363636362,
|
|
"grad_norm": 3.296081610191933,
|
|
"learning_rate": 3.218676577388424e-06,
|
|
"loss": 0.0474,
|
|
"mean_token_accuracy": 0.9856770841870457,
|
|
"num_tokens": 192055235.0,
|
|
"step": 232
|
|
},
|
|
{
|
|
"entropy": 0.5103530883789062,
|
|
"epoch": 2.6477272727272725,
|
|
"grad_norm": 3.5282268907437073,
|
|
"learning_rate": 3.2036476545390695e-06,
|
|
"loss": 0.0377,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 192908727.0,
|
|
"step": 233
|
|
},
|
|
{
|
|
"entropy": 0.521148681640625,
|
|
"epoch": 2.659090909090909,
|
|
"grad_norm": 6.036365365275266,
|
|
"learning_rate": 3.188591063667548e-06,
|
|
"loss": 0.0493,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 193727407.0,
|
|
"step": 234
|
|
},
|
|
{
|
|
"entropy": 0.511871337890625,
|
|
"epoch": 2.6704545454545454,
|
|
"grad_norm": 5.617886246430549,
|
|
"learning_rate": 3.1735073968117743e-06,
|
|
"loss": 0.0452,
|
|
"mean_token_accuracy": 0.9817708344198763,
|
|
"num_tokens": 194567749.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 0.5137405395507812,
|
|
"epoch": 2.6818181818181817,
|
|
"grad_norm": 5.613230841959403,
|
|
"learning_rate": 3.1583972470743123e-06,
|
|
"loss": 0.039,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 195403493.0,
|
|
"step": 236
|
|
},
|
|
{
|
|
"entropy": 0.5176010131835938,
|
|
"epoch": 2.6931818181818183,
|
|
"grad_norm": 4.811211184785151,
|
|
"learning_rate": 3.1432612085990576e-06,
|
|
"loss": 0.0585,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 196225816.0,
|
|
"step": 237
|
|
},
|
|
{
|
|
"entropy": 0.517425537109375,
|
|
"epoch": 2.7045454545454546,
|
|
"grad_norm": 6.026414758455728,
|
|
"learning_rate": 3.1280998765478725e-06,
|
|
"loss": 0.0449,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 197066021.0,
|
|
"step": 238
|
|
},
|
|
{
|
|
"entropy": 0.519012451171875,
|
|
"epoch": 2.715909090909091,
|
|
"grad_norm": 5.323494774872785,
|
|
"learning_rate": 3.1129138470771823e-06,
|
|
"loss": 0.0466,
|
|
"mean_token_accuracy": 0.9830729176755995,
|
|
"num_tokens": 197881192.0,
|
|
"step": 239
|
|
},
|
|
{
|
|
"entropy": 0.5157241821289062,
|
|
"epoch": 2.7272727272727275,
|
|
"grad_norm": 3.7572493546476897,
|
|
"learning_rate": 3.0977037173145387e-06,
|
|
"loss": 0.0351,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 198696942.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 0.5295562744140625,
|
|
"epoch": 2.7386363636363638,
|
|
"grad_norm": 4.351563113236933,
|
|
"learning_rate": 3.082470085335133e-06,
|
|
"loss": 0.0379,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 199506760.0,
|
|
"step": 241
|
|
},
|
|
{
|
|
"entropy": 0.5326080322265625,
|
|
"epoch": 2.75,
|
|
"grad_norm": 4.3434728185279114,
|
|
"learning_rate": 3.0672135501382894e-06,
|
|
"loss": 0.0426,
|
|
"mean_token_accuracy": 0.9817708344198763,
|
|
"num_tokens": 200314554.0,
|
|
"step": 242
|
|
},
|
|
{
|
|
"entropy": 0.5197906494140625,
|
|
"epoch": 2.7613636363636362,
|
|
"grad_norm": 4.23758686798832,
|
|
"learning_rate": 3.0519347116239e-06,
|
|
"loss": 0.0496,
|
|
"mean_token_accuracy": 0.9856770841870457,
|
|
"num_tokens": 201160682.0,
|
|
"step": 243
|
|
},
|
|
{
|
|
"entropy": 0.5222244262695312,
|
|
"epoch": 2.7727272727272725,
|
|
"grad_norm": 3.3925018109786977,
|
|
"learning_rate": 3.036634170568847e-06,
|
|
"loss": 0.0397,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 201970697.0,
|
|
"step": 244
|
|
},
|
|
{
|
|
"entropy": 0.5278549194335938,
|
|
"epoch": 2.784090909090909,
|
|
"grad_norm": 6.75368386794985,
|
|
"learning_rate": 3.021312528603371e-06,
|
|
"loss": 0.0616,
|
|
"mean_token_accuracy": 0.977864584652707,
|
|
"num_tokens": 202775841.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 0.5267410278320312,
|
|
"epoch": 2.7954545454545454,
|
|
"grad_norm": 3.0938038819070037,
|
|
"learning_rate": 3.0059703881874232e-06,
|
|
"loss": 0.0357,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 203595456.0,
|
|
"step": 246
|
|
},
|
|
{
|
|
"entropy": 0.518524169921875,
|
|
"epoch": 2.8068181818181817,
|
|
"grad_norm": 3.220031225997158,
|
|
"learning_rate": 2.990608352586965e-06,
|
|
"loss": 0.0279,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 204424629.0,
|
|
"step": 247
|
|
},
|
|
{
|
|
"entropy": 0.5166702270507812,
|
|
"epoch": 2.8181818181818183,
|
|
"grad_norm": 2.5096440894100414,
|
|
"learning_rate": 2.9752270258502593e-06,
|
|
"loss": 0.0275,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 205278442.0,
|
|
"step": 248
|
|
},
|
|
{
|
|
"entropy": 0.5141220092773438,
|
|
"epoch": 2.8295454545454546,
|
|
"grad_norm": 3.0289759848019924,
|
|
"learning_rate": 2.959827012784108e-06,
|
|
"loss": 0.0258,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 206118798.0,
|
|
"step": 249
|
|
},
|
|
{
|
|
"entropy": 0.5139236450195312,
|
|
"epoch": 2.840909090909091,
|
|
"grad_norm": 5.597031296469579,
|
|
"learning_rate": 2.9444089189300783e-06,
|
|
"loss": 0.0384,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 206937788.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 0.513916015625,
|
|
"epoch": 2.8522727272727275,
|
|
"grad_norm": 6.324459117331001,
|
|
"learning_rate": 2.92897335054069e-06,
|
|
"loss": 0.0346,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 207777954.0,
|
|
"step": 251
|
|
},
|
|
{
|
|
"entropy": 0.5144729614257812,
|
|
"epoch": 2.8636363636363638,
|
|
"grad_norm": 10.825936923114694,
|
|
"learning_rate": 2.913520914555572e-06,
|
|
"loss": 0.0537,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 208632133.0,
|
|
"step": 252
|
|
},
|
|
{
|
|
"entropy": 0.5220108032226562,
|
|
"epoch": 2.875,
|
|
"grad_norm": 8.746301873134408,
|
|
"learning_rate": 2.8980522185776065e-06,
|
|
"loss": 0.0627,
|
|
"mean_token_accuracy": 0.977864584652707,
|
|
"num_tokens": 209471247.0,
|
|
"step": 253
|
|
},
|
|
{
|
|
"entropy": 0.5236358642578125,
|
|
"epoch": 2.8863636363636362,
|
|
"grad_norm": 1.998986620458919,
|
|
"learning_rate": 2.882567870849029e-06,
|
|
"loss": 0.0236,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 210282643.0,
|
|
"step": 254
|
|
},
|
|
{
|
|
"entropy": 0.5129623413085938,
|
|
"epoch": 2.8977272727272725,
|
|
"grad_norm": 4.1198700432367605,
|
|
"learning_rate": 2.8670684802275173e-06,
|
|
"loss": 0.0269,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 211126378.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 0.5134735107421875,
|
|
"epoch": 2.909090909090909,
|
|
"grad_norm": 5.348962259064157,
|
|
"learning_rate": 2.8515546561622464e-06,
|
|
"loss": 0.0302,
|
|
"mean_token_accuracy": 0.9843750009313226,
|
|
"num_tokens": 211953021.0,
|
|
"step": 256
|
|
},
|
|
{
|
|
"entropy": 0.5159378051757812,
|
|
"epoch": 2.9204545454545454,
|
|
"grad_norm": 5.307710488632053,
|
|
"learning_rate": 2.8360270086699274e-06,
|
|
"loss": 0.0365,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 212784194.0,
|
|
"step": 257
|
|
},
|
|
{
|
|
"entropy": 0.5262603759765625,
|
|
"epoch": 2.9318181818181817,
|
|
"grad_norm": 7.506532419063644,
|
|
"learning_rate": 2.820486148310822e-06,
|
|
"loss": 0.0421,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 213585134.0,
|
|
"step": 258
|
|
},
|
|
{
|
|
"entropy": 0.537200927734375,
|
|
"epoch": 2.9431818181818183,
|
|
"grad_norm": 3.3635404051746,
|
|
"learning_rate": 2.8049326861647303e-06,
|
|
"loss": 0.0233,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 214369055.0,
|
|
"step": 259
|
|
},
|
|
{
|
|
"entropy": 0.5199203491210938,
|
|
"epoch": 2.9545454545454546,
|
|
"grad_norm": 3.0956317736212777,
|
|
"learning_rate": 2.7893672338069666e-06,
|
|
"loss": 0.026,
|
|
"mean_token_accuracy": 0.9908854172099382,
|
|
"num_tokens": 215189129.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 0.5302810668945312,
|
|
"epoch": 2.965909090909091,
|
|
"grad_norm": 5.926197968230742,
|
|
"learning_rate": 2.7737904032843105e-06,
|
|
"loss": 0.0382,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 215982699.0,
|
|
"step": 261
|
|
},
|
|
{
|
|
"entropy": 0.5257034301757812,
|
|
"epoch": 2.9772727272727275,
|
|
"grad_norm": 4.940529587588766,
|
|
"learning_rate": 2.7582028070909415e-06,
|
|
"loss": 0.0226,
|
|
"mean_token_accuracy": 0.9908854172099382,
|
|
"num_tokens": 216773800.0,
|
|
"step": 262
|
|
},
|
|
{
|
|
"entropy": 0.5219879150390625,
|
|
"epoch": 2.9886363636363638,
|
|
"grad_norm": 3.317011621794809,
|
|
"learning_rate": 2.742605058144352e-06,
|
|
"loss": 0.024,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 217577013.0,
|
|
"step": 263
|
|
},
|
|
{
|
|
"entropy": 0.5041275024414062,
|
|
"epoch": 3.0,
|
|
"grad_norm": 4.375219675071472,
|
|
"learning_rate": 2.7269977697612515e-06,
|
|
"loss": 0.0336,
|
|
"mean_token_accuracy": 0.9908854172099382,
|
|
"num_tokens": 218445070.0,
|
|
"step": 264
|
|
},
|
|
{
|
|
"entropy": 0.5074462890625,
|
|
"epoch": 3.0113636363636362,
|
|
"grad_norm": 5.638061777523044,
|
|
"learning_rate": 2.7113815556334478e-06,
|
|
"loss": 0.0541,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 219295636.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 0.5047531127929688,
|
|
"epoch": 3.022727272727273,
|
|
"grad_norm": 7.001969084082727,
|
|
"learning_rate": 2.6957570298037156e-06,
|
|
"loss": 0.0236,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 220144284.0,
|
|
"step": 266
|
|
},
|
|
{
|
|
"entropy": 0.5129165649414062,
|
|
"epoch": 3.034090909090909,
|
|
"grad_norm": 5.988487899098891,
|
|
"learning_rate": 2.680124806641654e-06,
|
|
"loss": 0.0352,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 220981512.0,
|
|
"step": 267
|
|
},
|
|
{
|
|
"entropy": 0.5201339721679688,
|
|
"epoch": 3.0454545454545454,
|
|
"grad_norm": 5.136716069392655,
|
|
"learning_rate": 2.664485500819527e-06,
|
|
"loss": 0.0311,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 221793779.0,
|
|
"step": 268
|
|
},
|
|
{
|
|
"entropy": 0.5125503540039062,
|
|
"epoch": 3.0568181818181817,
|
|
"grad_norm": 5.2949927163413895,
|
|
"learning_rate": 2.6488397272880943e-06,
|
|
"loss": 0.0365,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 222639289.0,
|
|
"step": 269
|
|
},
|
|
{
|
|
"entropy": 0.5237808227539062,
|
|
"epoch": 3.0681818181818183,
|
|
"grad_norm": 4.3434336811837335,
|
|
"learning_rate": 2.633188101252433e-06,
|
|
"loss": 0.0431,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 223441006.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 0.5213165283203125,
|
|
"epoch": 3.0795454545454546,
|
|
"grad_norm": 3.3532803929089723,
|
|
"learning_rate": 2.617531238147744e-06,
|
|
"loss": 0.0167,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 224253520.0,
|
|
"step": 271
|
|
},
|
|
{
|
|
"entropy": 0.5045852661132812,
|
|
"epoch": 3.090909090909091,
|
|
"grad_norm": 9.04867592578753,
|
|
"learning_rate": 2.6018697536151554e-06,
|
|
"loss": 0.034,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 225092756.0,
|
|
"step": 272
|
|
},
|
|
{
|
|
"entropy": 0.5112762451171875,
|
|
"epoch": 3.102272727272727,
|
|
"grad_norm": 6.089856114683977,
|
|
"learning_rate": 2.5862042634775125e-06,
|
|
"loss": 0.0266,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 225914161.0,
|
|
"step": 273
|
|
},
|
|
{
|
|
"entropy": 0.5180816650390625,
|
|
"epoch": 3.1136363636363638,
|
|
"grad_norm": 2.670172765084978,
|
|
"learning_rate": 2.5705353837151655e-06,
|
|
"loss": 0.0147,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 226727320.0,
|
|
"step": 274
|
|
},
|
|
{
|
|
"entropy": 0.5142440795898438,
|
|
"epoch": 3.125,
|
|
"grad_norm": 4.087182451197898,
|
|
"learning_rate": 2.554863730441748e-06,
|
|
"loss": 0.0319,
|
|
"mean_token_accuracy": 0.9908854172099382,
|
|
"num_tokens": 227543923.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 0.5253143310546875,
|
|
"epoch": 3.1363636363636362,
|
|
"grad_norm": 3.238450710346099,
|
|
"learning_rate": 2.5391899198799475e-06,
|
|
"loss": 0.018,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 228354592.0,
|
|
"step": 276
|
|
},
|
|
{
|
|
"entropy": 0.5085906982421875,
|
|
"epoch": 3.147727272727273,
|
|
"grad_norm": 4.310933841924269,
|
|
"learning_rate": 2.5235145683372813e-06,
|
|
"loss": 0.03,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 229189324.0,
|
|
"step": 277
|
|
},
|
|
{
|
|
"entropy": 0.5243377685546875,
|
|
"epoch": 3.159090909090909,
|
|
"grad_norm": 5.982970780385657,
|
|
"learning_rate": 2.507838292181858e-06,
|
|
"loss": 0.0273,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 229990721.0,
|
|
"step": 278
|
|
},
|
|
{
|
|
"entropy": 0.5168304443359375,
|
|
"epoch": 3.1704545454545454,
|
|
"grad_norm": 5.044799482329287,
|
|
"learning_rate": 2.4921617078181425e-06,
|
|
"loss": 0.0214,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 230813428.0,
|
|
"step": 279
|
|
},
|
|
{
|
|
"entropy": 0.5148696899414062,
|
|
"epoch": 3.1818181818181817,
|
|
"grad_norm": 4.358921357863228,
|
|
"learning_rate": 2.47648543166272e-06,
|
|
"loss": 0.0365,
|
|
"mean_token_accuracy": 0.9869791674427688,
|
|
"num_tokens": 231638664.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 0.5061111450195312,
|
|
"epoch": 3.1931818181818183,
|
|
"grad_norm": 3.3414807488803993,
|
|
"learning_rate": 2.4608100801200533e-06,
|
|
"loss": 0.0167,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 232502400.0,
|
|
"step": 281
|
|
},
|
|
{
|
|
"entropy": 0.5233993530273438,
|
|
"epoch": 3.2045454545454546,
|
|
"grad_norm": 2.5835308691930488,
|
|
"learning_rate": 2.445136269558254e-06,
|
|
"loss": 0.019,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 233292296.0,
|
|
"step": 282
|
|
},
|
|
{
|
|
"entropy": 0.5108566284179688,
|
|
"epoch": 3.215909090909091,
|
|
"grad_norm": 3.063861729689834,
|
|
"learning_rate": 2.4294646162848353e-06,
|
|
"loss": 0.0224,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 234121658.0,
|
|
"step": 283
|
|
},
|
|
{
|
|
"entropy": 0.5061569213867188,
|
|
"epoch": 3.227272727272727,
|
|
"grad_norm": 4.3710805919644935,
|
|
"learning_rate": 2.413795736522489e-06,
|
|
"loss": 0.0199,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 234953163.0,
|
|
"step": 284
|
|
},
|
|
{
|
|
"entropy": 0.49341583251953125,
|
|
"epoch": 3.2386363636363638,
|
|
"grad_norm": 5.3087603859535974,
|
|
"learning_rate": 2.3981302463848454e-06,
|
|
"loss": 0.0206,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 235818597.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 0.518768310546875,
|
|
"epoch": 3.25,
|
|
"grad_norm": 4.697159032722577,
|
|
"learning_rate": 2.3824687618522567e-06,
|
|
"loss": 0.0285,
|
|
"mean_token_accuracy": 0.9908854172099382,
|
|
"num_tokens": 236602971.0,
|
|
"step": 286
|
|
},
|
|
{
|
|
"entropy": 0.5102920532226562,
|
|
"epoch": 3.2613636363636362,
|
|
"grad_norm": 2.1053875542266964,
|
|
"learning_rate": 2.366811898747568e-06,
|
|
"loss": 0.0155,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 237421849.0,
|
|
"step": 287
|
|
},
|
|
{
|
|
"entropy": 0.5016021728515625,
|
|
"epoch": 3.2727272727272725,
|
|
"grad_norm": 5.007825413263214,
|
|
"learning_rate": 2.351160272711907e-06,
|
|
"loss": 0.03,
|
|
"mean_token_accuracy": 0.9908854172099382,
|
|
"num_tokens": 238264490.0,
|
|
"step": 288
|
|
},
|
|
{
|
|
"entropy": 0.5040969848632812,
|
|
"epoch": 3.284090909090909,
|
|
"grad_norm": 4.54185116627051,
|
|
"learning_rate": 2.3355144991804736e-06,
|
|
"loss": 0.0249,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 239083537.0,
|
|
"step": 289
|
|
},
|
|
{
|
|
"entropy": 0.5108413696289062,
|
|
"epoch": 3.2954545454545454,
|
|
"grad_norm": 3.6101103237544443,
|
|
"learning_rate": 2.3198751933583463e-06,
|
|
"loss": 0.0175,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 239887927.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 0.5096359252929688,
|
|
"epoch": 3.3068181818181817,
|
|
"grad_norm": 5.673455850680201,
|
|
"learning_rate": 2.304242970196285e-06,
|
|
"loss": 0.0176,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 240692045.0,
|
|
"step": 291
|
|
},
|
|
{
|
|
"entropy": 0.5037689208984375,
|
|
"epoch": 3.3181818181818183,
|
|
"grad_norm": 4.418027168086691,
|
|
"learning_rate": 2.2886184443665522e-06,
|
|
"loss": 0.0154,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 241490985.0,
|
|
"step": 292
|
|
},
|
|
{
|
|
"entropy": 0.5098648071289062,
|
|
"epoch": 3.3295454545454546,
|
|
"grad_norm": 4.29411735744527,
|
|
"learning_rate": 2.2730022302387493e-06,
|
|
"loss": 0.0237,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 242287672.0,
|
|
"step": 293
|
|
},
|
|
{
|
|
"entropy": 0.49761199951171875,
|
|
"epoch": 3.340909090909091,
|
|
"grad_norm": 2.6648692660994255,
|
|
"learning_rate": 2.257394941855648e-06,
|
|
"loss": 0.0121,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 243126330.0,
|
|
"step": 294
|
|
},
|
|
{
|
|
"entropy": 0.5023422241210938,
|
|
"epoch": 3.3522727272727275,
|
|
"grad_norm": 6.819480101828647,
|
|
"learning_rate": 2.2417971929090593e-06,
|
|
"loss": 0.0408,
|
|
"mean_token_accuracy": 0.989583333954215,
|
|
"num_tokens": 243934709.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 0.49542236328125,
|
|
"epoch": 3.3636363636363638,
|
|
"grad_norm": 4.254807585399875,
|
|
"learning_rate": 2.2262095967156895e-06,
|
|
"loss": 0.0236,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 244774180.0,
|
|
"step": 296
|
|
},
|
|
{
|
|
"entropy": 0.5053787231445312,
|
|
"epoch": 3.375,
|
|
"grad_norm": 3.7942834183716627,
|
|
"learning_rate": 2.2106327661930343e-06,
|
|
"loss": 0.013,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 245590040.0,
|
|
"step": 297
|
|
},
|
|
{
|
|
"entropy": 0.49605560302734375,
|
|
"epoch": 3.3863636363636362,
|
|
"grad_norm": 2.942700286159109,
|
|
"learning_rate": 2.19506731383527e-06,
|
|
"loss": 0.0181,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 246440516.0,
|
|
"step": 298
|
|
},
|
|
{
|
|
"entropy": 0.5043182373046875,
|
|
"epoch": 3.3977272727272725,
|
|
"grad_norm": 3.503113960805014,
|
|
"learning_rate": 2.1795138516891786e-06,
|
|
"loss": 0.0184,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 247249584.0,
|
|
"step": 299
|
|
},
|
|
{
|
|
"entropy": 0.5015716552734375,
|
|
"epoch": 3.409090909090909,
|
|
"grad_norm": 5.5439080197236095,
|
|
"learning_rate": 2.163972991330073e-06,
|
|
"loss": 0.0132,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 248078340.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 0.49102783203125,
|
|
"epoch": 3.4204545454545454,
|
|
"grad_norm": 5.088980442111073,
|
|
"learning_rate": 2.148445343837755e-06,
|
|
"loss": 0.0129,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 248927712.0,
|
|
"step": 301
|
|
},
|
|
{
|
|
"entropy": 0.49953460693359375,
|
|
"epoch": 3.4318181818181817,
|
|
"grad_norm": 5.204176680601301,
|
|
"learning_rate": 2.1329315197724835e-06,
|
|
"loss": 0.0273,
|
|
"mean_token_accuracy": 0.9908854172099382,
|
|
"num_tokens": 249746453.0,
|
|
"step": 302
|
|
},
|
|
{
|
|
"entropy": 0.49965667724609375,
|
|
"epoch": 3.4431818181818183,
|
|
"grad_norm": 5.045844298995128,
|
|
"learning_rate": 2.1174321291509716e-06,
|
|
"loss": 0.023,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 250589656.0,
|
|
"step": 303
|
|
},
|
|
{
|
|
"entropy": 0.48998260498046875,
|
|
"epoch": 3.4545454545454546,
|
|
"grad_norm": 4.046888074688225,
|
|
"learning_rate": 2.1019477814223943e-06,
|
|
"loss": 0.0121,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 251475034.0,
|
|
"step": 304
|
|
},
|
|
{
|
|
"entropy": 0.5137939453125,
|
|
"epoch": 3.465909090909091,
|
|
"grad_norm": 5.163377258776933,
|
|
"learning_rate": 2.086479085444429e-06,
|
|
"loss": 0.0384,
|
|
"mean_token_accuracy": 0.9882812506984919,
|
|
"num_tokens": 252281615.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 0.498199462890625,
|
|
"epoch": 3.4772727272727275,
|
|
"grad_norm": 5.648873679939671,
|
|
"learning_rate": 2.071026649459311e-06,
|
|
"loss": 0.015,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 253112081.0,
|
|
"step": 306
|
|
},
|
|
{
|
|
"entropy": 0.5233306884765625,
|
|
"epoch": 3.4886363636363638,
|
|
"grad_norm": 3.589697703897856,
|
|
"learning_rate": 2.055591081069922e-06,
|
|
"loss": 0.0101,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 253899681.0,
|
|
"step": 307
|
|
},
|
|
{
|
|
"entropy": 0.5106887817382812,
|
|
"epoch": 3.5,
|
|
"grad_norm": 5.074293050610865,
|
|
"learning_rate": 2.040172987215893e-06,
|
|
"loss": 0.0127,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 254708112.0,
|
|
"step": 308
|
|
},
|
|
{
|
|
"entropy": 0.495269775390625,
|
|
"epoch": 3.5113636363636362,
|
|
"grad_norm": 3.675745737048211,
|
|
"learning_rate": 2.024772974149741e-06,
|
|
"loss": 0.0125,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 255563485.0,
|
|
"step": 309
|
|
},
|
|
{
|
|
"entropy": 0.5013885498046875,
|
|
"epoch": 3.5227272727272725,
|
|
"grad_norm": 4.817798054202063,
|
|
"learning_rate": 2.0093916474130354e-06,
|
|
"loss": 0.0203,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 256413263.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 0.5038299560546875,
|
|
"epoch": 3.534090909090909,
|
|
"grad_norm": 3.1418020211628033,
|
|
"learning_rate": 1.9940296118125776e-06,
|
|
"loss": 0.0116,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 257245420.0,
|
|
"step": 311
|
|
},
|
|
{
|
|
"entropy": 0.5105819702148438,
|
|
"epoch": 3.5454545454545454,
|
|
"grad_norm": 3.456624325090506,
|
|
"learning_rate": 1.9786874713966293e-06,
|
|
"loss": 0.0143,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 258077483.0,
|
|
"step": 312
|
|
},
|
|
{
|
|
"entropy": 0.5098724365234375,
|
|
"epoch": 3.5568181818181817,
|
|
"grad_norm": 3.4342539832370003,
|
|
"learning_rate": 1.9633658294311535e-06,
|
|
"loss": 0.0101,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 258880273.0,
|
|
"step": 313
|
|
},
|
|
{
|
|
"entropy": 0.4929656982421875,
|
|
"epoch": 3.5681818181818183,
|
|
"grad_norm": 3.764505067295885,
|
|
"learning_rate": 1.9480652883761007e-06,
|
|
"loss": 0.0197,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 259750176.0,
|
|
"step": 314
|
|
},
|
|
{
|
|
"entropy": 0.516937255859375,
|
|
"epoch": 3.5795454545454546,
|
|
"grad_norm": 2.561449184590668,
|
|
"learning_rate": 1.9327864498617114e-06,
|
|
"loss": 0.0157,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 260566530.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 0.5121307373046875,
|
|
"epoch": 3.590909090909091,
|
|
"grad_norm": 1.0671362172663923,
|
|
"learning_rate": 1.9175299146648672e-06,
|
|
"loss": 0.0053,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 261367907.0,
|
|
"step": 316
|
|
},
|
|
{
|
|
"entropy": 0.5068359375,
|
|
"epoch": 3.6022727272727275,
|
|
"grad_norm": 3.381064113252212,
|
|
"learning_rate": 1.9022962826854619e-06,
|
|
"loss": 0.019,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 262183663.0,
|
|
"step": 317
|
|
},
|
|
{
|
|
"entropy": 0.514404296875,
|
|
"epoch": 3.6136363636363638,
|
|
"grad_norm": 3.890719744287091,
|
|
"learning_rate": 1.887086152922818e-06,
|
|
"loss": 0.0305,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 262991003.0,
|
|
"step": 318
|
|
},
|
|
{
|
|
"entropy": 0.497894287109375,
|
|
"epoch": 3.625,
|
|
"grad_norm": 4.459677614690958,
|
|
"learning_rate": 1.8719001234521283e-06,
|
|
"loss": 0.0158,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 263838798.0,
|
|
"step": 319
|
|
},
|
|
{
|
|
"entropy": 0.5207977294921875,
|
|
"epoch": 3.6363636363636362,
|
|
"grad_norm": 3.3095750180357797,
|
|
"learning_rate": 1.8567387914009432e-06,
|
|
"loss": 0.0268,
|
|
"mean_token_accuracy": 0.9908854172099382,
|
|
"num_tokens": 264623780.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 0.4974212646484375,
|
|
"epoch": 3.6477272727272725,
|
|
"grad_norm": 4.065611989330324,
|
|
"learning_rate": 1.8416027529256885e-06,
|
|
"loss": 0.0203,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 265460612.0,
|
|
"step": 321
|
|
},
|
|
{
|
|
"entropy": 0.49669647216796875,
|
|
"epoch": 3.659090909090909,
|
|
"grad_norm": 2.750458063856211,
|
|
"learning_rate": 1.8264926031882274e-06,
|
|
"loss": 0.0133,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 266306607.0,
|
|
"step": 322
|
|
},
|
|
{
|
|
"entropy": 0.497406005859375,
|
|
"epoch": 3.6704545454545454,
|
|
"grad_norm": 2.8131906720041067,
|
|
"learning_rate": 1.8114089363324525e-06,
|
|
"loss": 0.0111,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 267163529.0,
|
|
"step": 323
|
|
},
|
|
{
|
|
"entropy": 0.5118255615234375,
|
|
"epoch": 3.6818181818181817,
|
|
"grad_norm": 5.003100497642272,
|
|
"learning_rate": 1.7963523454609317e-06,
|
|
"loss": 0.0145,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 267949078.0,
|
|
"step": 324
|
|
},
|
|
{
|
|
"entropy": 0.48583984375,
|
|
"epoch": 3.6931818181818183,
|
|
"grad_norm": 2.0483918553702356,
|
|
"learning_rate": 1.7813234226115767e-06,
|
|
"loss": 0.006,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 268835407.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 0.499237060546875,
|
|
"epoch": 3.7045454545454546,
|
|
"grad_norm": 2.86400752347896,
|
|
"learning_rate": 1.766322758734374e-06,
|
|
"loss": 0.0085,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 269641194.0,
|
|
"step": 326
|
|
},
|
|
{
|
|
"entropy": 0.50311279296875,
|
|
"epoch": 3.715909090909091,
|
|
"grad_norm": 4.259573160972011,
|
|
"learning_rate": 1.75135094366814e-06,
|
|
"loss": 0.0268,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 270450463.0,
|
|
"step": 327
|
|
},
|
|
{
|
|
"entropy": 0.499053955078125,
|
|
"epoch": 3.7272727272727275,
|
|
"grad_norm": 3.4557767603144476,
|
|
"learning_rate": 1.7364085661173346e-06,
|
|
"loss": 0.0115,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 271255011.0,
|
|
"step": 328
|
|
},
|
|
{
|
|
"entropy": 0.4916534423828125,
|
|
"epoch": 3.7386363636363638,
|
|
"grad_norm": 3.049050929462369,
|
|
"learning_rate": 1.721496213628906e-06,
|
|
"loss": 0.0187,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 272118194.0,
|
|
"step": 329
|
|
},
|
|
{
|
|
"entropy": 0.4965972900390625,
|
|
"epoch": 3.75,
|
|
"grad_norm": 3.2555463571102043,
|
|
"learning_rate": 1.7066144725691933e-06,
|
|
"loss": 0.0227,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 272929742.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 0.52386474609375,
|
|
"epoch": 3.7613636363636362,
|
|
"grad_norm": 4.093269827080145,
|
|
"learning_rate": 1.6917639281008703e-06,
|
|
"loss": 0.0126,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 273675439.0,
|
|
"step": 331
|
|
},
|
|
{
|
|
"entropy": 0.5012054443359375,
|
|
"epoch": 3.7727272727272725,
|
|
"grad_norm": 1.3792964976746966,
|
|
"learning_rate": 1.6769451641599305e-06,
|
|
"loss": 0.0048,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 274521549.0,
|
|
"step": 332
|
|
},
|
|
{
|
|
"entropy": 0.5058517456054688,
|
|
"epoch": 3.784090909090909,
|
|
"grad_norm": 3.589443939404163,
|
|
"learning_rate": 1.6621587634327328e-06,
|
|
"loss": 0.0217,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 275329538.0,
|
|
"step": 333
|
|
},
|
|
{
|
|
"entropy": 0.5105743408203125,
|
|
"epoch": 3.7954545454545454,
|
|
"grad_norm": 3.9777266239767712,
|
|
"learning_rate": 1.647405307333085e-06,
|
|
"loss": 0.0091,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 276142960.0,
|
|
"step": 334
|
|
},
|
|
{
|
|
"entropy": 0.5018463134765625,
|
|
"epoch": 3.8068181818181817,
|
|
"grad_norm": 3.3505185228236614,
|
|
"learning_rate": 1.6326853759793878e-06,
|
|
"loss": 0.0138,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 276958754.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 0.50335693359375,
|
|
"epoch": 3.8181818181818183,
|
|
"grad_norm": 2.099970477822091,
|
|
"learning_rate": 1.6179995481718165e-06,
|
|
"loss": 0.0128,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 277760913.0,
|
|
"step": 336
|
|
},
|
|
{
|
|
"entropy": 0.4929351806640625,
|
|
"epoch": 3.8295454545454546,
|
|
"grad_norm": 2.610807645269833,
|
|
"learning_rate": 1.6033484013695688e-06,
|
|
"loss": 0.0087,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 278576783.0,
|
|
"step": 337
|
|
},
|
|
{
|
|
"entropy": 0.49352264404296875,
|
|
"epoch": 3.840909090909091,
|
|
"grad_norm": 3.847350242670301,
|
|
"learning_rate": 1.588732511668153e-06,
|
|
"loss": 0.0082,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 279418488.0,
|
|
"step": 338
|
|
},
|
|
{
|
|
"entropy": 0.49383544921875,
|
|
"epoch": 3.8522727272727275,
|
|
"grad_norm": 4.737949727080085,
|
|
"learning_rate": 1.5741524537767427e-06,
|
|
"loss": 0.0108,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 280251812.0,
|
|
"step": 339
|
|
},
|
|
{
|
|
"entropy": 0.495880126953125,
|
|
"epoch": 3.8636363636363638,
|
|
"grad_norm": 2.54135506307979,
|
|
"learning_rate": 1.5596088009955695e-06,
|
|
"loss": 0.0066,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 281080773.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 0.49477386474609375,
|
|
"epoch": 3.875,
|
|
"grad_norm": 2.7265304592667223,
|
|
"learning_rate": 1.5451021251933895e-06,
|
|
"loss": 0.0061,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 281904123.0,
|
|
"step": 341
|
|
},
|
|
{
|
|
"entropy": 0.4793701171875,
|
|
"epoch": 3.8863636363636362,
|
|
"grad_norm": 1.7759375021299968,
|
|
"learning_rate": 1.5306329967849887e-06,
|
|
"loss": 0.0116,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 282775309.0,
|
|
"step": 342
|
|
},
|
|
{
|
|
"entropy": 0.4952545166015625,
|
|
"epoch": 3.8977272727272725,
|
|
"grad_norm": 7.042229803217046,
|
|
"learning_rate": 1.5162019847087616e-06,
|
|
"loss": 0.0087,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 283606678.0,
|
|
"step": 343
|
|
},
|
|
{
|
|
"entropy": 0.488067626953125,
|
|
"epoch": 3.909090909090909,
|
|
"grad_norm": 2.4721993122123305,
|
|
"learning_rate": 1.5018096564043333e-06,
|
|
"loss": 0.0052,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 284454938.0,
|
|
"step": 344
|
|
},
|
|
{
|
|
"entropy": 0.48111724853515625,
|
|
"epoch": 3.9204545454545454,
|
|
"grad_norm": 2.7747076531230332,
|
|
"learning_rate": 1.4874565777902518e-06,
|
|
"loss": 0.0075,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 285309488.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 0.5005111694335938,
|
|
"epoch": 3.9318181818181817,
|
|
"grad_norm": 6.503072352593269,
|
|
"learning_rate": 1.4731433132417316e-06,
|
|
"loss": 0.0168,
|
|
"mean_token_accuracy": 0.9908854172099382,
|
|
"num_tokens": 286135129.0,
|
|
"step": 346
|
|
},
|
|
{
|
|
"entropy": 0.49593353271484375,
|
|
"epoch": 3.9431818181818183,
|
|
"grad_norm": 2.587049362174535,
|
|
"learning_rate": 1.4588704255684697e-06,
|
|
"loss": 0.0174,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 286958859.0,
|
|
"step": 347
|
|
},
|
|
{
|
|
"entropy": 0.48793792724609375,
|
|
"epoch": 3.9545454545454546,
|
|
"grad_norm": 6.747296911451112,
|
|
"learning_rate": 1.4446384759925024e-06,
|
|
"loss": 0.0149,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 287805002.0,
|
|
"step": 348
|
|
},
|
|
{
|
|
"entropy": 0.5005645751953125,
|
|
"epoch": 3.965909090909091,
|
|
"grad_norm": 4.205331165935632,
|
|
"learning_rate": 1.4304480241261529e-06,
|
|
"loss": 0.011,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 288642537.0,
|
|
"step": 349
|
|
},
|
|
{
|
|
"entropy": 0.48992156982421875,
|
|
"epoch": 3.9772727272727275,
|
|
"grad_norm": 4.787823771311458,
|
|
"learning_rate": 1.4162996279500158e-06,
|
|
"loss": 0.0297,
|
|
"mean_token_accuracy": 0.9921875004656613,
|
|
"num_tokens": 289472032.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 0.505859375,
|
|
"epoch": 3.9886363636363638,
|
|
"grad_norm": 5.574571691374813,
|
|
"learning_rate": 1.4021938437910181e-06,
|
|
"loss": 0.0143,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 290273198.0,
|
|
"step": 351
|
|
},
|
|
{
|
|
"entropy": 0.47916412353515625,
|
|
"epoch": 4.0,
|
|
"grad_norm": 3.0588352539730232,
|
|
"learning_rate": 1.388131226300552e-06,
|
|
"loss": 0.0081,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 291144353.0,
|
|
"step": 352
|
|
},
|
|
{
|
|
"entropy": 0.4926605224609375,
|
|
"epoch": 4.011363636363637,
|
|
"grad_norm": 3.025749527554892,
|
|
"learning_rate": 1.374112328432652e-06,
|
|
"loss": 0.014,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 291972475.0,
|
|
"step": 353
|
|
},
|
|
{
|
|
"entropy": 0.4936981201171875,
|
|
"epoch": 4.0227272727272725,
|
|
"grad_norm": 1.7451625346318853,
|
|
"learning_rate": 1.3601377014222688e-06,
|
|
"loss": 0.0153,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 292787195.0,
|
|
"step": 354
|
|
},
|
|
{
|
|
"entropy": 0.5028610229492188,
|
|
"epoch": 4.034090909090909,
|
|
"grad_norm": 2.9232824281274037,
|
|
"learning_rate": 1.3462078947635781e-06,
|
|
"loss": 0.01,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 293578363.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 0.5084609985351562,
|
|
"epoch": 4.045454545454546,
|
|
"grad_norm": 4.536859602349459,
|
|
"learning_rate": 1.3323234561883847e-06,
|
|
"loss": 0.0102,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 294378786.0,
|
|
"step": 356
|
|
},
|
|
{
|
|
"entropy": 0.49819183349609375,
|
|
"epoch": 4.056818181818182,
|
|
"grad_norm": 3.746680387955585,
|
|
"learning_rate": 1.318484931644582e-06,
|
|
"loss": 0.0071,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 295215438.0,
|
|
"step": 357
|
|
},
|
|
{
|
|
"entropy": 0.5009613037109375,
|
|
"epoch": 4.068181818181818,
|
|
"grad_norm": 3.712740822459945,
|
|
"learning_rate": 1.3046928652746833e-06,
|
|
"loss": 0.0172,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 296029918.0,
|
|
"step": 358
|
|
},
|
|
{
|
|
"entropy": 0.4818267822265625,
|
|
"epoch": 4.079545454545454,
|
|
"grad_norm": 2.8940045765908753,
|
|
"learning_rate": 1.2909477993944286e-06,
|
|
"loss": 0.0066,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 296891697.0,
|
|
"step": 359
|
|
},
|
|
{
|
|
"entropy": 0.490875244140625,
|
|
"epoch": 4.090909090909091,
|
|
"grad_norm": 2.347888529432484,
|
|
"learning_rate": 1.2772502744714592e-06,
|
|
"loss": 0.014,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 297738951.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 0.4999847412109375,
|
|
"epoch": 4.1022727272727275,
|
|
"grad_norm": 3.5228155116652844,
|
|
"learning_rate": 1.2636008291040618e-06,
|
|
"loss": 0.013,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 298593160.0,
|
|
"step": 361
|
|
},
|
|
{
|
|
"entropy": 0.49713134765625,
|
|
"epoch": 4.113636363636363,
|
|
"grad_norm": 3.446218217523892,
|
|
"learning_rate": 1.2500000000000007e-06,
|
|
"loss": 0.0141,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 299423053.0,
|
|
"step": 362
|
|
},
|
|
{
|
|
"entropy": 0.49144744873046875,
|
|
"epoch": 4.125,
|
|
"grad_norm": 3.0365054145039037,
|
|
"learning_rate": 1.236448321955401e-06,
|
|
"loss": 0.0086,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 300256502.0,
|
|
"step": 363
|
|
},
|
|
{
|
|
"entropy": 0.5076980590820312,
|
|
"epoch": 4.136363636363637,
|
|
"grad_norm": 0.9932804984104635,
|
|
"learning_rate": 1.222946327833731e-06,
|
|
"loss": 0.0034,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 301066005.0,
|
|
"step": 364
|
|
},
|
|
{
|
|
"entropy": 0.48675537109375,
|
|
"epoch": 4.1477272727272725,
|
|
"grad_norm": 2.3517119546250465,
|
|
"learning_rate": 1.2094945485448424e-06,
|
|
"loss": 0.0071,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 301919771.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 0.5002212524414062,
|
|
"epoch": 4.159090909090909,
|
|
"grad_norm": 0.9715596419931048,
|
|
"learning_rate": 1.196093513024099e-06,
|
|
"loss": 0.0046,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 302745145.0,
|
|
"step": 366
|
|
},
|
|
{
|
|
"entropy": 0.4864044189453125,
|
|
"epoch": 4.170454545454546,
|
|
"grad_norm": 3.7968011078794586,
|
|
"learning_rate": 1.182743748211576e-06,
|
|
"loss": 0.0078,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 303616597.0,
|
|
"step": 367
|
|
},
|
|
{
|
|
"entropy": 0.49373626708984375,
|
|
"epoch": 4.181818181818182,
|
|
"grad_norm": 2.874821117414963,
|
|
"learning_rate": 1.1694457790313403e-06,
|
|
"loss": 0.02,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 304444351.0,
|
|
"step": 368
|
|
},
|
|
{
|
|
"entropy": 0.486968994140625,
|
|
"epoch": 4.193181818181818,
|
|
"grad_norm": 2.074445350579162,
|
|
"learning_rate": 1.15620012837081e-06,
|
|
"loss": 0.0041,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 305286990.0,
|
|
"step": 369
|
|
},
|
|
{
|
|
"entropy": 0.49383544921875,
|
|
"epoch": 4.204545454545454,
|
|
"grad_norm": 1.9418684032272442,
|
|
"learning_rate": 1.1430073170601968e-06,
|
|
"loss": 0.0058,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 306109641.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 0.507232666015625,
|
|
"epoch": 4.215909090909091,
|
|
"grad_norm": 4.069513556231147,
|
|
"learning_rate": 1.1298678638520247e-06,
|
|
"loss": 0.0053,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 306902232.0,
|
|
"step": 371
|
|
},
|
|
{
|
|
"entropy": 0.4990386962890625,
|
|
"epoch": 4.2272727272727275,
|
|
"grad_norm": 3.444347669415785,
|
|
"learning_rate": 1.1167822854007265e-06,
|
|
"loss": 0.0242,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 307719255.0,
|
|
"step": 372
|
|
},
|
|
{
|
|
"entropy": 0.480438232421875,
|
|
"epoch": 4.238636363636363,
|
|
"grad_norm": 0.6683414614854636,
|
|
"learning_rate": 1.1037510962423425e-06,
|
|
"loss": 0.0117,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 308575072.0,
|
|
"step": 373
|
|
},
|
|
{
|
|
"entropy": 0.484466552734375,
|
|
"epoch": 4.25,
|
|
"grad_norm": 6.59972703836071,
|
|
"learning_rate": 1.0907748087742716e-06,
|
|
"loss": 0.0168,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 309421914.0,
|
|
"step": 374
|
|
},
|
|
{
|
|
"entropy": 0.500274658203125,
|
|
"epoch": 4.261363636363637,
|
|
"grad_norm": 5.5296244355600015,
|
|
"learning_rate": 1.0778539332351374e-06,
|
|
"loss": 0.0066,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 310225102.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 0.48038482666015625,
|
|
"epoch": 4.2727272727272725,
|
|
"grad_norm": 2.7332507609875982,
|
|
"learning_rate": 1.0649889776847161e-06,
|
|
"loss": 0.0073,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 311092524.0,
|
|
"step": 376
|
|
},
|
|
{
|
|
"entropy": 0.49211883544921875,
|
|
"epoch": 4.284090909090909,
|
|
"grad_norm": 6.20627247496486,
|
|
"learning_rate": 1.0521804479839651e-06,
|
|
"loss": 0.0165,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 311920496.0,
|
|
"step": 377
|
|
},
|
|
{
|
|
"entropy": 0.5010910034179688,
|
|
"epoch": 4.295454545454546,
|
|
"grad_norm": 1.9825033722155454,
|
|
"learning_rate": 1.0394288477751274e-06,
|
|
"loss": 0.006,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 312734021.0,
|
|
"step": 378
|
|
},
|
|
{
|
|
"entropy": 0.49372100830078125,
|
|
"epoch": 4.306818181818182,
|
|
"grad_norm": 2.062597317498596,
|
|
"learning_rate": 1.0267346784619324e-06,
|
|
"loss": 0.0045,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 313570038.0,
|
|
"step": 379
|
|
},
|
|
{
|
|
"entropy": 0.4984283447265625,
|
|
"epoch": 4.318181818181818,
|
|
"grad_norm": 3.0372016073487567,
|
|
"learning_rate": 1.0140984391898744e-06,
|
|
"loss": 0.0048,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 314385328.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 0.4894866943359375,
|
|
"epoch": 4.329545454545454,
|
|
"grad_norm": 2.327251786892091,
|
|
"learning_rate": 1.0015206268265948e-06,
|
|
"loss": 0.0042,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 315206928.0,
|
|
"step": 381
|
|
},
|
|
{
|
|
"entropy": 0.49160003662109375,
|
|
"epoch": 4.340909090909091,
|
|
"grad_norm": 2.974267125288258,
|
|
"learning_rate": 9.890017359423326e-07,
|
|
"loss": 0.0038,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 316035622.0,
|
|
"step": 382
|
|
},
|
|
{
|
|
"entropy": 0.5143508911132812,
|
|
"epoch": 4.3522727272727275,
|
|
"grad_norm": 1.7357788012156292,
|
|
"learning_rate": 9.765422587904919e-07,
|
|
"loss": 0.0126,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 316811648.0,
|
|
"step": 383
|
|
},
|
|
{
|
|
"entropy": 0.4999237060546875,
|
|
"epoch": 4.363636363636363,
|
|
"grad_norm": 2.6366697025032337,
|
|
"learning_rate": 9.641426852882717e-07,
|
|
"loss": 0.0176,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 317623918.0,
|
|
"step": 384
|
|
},
|
|
{
|
|
"entropy": 0.495269775390625,
|
|
"epoch": 4.375,
|
|
"grad_norm": 2.501283842298434,
|
|
"learning_rate": 9.518035029974127e-07,
|
|
"loss": 0.0045,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 318428730.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 0.4707489013671875,
|
|
"epoch": 4.386363636363637,
|
|
"grad_norm": 1.0161739496619286,
|
|
"learning_rate": 9.395251971050206e-07,
|
|
"loss": 0.0032,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 319302783.0,
|
|
"step": 386
|
|
},
|
|
{
|
|
"entropy": 0.5036773681640625,
|
|
"epoch": 4.3977272727272725,
|
|
"grad_norm": 0.5414812981393075,
|
|
"learning_rate": 9.273082504044903e-07,
|
|
"loss": 0.0025,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 320101409.0,
|
|
"step": 387
|
|
},
|
|
{
|
|
"entropy": 0.4913787841796875,
|
|
"epoch": 4.409090909090909,
|
|
"grad_norm": 7.733854824346476,
|
|
"learning_rate": 9.151531432765204e-07,
|
|
"loss": 0.0102,
|
|
"mean_token_accuracy": 0.9934895837213844,
|
|
"num_tokens": 320913133.0,
|
|
"step": 388
|
|
},
|
|
{
|
|
"entropy": 0.47967529296875,
|
|
"epoch": 4.420454545454546,
|
|
"grad_norm": 2.3580133114107587,
|
|
"learning_rate": 9.030603536702254e-07,
|
|
"loss": 0.0039,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 321768392.0,
|
|
"step": 389
|
|
},
|
|
{
|
|
"entropy": 0.49045562744140625,
|
|
"epoch": 4.431818181818182,
|
|
"grad_norm": 4.210313755601456,
|
|
"learning_rate": 8.910303570843423e-07,
|
|
"loss": 0.0041,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 322599634.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 0.493011474609375,
|
|
"epoch": 4.443181818181818,
|
|
"grad_norm": 3.8913982883862963,
|
|
"learning_rate": 8.790636265485333e-07,
|
|
"loss": 0.0162,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 323395015.0,
|
|
"step": 391
|
|
},
|
|
{
|
|
"entropy": 0.495269775390625,
|
|
"epoch": 4.454545454545454,
|
|
"grad_norm": 0.346768689760395,
|
|
"learning_rate": 8.67160632604786e-07,
|
|
"loss": 0.0019,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 324210995.0,
|
|
"step": 392
|
|
},
|
|
{
|
|
"entropy": 0.49155426025390625,
|
|
"epoch": 4.465909090909091,
|
|
"grad_norm": 0.5274973505562733,
|
|
"learning_rate": 8.553218432889091e-07,
|
|
"loss": 0.002,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 325018202.0,
|
|
"step": 393
|
|
},
|
|
{
|
|
"entropy": 0.48851776123046875,
|
|
"epoch": 4.4772727272727275,
|
|
"grad_norm": 3.449091621793021,
|
|
"learning_rate": 8.435477241121354e-07,
|
|
"loss": 0.0036,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 325836067.0,
|
|
"step": 394
|
|
},
|
|
{
|
|
"entropy": 0.48577880859375,
|
|
"epoch": 4.488636363636363,
|
|
"grad_norm": 1.7526399488203852,
|
|
"learning_rate": 8.31838738042808e-07,
|
|
"loss": 0.0067,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 326648663.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 0.49868011474609375,
|
|
"epoch": 4.5,
|
|
"grad_norm": 3.8013694140306917,
|
|
"learning_rate": 8.201953454881844e-07,
|
|
"loss": 0.0074,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 327428989.0,
|
|
"step": 396
|
|
},
|
|
{
|
|
"entropy": 0.48467254638671875,
|
|
"epoch": 4.511363636363637,
|
|
"grad_norm": 10.511325463365447,
|
|
"learning_rate": 8.086180042763284e-07,
|
|
"loss": 0.0091,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 328263854.0,
|
|
"step": 397
|
|
},
|
|
{
|
|
"entropy": 0.4808349609375,
|
|
"epoch": 4.5227272727272725,
|
|
"grad_norm": 2.4305720746807515,
|
|
"learning_rate": 7.971071696381089e-07,
|
|
"loss": 0.0141,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 329087106.0,
|
|
"step": 398
|
|
},
|
|
{
|
|
"entropy": 0.497528076171875,
|
|
"epoch": 4.534090909090909,
|
|
"grad_norm": 0.675792729286102,
|
|
"learning_rate": 7.856632941893e-07,
|
|
"loss": 0.0023,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 329907669.0,
|
|
"step": 399
|
|
},
|
|
{
|
|
"entropy": 0.48723602294921875,
|
|
"epoch": 4.545454545454545,
|
|
"grad_norm": 2.4628320022302175,
|
|
"learning_rate": 7.74286827912785e-07,
|
|
"loss": 0.0031,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 330727069.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 0.471282958984375,
|
|
"epoch": 4.556818181818182,
|
|
"grad_norm": 5.265304674225523,
|
|
"learning_rate": 7.629782181408574e-07,
|
|
"loss": 0.0147,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 331590906.0,
|
|
"step": 401
|
|
},
|
|
{
|
|
"entropy": 0.46993255615234375,
|
|
"epoch": 4.568181818181818,
|
|
"grad_norm": 4.67378034474964,
|
|
"learning_rate": 7.517379095376418e-07,
|
|
"loss": 0.0048,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 332461949.0,
|
|
"step": 402
|
|
},
|
|
{
|
|
"entropy": 0.5018539428710938,
|
|
"epoch": 4.579545454545455,
|
|
"grad_norm": 2.09672088064596,
|
|
"learning_rate": 7.405663440815968e-07,
|
|
"loss": 0.0082,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 333263860.0,
|
|
"step": 403
|
|
},
|
|
{
|
|
"entropy": 0.48856353759765625,
|
|
"epoch": 4.590909090909091,
|
|
"grad_norm": 2.0511373755169506,
|
|
"learning_rate": 7.294639610481461e-07,
|
|
"loss": 0.0083,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 334055426.0,
|
|
"step": 404
|
|
},
|
|
{
|
|
"entropy": 0.47692108154296875,
|
|
"epoch": 4.6022727272727275,
|
|
"grad_norm": 1.5653117450616783,
|
|
"learning_rate": 7.184311969924002e-07,
|
|
"loss": 0.0024,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 334899503.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 0.47054290771484375,
|
|
"epoch": 4.613636363636363,
|
|
"grad_norm": 0.3087773860358458,
|
|
"learning_rate": 7.074684857319928e-07,
|
|
"loss": 0.0017,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 335767817.0,
|
|
"step": 406
|
|
},
|
|
{
|
|
"entropy": 0.479583740234375,
|
|
"epoch": 4.625,
|
|
"grad_norm": 2.99732336372377,
|
|
"learning_rate": 6.965762583300223e-07,
|
|
"loss": 0.0032,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 336621709.0,
|
|
"step": 407
|
|
},
|
|
{
|
|
"entropy": 0.4801177978515625,
|
|
"epoch": 4.636363636363637,
|
|
"grad_norm": 1.3076164367052054,
|
|
"learning_rate": 6.85754943078103e-07,
|
|
"loss": 0.0023,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 337441489.0,
|
|
"step": 408
|
|
},
|
|
{
|
|
"entropy": 0.49835968017578125,
|
|
"epoch": 4.6477272727272725,
|
|
"grad_norm": 1.349284457595706,
|
|
"learning_rate": 6.750049654795199e-07,
|
|
"loss": 0.0088,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 338232060.0,
|
|
"step": 409
|
|
},
|
|
{
|
|
"entropy": 0.48317718505859375,
|
|
"epoch": 4.659090909090909,
|
|
"grad_norm": 2.1021942804973324,
|
|
"learning_rate": 6.643267482325061e-07,
|
|
"loss": 0.0029,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 339051182.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 0.4761505126953125,
|
|
"epoch": 4.670454545454545,
|
|
"grad_norm": 4.442088129850368,
|
|
"learning_rate": 6.537207112136143e-07,
|
|
"loss": 0.0164,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 339905590.0,
|
|
"step": 411
|
|
},
|
|
{
|
|
"entropy": 0.48587799072265625,
|
|
"epoch": 4.681818181818182,
|
|
"grad_norm": 5.065463686315352,
|
|
"learning_rate": 6.431872714612072e-07,
|
|
"loss": 0.0129,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 340743579.0,
|
|
"step": 412
|
|
},
|
|
{
|
|
"entropy": 0.46688079833984375,
|
|
"epoch": 4.693181818181818,
|
|
"grad_norm": 3.228917361025119,
|
|
"learning_rate": 6.327268431590664e-07,
|
|
"loss": 0.0066,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 341596634.0,
|
|
"step": 413
|
|
},
|
|
{
|
|
"entropy": 0.48221588134765625,
|
|
"epoch": 4.704545454545455,
|
|
"grad_norm": 0.6719090703949218,
|
|
"learning_rate": 6.223398376200956e-07,
|
|
"loss": 0.0026,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 342426853.0,
|
|
"step": 414
|
|
},
|
|
{
|
|
"entropy": 0.48246002197265625,
|
|
"epoch": 4.715909090909091,
|
|
"grad_norm": 0.8384280760870119,
|
|
"learning_rate": 6.1202666327016e-07,
|
|
"loss": 0.0025,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 343277692.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 0.4742889404296875,
|
|
"epoch": 4.7272727272727275,
|
|
"grad_norm": 2.4415138149124735,
|
|
"learning_rate": 6.017877256320132e-07,
|
|
"loss": 0.0048,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 344128535.0,
|
|
"step": 416
|
|
},
|
|
{
|
|
"entropy": 0.4818267822265625,
|
|
"epoch": 4.738636363636363,
|
|
"grad_norm": 0.8515596128253197,
|
|
"learning_rate": 5.916234273093624e-07,
|
|
"loss": 0.0023,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 344948795.0,
|
|
"step": 417
|
|
},
|
|
{
|
|
"entropy": 0.49347686767578125,
|
|
"epoch": 4.75,
|
|
"grad_norm": 9.346479245259347,
|
|
"learning_rate": 5.815341679710327e-07,
|
|
"loss": 0.0115,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 345737929.0,
|
|
"step": 418
|
|
},
|
|
{
|
|
"entropy": 0.48424530029296875,
|
|
"epoch": 4.761363636363637,
|
|
"grad_norm": 1.339805656978658,
|
|
"learning_rate": 5.715203443352526e-07,
|
|
"loss": 0.0019,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 346553828.0,
|
|
"step": 419
|
|
},
|
|
{
|
|
"entropy": 0.48157501220703125,
|
|
"epoch": 4.7727272727272725,
|
|
"grad_norm": 0.2701253366161105,
|
|
"learning_rate": 5.615823501540546e-07,
|
|
"loss": 0.0017,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 347375058.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 0.4778289794921875,
|
|
"epoch": 4.784090909090909,
|
|
"grad_norm": 0.36859401605585224,
|
|
"learning_rate": 5.51720576197794e-07,
|
|
"loss": 0.0017,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 348217272.0,
|
|
"step": 421
|
|
},
|
|
{
|
|
"entropy": 0.47646331787109375,
|
|
"epoch": 4.795454545454545,
|
|
"grad_norm": 1.3198516138713523,
|
|
"learning_rate": 5.419354102397792e-07,
|
|
"loss": 0.0117,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 349043381.0,
|
|
"step": 422
|
|
},
|
|
{
|
|
"entropy": 0.49961090087890625,
|
|
"epoch": 4.806818181818182,
|
|
"grad_norm": 4.2939387242943585,
|
|
"learning_rate": 5.32227237041032e-07,
|
|
"loss": 0.0058,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 349814498.0,
|
|
"step": 423
|
|
},
|
|
{
|
|
"entropy": 0.4700927734375,
|
|
"epoch": 4.818181818181818,
|
|
"grad_norm": 2.183285915853867,
|
|
"learning_rate": 5.22596438335149e-07,
|
|
"loss": 0.0072,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 350661477.0,
|
|
"step": 424
|
|
},
|
|
{
|
|
"entropy": 0.47753143310546875,
|
|
"epoch": 4.829545454545455,
|
|
"grad_norm": 4.542934050380425,
|
|
"learning_rate": 5.130433928132983e-07,
|
|
"loss": 0.0042,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 351498317.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 0.46913909912109375,
|
|
"epoch": 4.840909090909091,
|
|
"grad_norm": 0.23800276817196775,
|
|
"learning_rate": 5.035684761093273e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 352355692.0,
|
|
"step": 426
|
|
},
|
|
{
|
|
"entropy": 0.47409820556640625,
|
|
"epoch": 4.8522727272727275,
|
|
"grad_norm": 0.25047198404759846,
|
|
"learning_rate": 4.941720607849912e-07,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 353190411.0,
|
|
"step": 427
|
|
},
|
|
{
|
|
"entropy": 0.4987030029296875,
|
|
"epoch": 4.863636363636363,
|
|
"grad_norm": 10.75981988944821,
|
|
"learning_rate": 4.848545163153048e-07,
|
|
"loss": 0.0131,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 353977106.0,
|
|
"step": 428
|
|
},
|
|
{
|
|
"entropy": 0.5035171508789062,
|
|
"epoch": 4.875,
|
|
"grad_norm": 0.2384511087444417,
|
|
"learning_rate": 4.756162090740135e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 354746145.0,
|
|
"step": 429
|
|
},
|
|
{
|
|
"entropy": 0.46907806396484375,
|
|
"epoch": 4.886363636363637,
|
|
"grad_norm": 6.102423857117631,
|
|
"learning_rate": 4.6645750231918864e-07,
|
|
"loss": 0.0045,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 355611270.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 0.47377777099609375,
|
|
"epoch": 4.8977272727272725,
|
|
"grad_norm": 0.24001371751604936,
|
|
"learning_rate": 4.5737875617894225e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 356445109.0,
|
|
"step": 431
|
|
},
|
|
{
|
|
"entropy": 0.48812103271484375,
|
|
"epoch": 4.909090909090909,
|
|
"grad_norm": 4.762057280592623,
|
|
"learning_rate": 4.4838032763726806e-07,
|
|
"loss": 0.006,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 357235786.0,
|
|
"step": 432
|
|
},
|
|
{
|
|
"entropy": 0.49410247802734375,
|
|
"epoch": 4.920454545454545,
|
|
"grad_norm": 3.949138959783489,
|
|
"learning_rate": 4.394625705200012e-07,
|
|
"loss": 0.0219,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 358031050.0,
|
|
"step": 433
|
|
},
|
|
{
|
|
"entropy": 0.46722412109375,
|
|
"epoch": 4.931818181818182,
|
|
"grad_norm": 0.284180125397139,
|
|
"learning_rate": 4.3062583548091256e-07,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 358895808.0,
|
|
"step": 434
|
|
},
|
|
{
|
|
"entropy": 0.4991455078125,
|
|
"epoch": 4.943181818181818,
|
|
"grad_norm": 6.520535055001159,
|
|
"learning_rate": 4.218704699879117e-07,
|
|
"loss": 0.0071,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 359692075.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 0.4780120849609375,
|
|
"epoch": 4.954545454545455,
|
|
"grad_norm": 0.7522536313096782,
|
|
"learning_rate": 4.1319681830939124e-07,
|
|
"loss": 0.002,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 360552654.0,
|
|
"step": 436
|
|
},
|
|
{
|
|
"entropy": 0.46907806396484375,
|
|
"epoch": 4.965909090909091,
|
|
"grad_norm": 2.3964109369615016,
|
|
"learning_rate": 4.0460522150068684e-07,
|
|
"loss": 0.0024,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 361395351.0,
|
|
"step": 437
|
|
},
|
|
{
|
|
"entropy": 0.476226806640625,
|
|
"epoch": 4.9772727272727275,
|
|
"grad_norm": 0.50213990162671,
|
|
"learning_rate": 3.9609601739066664e-07,
|
|
"loss": 0.0018,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 362224545.0,
|
|
"step": 438
|
|
},
|
|
{
|
|
"entropy": 0.48921966552734375,
|
|
"epoch": 4.988636363636363,
|
|
"grad_norm": 1.4592075617151454,
|
|
"learning_rate": 3.876695405684486e-07,
|
|
"loss": 0.0022,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 363032893.0,
|
|
"step": 439
|
|
},
|
|
{
|
|
"entropy": 0.489410400390625,
|
|
"epoch": 5.0,
|
|
"grad_norm": 2.556748367856961,
|
|
"learning_rate": 3.793261223702441e-07,
|
|
"loss": 0.0095,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 363838139.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 0.47222137451171875,
|
|
"epoch": 5.011363636363637,
|
|
"grad_norm": 4.214353845258516,
|
|
"learning_rate": 3.7106609086632635e-07,
|
|
"loss": 0.0032,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 364669029.0,
|
|
"step": 441
|
|
},
|
|
{
|
|
"entropy": 0.48236846923828125,
|
|
"epoch": 5.0227272727272725,
|
|
"grad_norm": 0.30780546204673304,
|
|
"learning_rate": 3.628897708481377e-07,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 365462695.0,
|
|
"step": 442
|
|
},
|
|
{
|
|
"entropy": 0.49261474609375,
|
|
"epoch": 5.034090909090909,
|
|
"grad_norm": 1.4822159912495783,
|
|
"learning_rate": 3.5479748381550855e-07,
|
|
"loss": 0.0077,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 366246764.0,
|
|
"step": 443
|
|
},
|
|
{
|
|
"entropy": 0.45969390869140625,
|
|
"epoch": 5.045454545454546,
|
|
"grad_norm": 0.8370386732840761,
|
|
"learning_rate": 3.4678954796402624e-07,
|
|
"loss": 0.0095,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 367104037.0,
|
|
"step": 444
|
|
},
|
|
{
|
|
"entropy": 0.45703887939453125,
|
|
"epoch": 5.056818181818182,
|
|
"grad_norm": 0.23893007118099577,
|
|
"learning_rate": 3.388662781725141e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 367968268.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 0.48757171630859375,
|
|
"epoch": 5.068181818181818,
|
|
"grad_norm": 0.2517958929459578,
|
|
"learning_rate": 3.310279859906565e-07,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 368763452.0,
|
|
"step": 446
|
|
},
|
|
{
|
|
"entropy": 0.475311279296875,
|
|
"epoch": 5.079545454545454,
|
|
"grad_norm": 1.7028057764506213,
|
|
"learning_rate": 3.232749796267451e-07,
|
|
"loss": 0.0144,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 369589368.0,
|
|
"step": 447
|
|
},
|
|
{
|
|
"entropy": 0.478118896484375,
|
|
"epoch": 5.090909090909091,
|
|
"grad_norm": 0.24780863890849855,
|
|
"learning_rate": 3.1560756393556187e-07,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 370411869.0,
|
|
"step": 448
|
|
},
|
|
{
|
|
"entropy": 0.4748687744140625,
|
|
"epoch": 5.1022727272727275,
|
|
"grad_norm": 1.1233286081795228,
|
|
"learning_rate": 3.0802604040639034e-07,
|
|
"loss": 0.0018,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 371255264.0,
|
|
"step": 449
|
|
},
|
|
{
|
|
"entropy": 0.46393585205078125,
|
|
"epoch": 5.113636363636363,
|
|
"grad_norm": 3.08110570654071,
|
|
"learning_rate": 3.0053070715116153e-07,
|
|
"loss": 0.0026,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 372135606.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 0.48212432861328125,
|
|
"epoch": 5.125,
|
|
"grad_norm": 3.202235664798041,
|
|
"learning_rate": 2.9312185889273147e-07,
|
|
"loss": 0.0083,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 372974668.0,
|
|
"step": 451
|
|
},
|
|
{
|
|
"entropy": 0.4832611083984375,
|
|
"epoch": 5.136363636363637,
|
|
"grad_norm": 0.7865563069141048,
|
|
"learning_rate": 2.8579978695329386e-07,
|
|
"loss": 0.0051,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 373795872.0,
|
|
"step": 452
|
|
},
|
|
{
|
|
"entropy": 0.473236083984375,
|
|
"epoch": 5.1477272727272725,
|
|
"grad_norm": 2.506322586954564,
|
|
"learning_rate": 2.785647792429233e-07,
|
|
"loss": 0.0035,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 374644330.0,
|
|
"step": 453
|
|
},
|
|
{
|
|
"entropy": 0.4847564697265625,
|
|
"epoch": 5.159090909090909,
|
|
"grad_norm": 0.551312091586902,
|
|
"learning_rate": 2.714171202482538e-07,
|
|
"loss": 0.0086,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 375451445.0,
|
|
"step": 454
|
|
},
|
|
{
|
|
"entropy": 0.48731231689453125,
|
|
"epoch": 5.170454545454546,
|
|
"grad_norm": 6.3859087946178414,
|
|
"learning_rate": 2.6435709102129727e-07,
|
|
"loss": 0.0046,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 376254658.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 0.482147216796875,
|
|
"epoch": 5.181818181818182,
|
|
"grad_norm": 0.526311488156248,
|
|
"learning_rate": 2.5738496916838524e-07,
|
|
"loss": 0.0084,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 377072437.0,
|
|
"step": 456
|
|
},
|
|
{
|
|
"entropy": 0.4810333251953125,
|
|
"epoch": 5.193181818181818,
|
|
"grad_norm": 3.7772382358130283,
|
|
"learning_rate": 2.505010288392587e-07,
|
|
"loss": 0.012,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 377909031.0,
|
|
"step": 457
|
|
},
|
|
{
|
|
"entropy": 0.471832275390625,
|
|
"epoch": 5.204545454545454,
|
|
"grad_norm": 2.329737506097157,
|
|
"learning_rate": 2.4370554071628613e-07,
|
|
"loss": 0.0175,
|
|
"mean_token_accuracy": 0.9960937502328306,
|
|
"num_tokens": 378749326.0,
|
|
"step": 458
|
|
},
|
|
{
|
|
"entropy": 0.477752685546875,
|
|
"epoch": 5.215909090909091,
|
|
"grad_norm": 1.2247209201337799,
|
|
"learning_rate": 2.3699877200382026e-07,
|
|
"loss": 0.0021,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 379588976.0,
|
|
"step": 459
|
|
},
|
|
{
|
|
"entropy": 0.46756744384765625,
|
|
"epoch": 5.2272727272727275,
|
|
"grad_norm": 0.41393867739069445,
|
|
"learning_rate": 2.303809864176909e-07,
|
|
"loss": 0.0019,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 380444915.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 0.484649658203125,
|
|
"epoch": 5.238636363636363,
|
|
"grad_norm": 0.4288087261894904,
|
|
"learning_rate": 2.2385244417483743e-07,
|
|
"loss": 0.002,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 381261471.0,
|
|
"step": 461
|
|
},
|
|
{
|
|
"entropy": 0.47428131103515625,
|
|
"epoch": 5.25,
|
|
"grad_norm": 2.6805042624913673,
|
|
"learning_rate": 2.174134019830726e-07,
|
|
"loss": 0.0031,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 382101653.0,
|
|
"step": 462
|
|
},
|
|
{
|
|
"entropy": 0.48526763916015625,
|
|
"epoch": 5.261363636363637,
|
|
"grad_norm": 0.5789242093428709,
|
|
"learning_rate": 2.1106411303099455e-07,
|
|
"loss": 0.0022,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 382917569.0,
|
|
"step": 463
|
|
},
|
|
{
|
|
"entropy": 0.4815673828125,
|
|
"epoch": 5.2727272727272725,
|
|
"grad_norm": 0.7412441363450578,
|
|
"learning_rate": 2.0480482697802507e-07,
|
|
"loss": 0.0021,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 383733436.0,
|
|
"step": 464
|
|
},
|
|
{
|
|
"entropy": 0.47106170654296875,
|
|
"epoch": 5.284090909090909,
|
|
"grad_norm": 0.37355555384510464,
|
|
"learning_rate": 1.986357899445976e-07,
|
|
"loss": 0.0018,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 384576794.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 0.48104095458984375,
|
|
"epoch": 5.295454545454546,
|
|
"grad_norm": 0.38999280654890595,
|
|
"learning_rate": 1.9255724450247676e-07,
|
|
"loss": 0.0019,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 385382850.0,
|
|
"step": 466
|
|
},
|
|
{
|
|
"entropy": 0.467498779296875,
|
|
"epoch": 5.306818181818182,
|
|
"grad_norm": 0.3255882167657849,
|
|
"learning_rate": 1.8656942966522124e-07,
|
|
"loss": 0.0018,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 386230967.0,
|
|
"step": 467
|
|
},
|
|
{
|
|
"entropy": 0.49216461181640625,
|
|
"epoch": 5.318181818181818,
|
|
"grad_norm": 0.812386735624479,
|
|
"learning_rate": 1.8067258087878597e-07,
|
|
"loss": 0.0095,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 387023995.0,
|
|
"step": 468
|
|
},
|
|
{
|
|
"entropy": 0.48313140869140625,
|
|
"epoch": 5.329545454545454,
|
|
"grad_norm": 1.5287051130874392,
|
|
"learning_rate": 1.748669300122627e-07,
|
|
"loss": 0.0019,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 387853013.0,
|
|
"step": 469
|
|
},
|
|
{
|
|
"entropy": 0.48089599609375,
|
|
"epoch": 5.340909090909091,
|
|
"grad_norm": 0.5410198602305207,
|
|
"learning_rate": 1.691527053487646e-07,
|
|
"loss": 0.0017,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 388678242.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 0.48954010009765625,
|
|
"epoch": 5.3522727272727275,
|
|
"grad_norm": 0.2573461421988016,
|
|
"learning_rate": 1.635301315764484e-07,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 389489184.0,
|
|
"step": 471
|
|
},
|
|
{
|
|
"entropy": 0.48332977294921875,
|
|
"epoch": 5.363636363636363,
|
|
"grad_norm": 0.2512417872893421,
|
|
"learning_rate": 1.579994297796808e-07,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 390313965.0,
|
|
"step": 472
|
|
},
|
|
{
|
|
"entropy": 0.484710693359375,
|
|
"epoch": 5.375,
|
|
"grad_norm": 0.24272552930424235,
|
|
"learning_rate": 1.5256081743034336e-07,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 391124884.0,
|
|
"step": 473
|
|
},
|
|
{
|
|
"entropy": 0.474761962890625,
|
|
"epoch": 5.386363636363637,
|
|
"grad_norm": 0.2441148940956172,
|
|
"learning_rate": 1.472145083792842e-07,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 391979859.0,
|
|
"step": 474
|
|
},
|
|
{
|
|
"entropy": 0.48592376708984375,
|
|
"epoch": 5.3977272727272725,
|
|
"grad_norm": 0.29622625734441144,
|
|
"learning_rate": 1.419607128479053e-07,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 392807678.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 0.48085784912109375,
|
|
"epoch": 5.409090909090909,
|
|
"grad_norm": 1.1708257311301122,
|
|
"learning_rate": 1.3679963741990127e-07,
|
|
"loss": 0.002,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 393628416.0,
|
|
"step": 476
|
|
},
|
|
{
|
|
"entropy": 0.4741973876953125,
|
|
"epoch": 5.420454545454546,
|
|
"grad_norm": 3.7263249279288244,
|
|
"learning_rate": 1.317314850331314e-07,
|
|
"loss": 0.0061,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 394467813.0,
|
|
"step": 477
|
|
},
|
|
{
|
|
"entropy": 0.46479034423828125,
|
|
"epoch": 5.431818181818182,
|
|
"grad_norm": 0.23356626570343833,
|
|
"learning_rate": 1.2675645497164352e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 395318537.0,
|
|
"step": 478
|
|
},
|
|
{
|
|
"entropy": 0.4804534912109375,
|
|
"epoch": 5.443181818181818,
|
|
"grad_norm": 5.27811356549522,
|
|
"learning_rate": 1.2187474285783623e-07,
|
|
"loss": 0.0045,
|
|
"mean_token_accuracy": 0.9973958334885538,
|
|
"num_tokens": 396117712.0,
|
|
"step": 479
|
|
},
|
|
{
|
|
"entropy": 0.4789581298828125,
|
|
"epoch": 5.454545454545454,
|
|
"grad_norm": 0.23205909431366892,
|
|
"learning_rate": 1.1708654064476743e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 396951671.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 0.46929931640625,
|
|
"epoch": 5.465909090909091,
|
|
"grad_norm": 0.23000123108156822,
|
|
"learning_rate": 1.1239203660860648e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 397801199.0,
|
|
"step": 481
|
|
},
|
|
{
|
|
"entropy": 0.470428466796875,
|
|
"epoch": 5.4772727272727275,
|
|
"grad_norm": 0.23372087472096778,
|
|
"learning_rate": 1.0779141534123127e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 398632709.0,
|
|
"step": 482
|
|
},
|
|
{
|
|
"entropy": 0.47586822509765625,
|
|
"epoch": 5.488636363636363,
|
|
"grad_norm": 0.23070523406895965,
|
|
"learning_rate": 1.0328485774296875e-07,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 399472187.0,
|
|
"step": 483
|
|
},
|
|
{
|
|
"entropy": 0.482421875,
|
|
"epoch": 5.5,
|
|
"grad_norm": 0.23318075758795018,
|
|
"learning_rate": 9.887254101548422e-08,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 400281455.0,
|
|
"step": 484
|
|
},
|
|
{
|
|
"entropy": 0.4844207763671875,
|
|
"epoch": 5.511363636363637,
|
|
"grad_norm": 0.23208929439471024,
|
|
"learning_rate": 9.455463865481019e-08,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 401097680.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 0.48187255859375,
|
|
"epoch": 5.5227272727272725,
|
|
"grad_norm": 0.23677025538291055,
|
|
"learning_rate": 9.033132044452775e-08,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 401927273.0,
|
|
"step": 486
|
|
},
|
|
{
|
|
"entropy": 0.47664642333984375,
|
|
"epoch": 5.534090909090909,
|
|
"grad_norm": 5.055301927928181,
|
|
"learning_rate": 8.620275244908826e-08,
|
|
"loss": 0.0057,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 402736167.0,
|
|
"step": 487
|
|
},
|
|
{
|
|
"entropy": 0.4710693359375,
|
|
"epoch": 5.545454545454545,
|
|
"grad_norm": 0.23085051544257776,
|
|
"learning_rate": 8.216909700728498e-08,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 403584040.0,
|
|
"step": 488
|
|
},
|
|
{
|
|
"entropy": 0.46595001220703125,
|
|
"epoch": 5.556818181818182,
|
|
"grad_norm": 0.2298464991859443,
|
|
"learning_rate": 7.823051272586812e-08,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 404443639.0,
|
|
"step": 489
|
|
},
|
|
{
|
|
"entropy": 0.490386962890625,
|
|
"epoch": 5.568181818181818,
|
|
"grad_norm": 1.7487934276030572,
|
|
"learning_rate": 7.438715447331018e-08,
|
|
"loss": 0.0057,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 405219096.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 0.46939849853515625,
|
|
"epoch": 5.579545454545455,
|
|
"grad_norm": 3.6849857997279503,
|
|
"learning_rate": 7.063917337371495e-08,
|
|
"loss": 0.0032,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 406050370.0,
|
|
"step": 491
|
|
},
|
|
{
|
|
"entropy": 0.4792938232421875,
|
|
"epoch": 5.590909090909091,
|
|
"grad_norm": 0.48656609958077124,
|
|
"learning_rate": 6.698671680087643e-08,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 406883685.0,
|
|
"step": 492
|
|
},
|
|
{
|
|
"entropy": 0.48340606689453125,
|
|
"epoch": 5.6022727272727275,
|
|
"grad_norm": 0.2277425646629738,
|
|
"learning_rate": 6.342992837248235e-08,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 407672236.0,
|
|
"step": 493
|
|
},
|
|
{
|
|
"entropy": 0.482147216796875,
|
|
"epoch": 5.613636363636363,
|
|
"grad_norm": 0.23759672451897687,
|
|
"learning_rate": 5.996894794446817e-08,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 408468181.0,
|
|
"step": 494
|
|
},
|
|
{
|
|
"entropy": 0.4691009521484375,
|
|
"epoch": 5.625,
|
|
"grad_norm": 0.8356994624990842,
|
|
"learning_rate": 5.660391160551837e-08,
|
|
"loss": 0.0103,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 409310506.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 0.4743499755859375,
|
|
"epoch": 5.636363636363637,
|
|
"grad_norm": 0.23173743117233309,
|
|
"learning_rate": 5.333495167171354e-08,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 410129372.0,
|
|
"step": 496
|
|
},
|
|
{
|
|
"entropy": 0.48233795166015625,
|
|
"epoch": 5.6477272727272725,
|
|
"grad_norm": 3.325194453062317,
|
|
"learning_rate": 5.016219668132871e-08,
|
|
"loss": 0.0076,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 410938341.0,
|
|
"step": 497
|
|
},
|
|
{
|
|
"entropy": 0.4671173095703125,
|
|
"epoch": 5.659090909090909,
|
|
"grad_norm": 0.9759331183243299,
|
|
"learning_rate": 4.708577138977932e-08,
|
|
"loss": 0.0109,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 411793767.0,
|
|
"step": 498
|
|
},
|
|
{
|
|
"entropy": 0.46318817138671875,
|
|
"epoch": 5.670454545454545,
|
|
"grad_norm": 1.3398280119073476,
|
|
"learning_rate": 4.410579676471571e-08,
|
|
"loss": 0.0073,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 412651605.0,
|
|
"step": 499
|
|
},
|
|
{
|
|
"entropy": 0.4657745361328125,
|
|
"epoch": 5.681818181818182,
|
|
"grad_norm": 2.55570040229675,
|
|
"learning_rate": 4.1222389981265546e-08,
|
|
"loss": 0.0086,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 413496664.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 0.4684906005859375,
|
|
"epoch": 5.693181818181818,
|
|
"grad_norm": 6.915908450960148,
|
|
"learning_rate": 3.843566441742774e-08,
|
|
"loss": 0.0045,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 414322992.0,
|
|
"step": 501
|
|
},
|
|
{
|
|
"entropy": 0.48667144775390625,
|
|
"epoch": 5.704545454545455,
|
|
"grad_norm": 0.3864454725818334,
|
|
"learning_rate": 3.574572964961304e-08,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 415150977.0,
|
|
"step": 502
|
|
},
|
|
{
|
|
"entropy": 0.47731781005859375,
|
|
"epoch": 5.715909090909091,
|
|
"grad_norm": 0.5359817931960815,
|
|
"learning_rate": 3.3152691448336825e-08,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 415967079.0,
|
|
"step": 503
|
|
},
|
|
{
|
|
"entropy": 0.4813232421875,
|
|
"epoch": 5.7272727272727275,
|
|
"grad_norm": 0.2512550244047445,
|
|
"learning_rate": 3.065665177405808e-08,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 416791778.0,
|
|
"step": 504
|
|
},
|
|
{
|
|
"entropy": 0.477447509765625,
|
|
"epoch": 5.738636363636363,
|
|
"grad_norm": 0.24573660989925844,
|
|
"learning_rate": 2.825770877317363e-08,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 417633485.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 0.46212005615234375,
|
|
"epoch": 5.75,
|
|
"grad_norm": 0.29916205036328847,
|
|
"learning_rate": 2.5955956774154633e-08,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 418482660.0,
|
|
"step": 506
|
|
},
|
|
{
|
|
"entropy": 0.48030853271484375,
|
|
"epoch": 5.761363636363637,
|
|
"grad_norm": 5.663292172784125,
|
|
"learning_rate": 2.3751486283840884e-08,
|
|
"loss": 0.0036,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 419319817.0,
|
|
"step": 507
|
|
},
|
|
{
|
|
"entropy": 0.4896087646484375,
|
|
"epoch": 5.7727272727272725,
|
|
"grad_norm": 3.733506086862286,
|
|
"learning_rate": 2.1644383983880356e-08,
|
|
"loss": 0.0255,
|
|
"mean_token_accuracy": 0.9947916669771075,
|
|
"num_tokens": 420120900.0,
|
|
"step": 508
|
|
},
|
|
{
|
|
"entropy": 0.4809722900390625,
|
|
"epoch": 5.784090909090909,
|
|
"grad_norm": 0.9657687987783138,
|
|
"learning_rate": 1.9634732727321636e-08,
|
|
"loss": 0.0058,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 420957434.0,
|
|
"step": 509
|
|
},
|
|
{
|
|
"entropy": 0.46662139892578125,
|
|
"epoch": 5.795454545454545,
|
|
"grad_norm": 2.701021841228621,
|
|
"learning_rate": 1.7722611535355426e-08,
|
|
"loss": 0.0018,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 421804546.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 0.48058319091796875,
|
|
"epoch": 5.806818181818182,
|
|
"grad_norm": 0.24552060618334481,
|
|
"learning_rate": 1.5908095594207585e-08,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 422624032.0,
|
|
"step": 511
|
|
},
|
|
{
|
|
"entropy": 0.47879791259765625,
|
|
"epoch": 5.818181818181818,
|
|
"grad_norm": 0.2313724445434632,
|
|
"learning_rate": 1.4191256252182595e-08,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 423469195.0,
|
|
"step": 512
|
|
},
|
|
{
|
|
"entropy": 0.4875946044921875,
|
|
"epoch": 5.829545454545455,
|
|
"grad_norm": 0.24314245718612798,
|
|
"learning_rate": 1.2572161016858874e-08,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 424277193.0,
|
|
"step": 513
|
|
},
|
|
{
|
|
"entropy": 0.47444915771484375,
|
|
"epoch": 5.840909090909091,
|
|
"grad_norm": 0.2530316763515594,
|
|
"learning_rate": 1.1050873552433394e-08,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 425106713.0,
|
|
"step": 514
|
|
},
|
|
{
|
|
"entropy": 0.47956085205078125,
|
|
"epoch": 5.8522727272727275,
|
|
"grad_norm": 0.24440590571395973,
|
|
"learning_rate": 9.627453677218402e-09,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 425917836.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"entropy": 0.47054290771484375,
|
|
"epoch": 5.863636363636363,
|
|
"grad_norm": 0.23559264304220282,
|
|
"learning_rate": 8.301957361289969e-09,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 426756350.0,
|
|
"step": 516
|
|
},
|
|
{
|
|
"entropy": 0.48386383056640625,
|
|
"epoch": 5.875,
|
|
"grad_norm": 0.23312439693348005,
|
|
"learning_rate": 7.074436724286704e-09,
|
|
"loss": 0.0014,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 427592830.0,
|
|
"step": 517
|
|
},
|
|
{
|
|
"entropy": 0.48955535888671875,
|
|
"epoch": 5.886363636363637,
|
|
"grad_norm": 0.2588648942422833,
|
|
"learning_rate": 5.944940033360269e-09,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 428381425.0,
|
|
"step": 518
|
|
},
|
|
{
|
|
"entropy": 0.46274566650390625,
|
|
"epoch": 5.8977272727272725,
|
|
"grad_norm": 0.8745898260707156,
|
|
"learning_rate": 4.913511701278017e-09,
|
|
"loss": 0.0081,
|
|
"mean_token_accuracy": 0.9986979167442769,
|
|
"num_tokens": 429230635.0,
|
|
"step": 519
|
|
},
|
|
{
|
|
"entropy": 0.48191070556640625,
|
|
"epoch": 5.909090909090909,
|
|
"grad_norm": 0.27536210456621274,
|
|
"learning_rate": 3.98019228467661e-09,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 430035732.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 0.48334503173828125,
|
|
"epoch": 5.920454545454545,
|
|
"grad_norm": 0.2487439601672152,
|
|
"learning_rate": 3.1450184824657892e-09,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 430861990.0,
|
|
"step": 521
|
|
},
|
|
{
|
|
"entropy": 0.47314453125,
|
|
"epoch": 5.931818181818182,
|
|
"grad_norm": 0.2584974817202786,
|
|
"learning_rate": 2.408023134387871e-09,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 431688137.0,
|
|
"step": 522
|
|
},
|
|
{
|
|
"entropy": 0.47559356689453125,
|
|
"epoch": 5.943181818181818,
|
|
"grad_norm": 0.27212198450046776,
|
|
"learning_rate": 1.7692352197240525e-09,
|
|
"loss": 0.0016,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 432525084.0,
|
|
"step": 523
|
|
},
|
|
{
|
|
"entropy": 0.464752197265625,
|
|
"epoch": 5.954545454545455,
|
|
"grad_norm": 0.2510235822217297,
|
|
"learning_rate": 1.2286798561572666e-09,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 433386586.0,
|
|
"step": 524
|
|
},
|
|
{
|
|
"entropy": 0.479766845703125,
|
|
"epoch": 5.965909090909091,
|
|
"grad_norm": 1.0240552861546037,
|
|
"learning_rate": 7.863782987821422e-10,
|
|
"loss": 0.002,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 434211459.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 0.47564697265625,
|
|
"epoch": 5.9772727272727275,
|
|
"grad_norm": 0.24500907077317072,
|
|
"learning_rate": 4.4234793927094845e-10,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 435023458.0,
|
|
"step": 526
|
|
},
|
|
{
|
|
"entropy": 0.47664642333984375,
|
|
"epoch": 5.988636363636363,
|
|
"grad_norm": 1.6045447652040448,
|
|
"learning_rate": 1.9660230518886436e-10,
|
|
"loss": 0.0022,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 435856072.0,
|
|
"step": 527
|
|
},
|
|
{
|
|
"entropy": 0.49124908447265625,
|
|
"epoch": 6.0,
|
|
"grad_norm": 0.24762061313479247,
|
|
"learning_rate": 4.915105946246002e-11,
|
|
"loss": 0.0015,
|
|
"mean_token_accuracy": 1.0,
|
|
"num_tokens": 436633698.0,
|
|
"step": 528
|
|
},
|
|
{
|
|
"epoch": 6.0,
|
|
"step": 528,
|
|
"total_flos": 513726926487552.0,
|
|
"train_loss": 0.5520123199349848,
|
|
"train_runtime": 70216.6152,
|
|
"train_samples_per_second": 3.512,
|
|
"train_steps_per_second": 0.008
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 528,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 6,
|
|
"save_steps": 44,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 513726926487552.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|