Model: ali-elganzory/Baguettotron-longsft_16k-SFT-Tulu3-decontaminated Source: Original Platform
14677 lines · 409 KiB · JSON
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 14634,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 1.8609375,
      "epoch": 0.0013666803334700013,
      "grad_norm": 1.1921717538529828,
      "learning_rate": 1.0227272727272728e-07,
      "loss": 2.0822,
      "mean_token_accuracy": 0.5996886134147644,
      "num_tokens": 937214.0,
      "step": 10
    },
    {
      "entropy": 1.778125,
      "epoch": 0.0027333606669400026,
      "grad_norm": 1.2632581241732956,
      "learning_rate": 2.1590909090909094e-07,
      "loss": 1.9648,
      "mean_token_accuracy": 0.6188210308551788,
      "num_tokens": 1832763.0,
      "step": 20
    },
    {
      "entropy": 1.8140625,
      "epoch": 0.004100041000410004,
      "grad_norm": 1.1518369209881436,
      "learning_rate": 3.2954545454545455e-07,
      "loss": 2.0205,
      "mean_token_accuracy": 0.6097918927669526,
      "num_tokens": 2759199.0,
      "step": 30
    },
    {
      "entropy": 1.8453125,
      "epoch": 0.005466721333880005,
      "grad_norm": 1.2469645520783976,
      "learning_rate": 4.431818181818182e-07,
      "loss": 2.0276,
      "mean_token_accuracy": 0.6081013083457947,
      "num_tokens": 3670561.0,
      "step": 40
    },
    {
      "entropy": 1.74921875,
      "epoch": 0.006833401667350007,
      "grad_norm": 1.2613172218038768,
      "learning_rate": 5.568181818181818e-07,
      "loss": 1.9413,
      "mean_token_accuracy": 0.6198121964931488,
      "num_tokens": 4603768.0,
      "step": 50
    },
    {
      "entropy": 1.8515625,
      "epoch": 0.008200082000820008,
      "grad_norm": 1.4564703125431837,
      "learning_rate": 6.704545454545456e-07,
      "loss": 2.0296,
      "mean_token_accuracy": 0.608572918176651,
      "num_tokens": 5567048.0,
      "step": 60
    },
    {
      "entropy": 1.875,
      "epoch": 0.00956676233429001,
      "grad_norm": 1.2353724241700723,
      "learning_rate": 7.840909090909092e-07,
      "loss": 2.0768,
      "mean_token_accuracy": 0.6014140844345093,
      "num_tokens": 6535603.0,
      "step": 70
    },
    {
      "entropy": 1.90078125,
      "epoch": 0.01093344266776001,
      "grad_norm": 2.175173203374402,
      "learning_rate": 8.977272727272728e-07,
      "loss": 2.0979,
      "mean_token_accuracy": 0.5971255242824555,
      "num_tokens": 7476241.0,
      "step": 80
    },
    {
      "entropy": 1.84453125,
      "epoch": 0.012300123001230012,
      "grad_norm": 1.641198900096302,
      "learning_rate": 1.0113636363636365e-06,
      "loss": 2.0255,
      "mean_token_accuracy": 0.6071559190750122,
      "num_tokens": 8346244.0,
      "step": 90
    },
    {
      "entropy": 1.796875,
      "epoch": 0.013666803334700014,
      "grad_norm": 1.1188866821825745,
      "learning_rate": 1.125e-06,
      "loss": 1.9953,
      "mean_token_accuracy": 0.6135570168495178,
      "num_tokens": 9275008.0,
      "step": 100
    },
    {
      "entropy": 1.80234375,
      "epoch": 0.015033483668170014,
      "grad_norm": 1.1997771074998806,
      "learning_rate": 1.2386363636363638e-06,
      "loss": 1.9782,
      "mean_token_accuracy": 0.6136937260627746,
      "num_tokens": 10215452.0,
      "step": 110
    },
    {
      "entropy": 1.7890625,
      "epoch": 0.016400164001640016,
      "grad_norm": 1.0974140223006834,
      "learning_rate": 1.3522727272727273e-06,
      "loss": 1.9682,
      "mean_token_accuracy": 0.6139700293540955,
      "num_tokens": 11118084.0,
      "step": 120
    },
    {
      "entropy": 1.80546875,
      "epoch": 0.017766844335110017,
      "grad_norm": 1.4532621982439322,
      "learning_rate": 1.465909090909091e-06,
      "loss": 1.9922,
      "mean_token_accuracy": 0.6135905265808106,
      "num_tokens": 12052822.0,
      "step": 130
    },
    {
      "entropy": 1.83359375,
      "epoch": 0.01913352466858002,
      "grad_norm": 0.8954727911420093,
      "learning_rate": 1.5795454545454547e-06,
      "loss": 2.0037,
      "mean_token_accuracy": 0.6113010823726654,
      "num_tokens": 12995395.0,
      "step": 140
    },
    {
      "entropy": 1.8125,
      "epoch": 0.02050020500205002,
      "grad_norm": 0.9200319832626224,
      "learning_rate": 1.6931818181818182e-06,
      "loss": 1.9806,
      "mean_token_accuracy": 0.6148091971874237,
      "num_tokens": 13945392.0,
      "step": 150
    },
    {
      "entropy": 1.834375,
      "epoch": 0.02186688533552002,
      "grad_norm": 0.8952251738174076,
      "learning_rate": 1.8068181818181822e-06,
      "loss": 1.9927,
      "mean_token_accuracy": 0.6109092473983765,
      "num_tokens": 14893050.0,
      "step": 160
    },
    {
      "entropy": 1.834375,
      "epoch": 0.023233565668990024,
      "grad_norm": 0.7454769989249519,
      "learning_rate": 1.9204545454545457e-06,
      "loss": 1.9955,
      "mean_token_accuracy": 0.6118976652622223,
      "num_tokens": 15751833.0,
      "step": 170
    },
    {
      "entropy": 1.82578125,
      "epoch": 0.024600246002460024,
      "grad_norm": 0.7809636800990112,
      "learning_rate": 2.034090909090909e-06,
      "loss": 1.9497,
      "mean_token_accuracy": 0.6161179006099701,
      "num_tokens": 16663724.0,
      "step": 180
    },
    {
      "entropy": 1.9265625,
      "epoch": 0.025966926335930025,
      "grad_norm": 0.9807738850009182,
      "learning_rate": 2.147727272727273e-06,
      "loss": 2.0415,
      "mean_token_accuracy": 0.6017034590244293,
      "num_tokens": 17618983.0,
      "step": 190
    },
    {
      "entropy": 1.8546875,
      "epoch": 0.02733360666940003,
      "grad_norm": 0.5822115052148433,
      "learning_rate": 2.2613636363636366e-06,
      "loss": 1.9969,
      "mean_token_accuracy": 0.608382773399353,
      "num_tokens": 18482595.0,
      "step": 200
    },
    {
      "entropy": 1.82578125,
      "epoch": 0.02870028700287003,
      "grad_norm": 0.6770978069213363,
      "learning_rate": 2.375e-06,
      "loss": 1.9589,
      "mean_token_accuracy": 0.6094512939453125,
      "num_tokens": 19421489.0,
      "step": 210
    },
    {
      "entropy": 1.85390625,
      "epoch": 0.03006696733634003,
      "grad_norm": 0.4780702662554944,
      "learning_rate": 2.488636363636364e-06,
      "loss": 1.958,
      "mean_token_accuracy": 0.6133853971958161,
      "num_tokens": 20362083.0,
      "step": 220
    },
    {
      "entropy": 1.82109375,
      "epoch": 0.03143364766981003,
      "grad_norm": 0.6611590881636448,
      "learning_rate": 2.6022727272727276e-06,
      "loss": 1.9267,
      "mean_token_accuracy": 0.6214154958724976,
      "num_tokens": 21308392.0,
      "step": 230
    },
    {
      "entropy": 1.85234375,
      "epoch": 0.03280032800328003,
      "grad_norm": 0.3841765701715681,
      "learning_rate": 2.715909090909091e-06,
      "loss": 1.9518,
      "mean_token_accuracy": 0.6145195066928864,
      "num_tokens": 22242546.0,
      "step": 240
    },
    {
      "entropy": 1.8703125,
      "epoch": 0.034167008336750036,
      "grad_norm": 0.5029043628947965,
      "learning_rate": 2.829545454545455e-06,
      "loss": 1.9563,
      "mean_token_accuracy": 0.6189641892910004,
      "num_tokens": 23172283.0,
      "step": 250
    },
    {
      "entropy": 1.8453125,
      "epoch": 0.03553368867022003,
      "grad_norm": 0.3159560632047791,
      "learning_rate": 2.9431818181818185e-06,
      "loss": 1.9211,
      "mean_token_accuracy": 0.6215897798538208,
      "num_tokens": 24105048.0,
      "step": 260
    },
    {
      "entropy": 1.85859375,
      "epoch": 0.03690036900369004,
      "grad_norm": 0.5455690957523381,
      "learning_rate": 3.056818181818182e-06,
      "loss": 1.9566,
      "mean_token_accuracy": 0.6166253328323364,
      "num_tokens": 25035940.0,
      "step": 270
    },
    {
      "entropy": 1.8078125,
      "epoch": 0.03826704933716004,
      "grad_norm": 0.49748594700669235,
      "learning_rate": 3.1704545454545456e-06,
      "loss": 1.8657,
      "mean_token_accuracy": 0.6277089416980743,
      "num_tokens": 25984242.0,
      "step": 280
    },
    {
      "entropy": 1.853125,
      "epoch": 0.03963372967063004,
      "grad_norm": 0.4182318760702863,
      "learning_rate": 3.2840909090909095e-06,
      "loss": 1.9019,
      "mean_token_accuracy": 0.6224762082099915,
      "num_tokens": 26909829.0,
      "step": 290
    },
    {
      "entropy": 1.846875,
      "epoch": 0.04100041000410004,
      "grad_norm": 0.3683000146862693,
      "learning_rate": 3.397727272727273e-06,
      "loss": 1.933,
      "mean_token_accuracy": 0.6163983643054962,
      "num_tokens": 27832441.0,
      "step": 300
    },
    {
      "entropy": 1.903125,
      "epoch": 0.042367090337570044,
      "grad_norm": 0.4093252398414278,
      "learning_rate": 3.5113636363636365e-06,
      "loss": 1.9737,
      "mean_token_accuracy": 0.6136131167411805,
      "num_tokens": 28776601.0,
      "step": 310
    },
    {
      "entropy": 1.87109375,
      "epoch": 0.04373377067104004,
      "grad_norm": 0.32709142055074586,
      "learning_rate": 3.625e-06,
      "loss": 1.924,
      "mean_token_accuracy": 0.6172412157058715,
      "num_tokens": 29677237.0,
      "step": 320
    },
    {
      "entropy": 1.9453125,
      "epoch": 0.045100451004510045,
      "grad_norm": 0.27607285807974646,
      "learning_rate": 3.7386363636363635e-06,
      "loss": 1.9984,
      "mean_token_accuracy": 0.609658706188202,
      "num_tokens": 30592137.0,
      "step": 330
    },
    {
      "entropy": 1.7984375,
      "epoch": 0.04646713133798005,
      "grad_norm": 0.3541911395546446,
      "learning_rate": 3.852272727272728e-06,
      "loss": 1.8292,
      "mean_token_accuracy": 0.6329781174659729,
      "num_tokens": 31485609.0,
      "step": 340
    },
    {
      "entropy": 1.7765625,
      "epoch": 0.047833811671450045,
      "grad_norm": 0.28645280476439783,
      "learning_rate": 3.965909090909091e-06,
      "loss": 1.8076,
      "mean_token_accuracy": 0.635384339094162,
      "num_tokens": 32433039.0,
      "step": 350
    },
    {
      "entropy": 1.88046875,
      "epoch": 0.04920049200492005,
      "grad_norm": 0.4470550378935197,
      "learning_rate": 4.079545454545455e-06,
      "loss": 1.9292,
      "mean_token_accuracy": 0.6165472984313964,
      "num_tokens": 33370088.0,
      "step": 360
    },
    {
      "entropy": 1.9140625,
      "epoch": 0.05056717233839005,
      "grad_norm": 0.3896144984837952,
      "learning_rate": 4.193181818181819e-06,
      "loss": 1.9532,
      "mean_token_accuracy": 0.615005624294281,
      "num_tokens": 34301456.0,
      "step": 370
    },
    {
      "entropy": 1.896875,
      "epoch": 0.05193385267186005,
      "grad_norm": 0.28872306923393026,
      "learning_rate": 4.306818181818182e-06,
      "loss": 1.9389,
      "mean_token_accuracy": 0.6173099458217621,
      "num_tokens": 35207001.0,
      "step": 380
    },
    {
      "entropy": 1.83046875,
      "epoch": 0.05330053300533005,
      "grad_norm": 0.2830874088796496,
      "learning_rate": 4.420454545454546e-06,
      "loss": 1.8849,
      "mean_token_accuracy": 0.625219315290451,
      "num_tokens": 36167357.0,
      "step": 390
    },
    {
      "entropy": 1.8390625,
      "epoch": 0.05466721333880006,
      "grad_norm": 0.3587109218474515,
      "learning_rate": 4.53409090909091e-06,
      "loss": 1.858,
      "mean_token_accuracy": 0.6297855079174042,
      "num_tokens": 37057200.0,
      "step": 400
    },
    {
      "entropy": 1.92109375,
      "epoch": 0.05603389367227005,
      "grad_norm": 0.3886590795841911,
      "learning_rate": 4.647727272727273e-06,
      "loss": 1.9441,
      "mean_token_accuracy": 0.6135135769844056,
      "num_tokens": 37960465.0,
      "step": 410
    },
    {
      "entropy": 1.87421875,
      "epoch": 0.05740057400574006,
      "grad_norm": 0.39927728002592405,
      "learning_rate": 4.761363636363637e-06,
      "loss": 1.9204,
      "mean_token_accuracy": 0.6204217135906219,
      "num_tokens": 38906392.0,
      "step": 420
    },
    {
      "entropy": 1.8453125,
      "epoch": 0.05876725433921006,
      "grad_norm": 0.5409531977142042,
      "learning_rate": 4.875e-06,
      "loss": 1.8898,
      "mean_token_accuracy": 0.624541437625885,
      "num_tokens": 39848884.0,
      "step": 430
    },
    {
      "entropy": 1.890625,
      "epoch": 0.06013393467268006,
      "grad_norm": 0.2719793982526499,
      "learning_rate": 4.988636363636364e-06,
      "loss": 1.9368,
      "mean_token_accuracy": 0.6182913780212402,
      "num_tokens": 40804802.0,
      "step": 440
    },
    {
      "entropy": 1.896875,
      "epoch": 0.06150061500615006,
      "grad_norm": 0.23782415620281597,
      "learning_rate": 4.996829646329435e-06,
      "loss": 1.9344,
      "mean_token_accuracy": 0.6166102349758148,
      "num_tokens": 41729046.0,
      "step": 450
    },
    {
      "entropy": 1.9171875,
      "epoch": 0.06286729533962006,
      "grad_norm": 0.35453457972525304,
      "learning_rate": 4.993307031139919e-06,
      "loss": 1.9471,
      "mean_token_accuracy": 0.6167487621307373,
      "num_tokens": 42678976.0,
      "step": 460
    },
    {
      "entropy": 1.89765625,
      "epoch": 0.06423397567309007,
      "grad_norm": 0.21306205760147529,
      "learning_rate": 4.989784415950402e-06,
      "loss": 1.9324,
      "mean_token_accuracy": 0.6162386178970337,
      "num_tokens": 43616324.0,
      "step": 470
    },
    {
      "entropy": 1.98828125,
      "epoch": 0.06560065600656007,
      "grad_norm": 0.3202548263484468,
      "learning_rate": 4.986261800760885e-06,
      "loss": 2.0174,
      "mean_token_accuracy": 0.6063738107681275,
      "num_tokens": 44537209.0,
      "step": 480
    },
    {
      "entropy": 1.85234375,
      "epoch": 0.06696733634003006,
      "grad_norm": 0.4430381700595253,
      "learning_rate": 4.9827391855713685e-06,
      "loss": 1.8725,
      "mean_token_accuracy": 0.6288193941116333,
      "num_tokens": 45493973.0,
      "step": 490
    },
    {
      "entropy": 1.87265625,
      "epoch": 0.06833401667350007,
      "grad_norm": 0.3070897011835966,
      "learning_rate": 4.979216570381852e-06,
      "loss": 1.8728,
      "mean_token_accuracy": 0.6225479960441589,
      "num_tokens": 46418909.0,
      "step": 500
    },
{
|
|
"entropy": 1.8390625,
|
|
"epoch": 0.06970069700697007,
|
|
"grad_norm": 0.22770644428121758,
|
|
"learning_rate": 4.975693955192335e-06,
|
|
"loss": 1.874,
|
|
"mean_token_accuracy": 0.6211739480495453,
|
|
"num_tokens": 47332522.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 1.88828125,
|
|
"epoch": 0.07106737734044007,
|
|
"grad_norm": 0.3553081897625303,
|
|
"learning_rate": 4.972171340002819e-06,
|
|
"loss": 1.903,
|
|
"mean_token_accuracy": 0.6225685954093934,
|
|
"num_tokens": 48296148.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 1.8859375,
|
|
"epoch": 0.07243405767391008,
|
|
"grad_norm": 0.2975689782746442,
|
|
"learning_rate": 4.968648724813302e-06,
|
|
"loss": 1.9015,
|
|
"mean_token_accuracy": 0.6212416887283325,
|
|
"num_tokens": 49218879.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 1.81640625,
|
|
"epoch": 0.07380073800738007,
|
|
"grad_norm": 0.3366676752208685,
|
|
"learning_rate": 4.965126109623785e-06,
|
|
"loss": 1.8309,
|
|
"mean_token_accuracy": 0.6296063721179962,
|
|
"num_tokens": 50125266.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 1.878125,
|
|
"epoch": 0.07516741834085007,
|
|
"grad_norm": 0.22287273668792668,
|
|
"learning_rate": 4.961603494434268e-06,
|
|
"loss": 1.918,
|
|
"mean_token_accuracy": 0.6202747404575348,
|
|
"num_tokens": 51051066.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 1.83828125,
|
|
"epoch": 0.07653409867432008,
|
|
"grad_norm": 0.40326686181361543,
|
|
"learning_rate": 4.958080879244752e-06,
|
|
"loss": 1.8554,
|
|
"mean_token_accuracy": 0.6274335145950317,
|
|
"num_tokens": 51997532.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 1.83671875,
|
|
"epoch": 0.07790077900779008,
|
|
"grad_norm": 0.3440465578138394,
|
|
"learning_rate": 4.954558264055234e-06,
|
|
"loss": 1.8576,
|
|
"mean_token_accuracy": 0.6271359920501709,
|
|
"num_tokens": 52882149.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 1.82734375,
|
|
"epoch": 0.07926745934126007,
|
|
"grad_norm": 0.37179283609736735,
|
|
"learning_rate": 4.951035648865719e-06,
|
|
"loss": 1.8339,
|
|
"mean_token_accuracy": 0.632761150598526,
|
|
"num_tokens": 53778993.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 1.83203125,
|
|
"epoch": 0.08063413967473008,
|
|
"grad_norm": 0.29403053317949407,
|
|
"learning_rate": 4.9475130336762015e-06,
|
|
"loss": 1.851,
|
|
"mean_token_accuracy": 0.6265239477157593,
|
|
"num_tokens": 54682786.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 1.90078125,
|
|
"epoch": 0.08200082000820008,
|
|
"grad_norm": 0.24590959957108358,
|
|
"learning_rate": 4.943990418486685e-06,
|
|
"loss": 1.9187,
|
|
"mean_token_accuracy": 0.6165609002113343,
|
|
"num_tokens": 55647271.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 1.85078125,
|
|
"epoch": 0.08336750034167008,
|
|
"grad_norm": 0.40163855107067725,
|
|
"learning_rate": 4.9404678032971685e-06,
|
|
"loss": 1.8625,
|
|
"mean_token_accuracy": 0.6295888662338257,
|
|
"num_tokens": 56553214.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 1.80703125,
|
|
"epoch": 0.08473418067514009,
|
|
"grad_norm": 0.3462791819613935,
|
|
"learning_rate": 4.936945188107651e-06,
|
|
"loss": 1.8309,
|
|
"mean_token_accuracy": 0.633828467130661,
|
|
"num_tokens": 57496993.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 1.73828125,
|
|
"epoch": 0.08610086100861009,
|
|
"grad_norm": 0.2430607093929717,
|
|
"learning_rate": 4.933422572918135e-06,
|
|
"loss": 1.7472,
|
|
"mean_token_accuracy": 0.6450510919094086,
|
|
"num_tokens": 58398547.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 1.82109375,
|
|
"epoch": 0.08746754134208008,
|
|
"grad_norm": 0.2874574857690488,
|
|
"learning_rate": 4.929899957728618e-06,
|
|
"loss": 1.8344,
|
|
"mean_token_accuracy": 0.6350089013576508,
|
|
"num_tokens": 59340886.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 1.859375,
|
|
"epoch": 0.08883422167555009,
|
|
"grad_norm": 0.2680479496921199,
|
|
"learning_rate": 4.926377342539102e-06,
|
|
"loss": 1.8553,
|
|
"mean_token_accuracy": 0.6298904240131378,
|
|
"num_tokens": 60257912.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 1.85859375,
|
|
"epoch": 0.09020090200902009,
|
|
"grad_norm": 0.29613063174389415,
|
|
"learning_rate": 4.922854727349585e-06,
|
|
"loss": 1.8685,
|
|
"mean_token_accuracy": 0.6255346000194549,
|
|
"num_tokens": 61186876.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 1.82265625,
|
|
"epoch": 0.09156758234249009,
|
|
"grad_norm": 0.18483287626160677,
|
|
"learning_rate": 4.919332112160068e-06,
|
|
"loss": 1.8349,
|
|
"mean_token_accuracy": 0.6299048840999604,
|
|
"num_tokens": 62144445.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 1.82578125,
|
|
"epoch": 0.0929342626759601,
|
|
"grad_norm": 0.24205537254013715,
|
|
"learning_rate": 4.915809496970551e-06,
|
|
"loss": 1.8268,
|
|
"mean_token_accuracy": 0.6305512726306916,
|
|
"num_tokens": 63095715.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 1.91171875,
|
|
"epoch": 0.0943009430094301,
|
|
"grad_norm": 0.2501335783503176,
|
|
"learning_rate": 4.912286881781035e-06,
|
|
"loss": 1.9412,
|
|
"mean_token_accuracy": 0.6134482502937317,
|
|
"num_tokens": 64009267.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 1.809375,
|
|
"epoch": 0.09566762334290009,
|
|
"grad_norm": 0.4559395986579277,
|
|
"learning_rate": 4.908764266591518e-06,
|
|
"loss": 1.8157,
|
|
"mean_token_accuracy": 0.6337453961372376,
|
|
"num_tokens": 64920596.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 1.82578125,
|
|
"epoch": 0.0970343036763701,
|
|
"grad_norm": 0.4054517743321767,
|
|
"learning_rate": 4.9052416514020015e-06,
|
|
"loss": 1.8256,
|
|
"mean_token_accuracy": 0.6317720651626587,
|
|
"num_tokens": 65831798.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 1.7875,
|
|
"epoch": 0.0984009840098401,
|
|
"grad_norm": 0.2547471198995455,
|
|
"learning_rate": 4.901719036212484e-06,
|
|
"loss": 1.8129,
|
|
"mean_token_accuracy": 0.6332606256008149,
|
|
"num_tokens": 66798343.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 1.77109375,
|
|
"epoch": 0.0997676643433101,
|
|
"grad_norm": 0.3774998394038914,
|
|
"learning_rate": 4.898196421022968e-06,
|
|
"loss": 1.7873,
|
|
"mean_token_accuracy": 0.6375937163829803,
|
|
"num_tokens": 67730096.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 1.79375,
|
|
"epoch": 0.1011343446767801,
|
|
"grad_norm": 0.42264389315744116,
|
|
"learning_rate": 4.894673805833451e-06,
|
|
"loss": 1.7773,
|
|
"mean_token_accuracy": 0.6393559336662292,
|
|
"num_tokens": 68583530.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 1.84609375,
|
|
"epoch": 0.1025010250102501,
|
|
"grad_norm": 0.3142587304537084,
|
|
"learning_rate": 4.891151190643935e-06,
|
|
"loss": 1.8626,
|
|
"mean_token_accuracy": 0.6302407264709473,
|
|
"num_tokens": 69508351.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 1.82734375,
|
|
"epoch": 0.1038677053437201,
|
|
"grad_norm": 0.294605516363874,
|
|
"learning_rate": 4.8876285754544175e-06,
|
|
"loss": 1.833,
|
|
"mean_token_accuracy": 0.6312712132930756,
|
|
"num_tokens": 70460853.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"entropy": 1.86640625,
|
|
"epoch": 0.10523438567719011,
|
|
"grad_norm": 0.2349400075475341,
|
|
"learning_rate": 4.884105960264901e-06,
|
|
"loss": 1.8852,
|
|
"mean_token_accuracy": 0.6229366779327392,
|
|
"num_tokens": 71360689.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"entropy": 1.7640625,
|
|
"epoch": 0.1066010660106601,
|
|
"grad_norm": 0.35535797923491436,
|
|
"learning_rate": 4.880583345075385e-06,
|
|
"loss": 1.7665,
|
|
"mean_token_accuracy": 0.6392882764339447,
|
|
"num_tokens": 72284444.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"entropy": 1.77578125,
|
|
"epoch": 0.1079677463441301,
|
|
"grad_norm": 0.3078577535566996,
|
|
"learning_rate": 4.877060729885867e-06,
|
|
"loss": 1.7798,
|
|
"mean_token_accuracy": 0.6377495408058167,
|
|
"num_tokens": 73207379.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"entropy": 1.8,
|
|
"epoch": 0.10933442667760011,
|
|
"grad_norm": 0.3056636980556509,
|
|
"learning_rate": 4.873538114696351e-06,
|
|
"loss": 1.8094,
|
|
"mean_token_accuracy": 0.6329820156097412,
|
|
"num_tokens": 74116984.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 1.76875,
|
|
"epoch": 0.11070110701107011,
|
|
"grad_norm": 0.23125816196159055,
|
|
"learning_rate": 4.870015499506834e-06,
|
|
"loss": 1.7859,
|
|
"mean_token_accuracy": 0.6416086554527283,
|
|
"num_tokens": 75041811.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"entropy": 1.8296875,
|
|
"epoch": 0.1120677873445401,
|
|
"grad_norm": 0.4249160078505264,
|
|
"learning_rate": 4.866492884317318e-06,
|
|
"loss": 1.8411,
|
|
"mean_token_accuracy": 0.6290619671344757,
|
|
"num_tokens": 75967173.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"entropy": 1.7765625,
|
|
"epoch": 0.11343446767801012,
|
|
"grad_norm": 0.22917265010438512,
|
|
"learning_rate": 4.862970269127801e-06,
|
|
"loss": 1.7804,
|
|
"mean_token_accuracy": 0.6413192212581634,
|
|
"num_tokens": 76877682.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"entropy": 1.83515625,
|
|
"epoch": 0.11480114801148011,
|
|
"grad_norm": 0.21597164026294416,
|
|
"learning_rate": 4.859447653938284e-06,
|
|
"loss": 1.8168,
|
|
"mean_token_accuracy": 0.6313130855560303,
|
|
"num_tokens": 77772740.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"entropy": 1.853125,
|
|
"epoch": 0.11616782834495011,
|
|
"grad_norm": 0.2240671083282642,
|
|
"learning_rate": 4.855925038748768e-06,
|
|
"loss": 1.8798,
|
|
"mean_token_accuracy": 0.625989556312561,
|
|
"num_tokens": 78765799.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 1.840625,
|
|
"epoch": 0.11753450867842012,
|
|
"grad_norm": 0.3598083242269916,
|
|
"learning_rate": 4.852402423559251e-06,
|
|
"loss": 1.8561,
|
|
"mean_token_accuracy": 0.6298409700393677,
|
|
"num_tokens": 79731400.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"entropy": 1.84609375,
|
|
"epoch": 0.11890118901189012,
|
|
"grad_norm": 0.34758753288945565,
|
|
"learning_rate": 4.848879808369734e-06,
|
|
"loss": 1.874,
|
|
"mean_token_accuracy": 0.6278775632381439,
|
|
"num_tokens": 80641424.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"entropy": 1.76171875,
|
|
"epoch": 0.12026786934536011,
|
|
"grad_norm": 0.34655115038989115,
|
|
"learning_rate": 4.8453571931802175e-06,
|
|
"loss": 1.7491,
|
|
"mean_token_accuracy": 0.6433783173561096,
|
|
"num_tokens": 81585552.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"entropy": 1.72578125,
|
|
"epoch": 0.12163454967883013,
|
|
"grad_norm": 0.2670339318033455,
|
|
"learning_rate": 4.8418345779907e-06,
|
|
"loss": 1.7303,
|
|
"mean_token_accuracy": 0.6473717451095581,
|
|
"num_tokens": 82486405.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"entropy": 1.859375,
|
|
"epoch": 0.12300123001230012,
|
|
"grad_norm": 0.3707369288867322,
|
|
"learning_rate": 4.838311962801184e-06,
|
|
"loss": 1.8772,
|
|
"mean_token_accuracy": 0.625070333480835,
|
|
"num_tokens": 83433707.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"entropy": 1.790625,
|
|
"epoch": 0.12436791034577012,
|
|
"grad_norm": 0.22758122901293334,
|
|
"learning_rate": 4.834789347611667e-06,
|
|
"loss": 1.8007,
|
|
"mean_token_accuracy": 0.6356785476207734,
|
|
"num_tokens": 84309931.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"entropy": 1.8609375,
|
|
"epoch": 0.12573459067924012,
|
|
"grad_norm": 0.36729137942639845,
|
|
"learning_rate": 4.831266732422151e-06,
|
|
"loss": 1.8546,
|
|
"mean_token_accuracy": 0.628924036026001,
|
|
"num_tokens": 85189803.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"entropy": 1.70390625,
|
|
"epoch": 0.12710127101271013,
|
|
"grad_norm": 0.21245382768529134,
|
|
"learning_rate": 4.827744117232634e-06,
|
|
"loss": 1.7063,
|
|
"mean_token_accuracy": 0.6511714458465576,
|
|
"num_tokens": 86090974.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"entropy": 1.7140625,
|
|
"epoch": 0.12846795134618014,
|
|
"grad_norm": 0.23489321219790837,
|
|
"learning_rate": 4.824221502043117e-06,
|
|
"loss": 1.713,
|
|
"mean_token_accuracy": 0.6465020656585694,
|
|
"num_tokens": 86994446.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"entropy": 1.840625,
|
|
"epoch": 0.12983463167965012,
|
|
"grad_norm": 0.24517136771389986,
|
|
"learning_rate": 4.820698886853601e-06,
|
|
"loss": 1.836,
|
|
"mean_token_accuracy": 0.6265623092651367,
|
|
"num_tokens": 87975415.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"entropy": 1.83125,
|
|
"epoch": 0.13120131201312013,
|
|
"grad_norm": 0.26677779292944553,
|
|
"learning_rate": 4.817176271664084e-06,
|
|
"loss": 1.8388,
|
|
"mean_token_accuracy": 0.6264655888080597,
|
|
"num_tokens": 88914934.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"entropy": 1.75234375,
|
|
"epoch": 0.13256799234659014,
|
|
"grad_norm": 0.2844170571417803,
|
|
"learning_rate": 4.813653656474567e-06,
|
|
"loss": 1.7579,
|
|
"mean_token_accuracy": 0.6403306782245636,
|
|
"num_tokens": 89793764.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"entropy": 1.8265625,
|
|
"epoch": 0.13393467268006012,
|
|
"grad_norm": 0.21502885351959408,
|
|
"learning_rate": 4.8101310412850505e-06,
|
|
"loss": 1.8328,
|
|
"mean_token_accuracy": 0.6281405746936798,
|
|
"num_tokens": 90705067.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"entropy": 1.79921875,
|
|
"epoch": 0.13530135301353013,
|
|
"grad_norm": 0.21334504322922784,
|
|
"learning_rate": 4.806608426095534e-06,
|
|
"loss": 1.8213,
|
|
"mean_token_accuracy": 0.6344258308410644,
|
|
"num_tokens": 91668074.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"entropy": 1.76484375,
|
|
"epoch": 0.13666803334700015,
|
|
"grad_norm": 0.2744605885891502,
|
|
"learning_rate": 4.803085810906017e-06,
|
|
"loss": 1.7636,
|
|
"mean_token_accuracy": 0.640391081571579,
|
|
"num_tokens": 92592477.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"entropy": 1.8484375,
|
|
"epoch": 0.13803471368047013,
|
|
"grad_norm": 0.23524301796234934,
|
|
"learning_rate": 4.7995631957165e-06,
|
|
"loss": 1.8503,
|
|
"mean_token_accuracy": 0.624327790737152,
|
|
"num_tokens": 93544370.0,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"entropy": 1.74453125,
|
|
"epoch": 0.13940139401394014,
|
|
"grad_norm": 0.2009120242416655,
|
|
"learning_rate": 4.796040580526984e-06,
|
|
"loss": 1.7512,
|
|
"mean_token_accuracy": 0.645144248008728,
|
|
"num_tokens": 94471152.0,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"entropy": 1.77421875,
|
|
"epoch": 0.14076807434741015,
|
|
"grad_norm": 0.19701058262655025,
|
|
"learning_rate": 4.792517965337467e-06,
|
|
"loss": 1.792,
|
|
"mean_token_accuracy": 0.6378251552581787,
|
|
"num_tokens": 95355196.0,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"entropy": 1.796875,
|
|
"epoch": 0.14213475468088013,
|
|
"grad_norm": 0.3580688875958106,
|
|
"learning_rate": 4.78899535014795e-06,
|
|
"loss": 1.815,
|
|
"mean_token_accuracy": 0.6352297246456147,
|
|
"num_tokens": 96280630.0,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"entropy": 1.77890625,
|
|
"epoch": 0.14350143501435014,
|
|
"grad_norm": 0.21537920826172227,
|
|
"learning_rate": 4.785472734958434e-06,
|
|
"loss": 1.789,
|
|
"mean_token_accuracy": 0.6377599775791168,
|
|
"num_tokens": 97231696.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"entropy": 1.82421875,
|
|
"epoch": 0.14486811534782015,
|
|
"grad_norm": 0.3613218460641949,
|
|
"learning_rate": 4.781950119768916e-06,
|
|
"loss": 1.8332,
|
|
"mean_token_accuracy": 0.631199163198471,
|
|
"num_tokens": 98167947.0,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"entropy": 1.7171875,
|
|
"epoch": 0.14623479568129014,
|
|
"grad_norm": 0.2358868160634767,
|
|
"learning_rate": 4.778427504579401e-06,
|
|
"loss": 1.6932,
|
|
"mean_token_accuracy": 0.6512220025062561,
|
|
"num_tokens": 99103204.0,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"entropy": 1.74140625,
|
|
"epoch": 0.14760147601476015,
|
|
"grad_norm": 0.32875908766158685,
|
|
"learning_rate": 4.774904889389883e-06,
|
|
"loss": 1.7454,
|
|
"mean_token_accuracy": 0.6448809325695037,
|
|
"num_tokens": 100061105.0,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"entropy": 1.7578125,
|
|
"epoch": 0.14896815634823016,
|
|
"grad_norm": 0.33262705940337595,
|
|
"learning_rate": 4.771382274200367e-06,
|
|
"loss": 1.7728,
|
|
"mean_token_accuracy": 0.6408154785633087,
|
|
"num_tokens": 100990812.0,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"entropy": 1.821875,
|
|
"epoch": 0.15033483668170014,
|
|
"grad_norm": 0.24228023296123538,
|
|
"learning_rate": 4.76785965901085e-06,
|
|
"loss": 1.8205,
|
|
"mean_token_accuracy": 0.6309961318969727,
|
|
"num_tokens": 101950835.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"entropy": 1.89609375,
|
|
"epoch": 0.15170151701517015,
|
|
"grad_norm": 0.25897064494243804,
|
|
"learning_rate": 4.764337043821333e-06,
|
|
"loss": 1.9105,
|
|
"mean_token_accuracy": 0.6197786509990693,
|
|
"num_tokens": 102863061.0,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"entropy": 1.79765625,
|
|
"epoch": 0.15306819734864016,
|
|
"grad_norm": 0.2578406014708779,
|
|
"learning_rate": 4.760814428631817e-06,
|
|
"loss": 1.7966,
|
|
"mean_token_accuracy": 0.6341909527778625,
|
|
"num_tokens": 103778383.0,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"entropy": 1.771875,
|
|
"epoch": 0.15443487768211014,
|
|
"grad_norm": 0.27604435092322455,
|
|
"learning_rate": 4.7572918134423e-06,
|
|
"loss": 1.7592,
|
|
"mean_token_accuracy": 0.6396775722503663,
|
|
"num_tokens": 104663753.0,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"entropy": 1.74296875,
|
|
"epoch": 0.15580155801558015,
|
|
"grad_norm": 0.2650506845707573,
|
|
"learning_rate": 4.753769198252783e-06,
|
|
"loss": 1.7553,
|
|
"mean_token_accuracy": 0.6432477116584778,
|
|
"num_tokens": 105592627.0,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"entropy": 1.8296875,
|
|
"epoch": 0.15716823834905017,
|
|
"grad_norm": 0.3171711337969201,
|
|
"learning_rate": 4.7502465830632665e-06,
|
|
"loss": 1.8413,
|
|
"mean_token_accuracy": 0.6281008064746857,
|
|
"num_tokens": 106559734.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"entropy": 1.71015625,
|
|
"epoch": 0.15853491868252015,
|
|
"grad_norm": 0.305444352409687,
|
|
"learning_rate": 4.74672396787375e-06,
|
|
"loss": 1.7087,
|
|
"mean_token_accuracy": 0.6507192611694336,
|
|
"num_tokens": 107497554.0,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"entropy": 1.79375,
|
|
"epoch": 0.15990159901599016,
|
|
"grad_norm": 0.2887068284824132,
|
|
"learning_rate": 4.743201352684233e-06,
|
|
"loss": 1.7992,
|
|
"mean_token_accuracy": 0.6391997754573822,
|
|
"num_tokens": 108435926.0,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"entropy": 1.75546875,
|
|
"epoch": 0.16126827934946017,
|
|
"grad_norm": 0.28289826695011855,
|
|
"learning_rate": 4.739678737494716e-06,
|
|
"loss": 1.7639,
|
|
"mean_token_accuracy": 0.6442327499389648,
|
|
"num_tokens": 109342023.0,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"entropy": 1.78984375,
|
|
"epoch": 0.16263495968293015,
|
|
"grad_norm": 0.34007229816632956,
|
|
"learning_rate": 4.7361561223052e-06,
|
|
"loss": 1.7977,
|
|
"mean_token_accuracy": 0.6322007477283478,
|
|
"num_tokens": 110258123.0,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"entropy": 1.77265625,
|
|
"epoch": 0.16400164001640016,
|
|
"grad_norm": 0.28646893178592686,
|
|
"learning_rate": 4.7326335071156834e-06,
|
|
"loss": 1.7905,
|
|
"mean_token_accuracy": 0.6407837986946106,
|
|
"num_tokens": 111168171.0,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"entropy": 1.7765625,
|
|
"epoch": 0.16536832034987017,
|
|
"grad_norm": 0.4064241056746564,
|
|
"learning_rate": 4.729110891926166e-06,
|
|
"loss": 1.7907,
|
|
"mean_token_accuracy": 0.6367282748222352,
|
|
"num_tokens": 112116903.0,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"entropy": 1.74296875,
|
|
"epoch": 0.16673500068334016,
|
|
"grad_norm": 0.30148408583658354,
|
|
"learning_rate": 4.72558827673665e-06,
|
|
"loss": 1.7444,
|
|
"mean_token_accuracy": 0.6436694324016571,
|
|
"num_tokens": 112999695.0,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"entropy": 1.7296875,
|
|
"epoch": 0.16810168101681017,
|
|
"grad_norm": 0.29420272497435795,
|
|
"learning_rate": 4.722065661547132e-06,
|
|
"loss": 1.7585,
|
|
"mean_token_accuracy": 0.6432316660881042,
|
|
"num_tokens": 113933796.0,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"entropy": 1.81796875,
|
|
"epoch": 0.16946836135028018,
|
|
"grad_norm": 0.25759937074500205,
|
|
"learning_rate": 4.718543046357617e-06,
|
|
"loss": 1.828,
|
|
"mean_token_accuracy": 0.6315177202224731,
|
|
"num_tokens": 114897277.0,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"entropy": 1.72265625,
|
|
"epoch": 0.17083504168375016,
|
|
"grad_norm": 0.3472875047424681,
|
|
"learning_rate": 4.7150204311680995e-06,
|
|
"loss": 1.7377,
|
|
"mean_token_accuracy": 0.6429005861282349,
|
|
"num_tokens": 115771970.0,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"entropy": 1.79375,
|
|
"epoch": 0.17220172201722017,
|
|
"grad_norm": 0.2062789102422931,
|
|
"learning_rate": 4.711497815978583e-06,
|
|
"loss": 1.8036,
|
|
"mean_token_accuracy": 0.635973310470581,
|
|
"num_tokens": 116695464.0,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"entropy": 1.809375,
|
|
"epoch": 0.17356840235069018,
|
|
"grad_norm": 0.37309137076756616,
|
|
"learning_rate": 4.707975200789066e-06,
|
|
"loss": 1.8444,
|
|
"mean_token_accuracy": 0.6279125273227691,
|
|
"num_tokens": 117662731.0,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"entropy": 1.7765625,
|
|
"epoch": 0.17493508268416016,
|
|
"grad_norm": 0.34382592822909275,
|
|
"learning_rate": 4.704452585599549e-06,
|
|
"loss": 1.7884,
|
|
"mean_token_accuracy": 0.63452068567276,
|
|
"num_tokens": 118580485.0,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"entropy": 1.7515625,
|
|
"epoch": 0.17630176301763018,
|
|
"grad_norm": 0.3012587492537284,
|
|
"learning_rate": 4.700929970410033e-06,
|
|
"loss": 1.7529,
|
|
"mean_token_accuracy": 0.6437574744224548,
|
|
"num_tokens": 119504310.0,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"entropy": 1.7828125,
|
|
"epoch": 0.17766844335110019,
|
|
"grad_norm": 0.1577234342565021,
|
|
"learning_rate": 4.697407355220516e-06,
|
|
"loss": 1.7717,
|
|
"mean_token_accuracy": 0.6420591473579407,
|
|
"num_tokens": 120461635.0,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"entropy": 1.76953125,
|
|
"epoch": 0.17903512368457017,
|
|
"grad_norm": 0.18995451187988488,
|
|
"learning_rate": 4.693884740030999e-06,
|
|
"loss": 1.761,
|
|
"mean_token_accuracy": 0.6390918970108033,
|
|
"num_tokens": 121333629.0,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"entropy": 1.80390625,
|
|
"epoch": 0.18040180401804018,
|
|
"grad_norm": 0.2240179021591038,
|
|
"learning_rate": 4.690362124841483e-06,
|
|
"loss": 1.8044,
|
|
"mean_token_accuracy": 0.6350222051143646,
|
|
"num_tokens": 122256619.0,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"entropy": 1.7890625,
|
|
"epoch": 0.1817684843515102,
|
|
"grad_norm": 0.18327286851588995,
|
|
"learning_rate": 4.686839509651966e-06,
|
|
"loss": 1.8064,
|
|
"mean_token_accuracy": 0.6337711751461029,
|
|
"num_tokens": 123180430.0,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"entropy": 1.79921875,
|
|
"epoch": 0.18313516468498017,
|
|
"grad_norm": 0.1957717595645091,
|
|
"learning_rate": 4.683316894462449e-06,
|
|
"loss": 1.8143,
|
|
"mean_token_accuracy": 0.6329705774784088,
|
|
"num_tokens": 124122310.0,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"entropy": 1.7328125,
|
|
"epoch": 0.18450184501845018,
|
|
"grad_norm": 0.23376105571843567,
|
|
"learning_rate": 4.679794279272933e-06,
|
|
"loss": 1.7538,
|
|
"mean_token_accuracy": 0.6412849307060242,
|
|
"num_tokens": 125048931.0,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"entropy": 1.75703125,
|
|
"epoch": 0.1858685253519202,
|
|
"grad_norm": 0.20935371712300313,
|
|
"learning_rate": 4.676271664083416e-06,
|
|
"loss": 1.7767,
|
|
"mean_token_accuracy": 0.6388256549835205,
|
|
"num_tokens": 126002806.0,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"entropy": 1.884375,
|
|
"epoch": 0.18723520568539018,
|
|
"grad_norm": 0.5301253342140009,
|
|
"learning_rate": 4.6727490488938995e-06,
|
|
"loss": 1.8745,
|
|
"mean_token_accuracy": 0.6256722331047058,
|
|
"num_tokens": 126878845.0,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"entropy": 1.765625,
|
|
"epoch": 0.1886018860188602,
|
|
"grad_norm": 0.3225912323362885,
|
|
"learning_rate": 4.669226433704382e-06,
|
|
"loss": 1.7597,
|
|
"mean_token_accuracy": 0.6400802493095398,
|
|
"num_tokens": 127801062.0,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"entropy": 1.7390625,
|
|
"epoch": 0.1899685663523302,
|
|
"grad_norm": 0.27630048809212004,
|
|
"learning_rate": 4.665703818514866e-06,
|
|
"loss": 1.7505,
|
|
"mean_token_accuracy": 0.6440927445888519,
|
|
"num_tokens": 128701534.0,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"entropy": 1.740625,
|
|
"epoch": 0.19133524668580018,
|
|
"grad_norm": 0.21961221291572336,
|
|
"learning_rate": 4.6621812033253484e-06,
|
|
"loss": 1.7547,
|
|
"mean_token_accuracy": 0.6426892518997193,
|
|
"num_tokens": 129623673.0,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"entropy": 1.62109375,
|
|
"epoch": 0.1927019270192702,
|
|
"grad_norm": 0.20605591829363124,
|
|
"learning_rate": 4.658658588135833e-06,
|
|
"loss": 1.6375,
|
|
"mean_token_accuracy": 0.6617919564247131,
|
|
"num_tokens": 130545447.0,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"entropy": 1.7109375,
|
|
"epoch": 0.1940686073527402,
|
|
"grad_norm": 0.38254793074274823,
|
|
"learning_rate": 4.6551359729463155e-06,
|
|
"loss": 1.6886,
|
|
"mean_token_accuracy": 0.6519088804721832,
|
|
"num_tokens": 131531540.0,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"entropy": 1.8265625,
|
|
"epoch": 0.19543528768621019,
|
|
"grad_norm": 0.192190652039793,
|
|
"learning_rate": 4.651613357756799e-06,
|
|
"loss": 1.8409,
|
|
"mean_token_accuracy": 0.6262837171554565,
|
|
"num_tokens": 132461259.0,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"entropy": 1.78671875,
|
|
"epoch": 0.1968019680196802,
|
|
"grad_norm": 0.38866723537677667,
|
|
"learning_rate": 4.648090742567283e-06,
|
|
"loss": 1.8152,
|
|
"mean_token_accuracy": 0.6346620202064515,
|
|
"num_tokens": 133396762.0,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"entropy": 1.79765625,
|
|
"epoch": 0.1981686483531502,
|
|
"grad_norm": 0.21527291412135238,
|
|
"learning_rate": 4.644568127377765e-06,
|
|
"loss": 1.814,
|
|
"mean_token_accuracy": 0.6335269153118134,
|
|
"num_tokens": 134351609.0,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"entropy": 1.80390625,
|
|
"epoch": 0.1995353286866202,
|
|
"grad_norm": 0.24398600203783388,
|
|
"learning_rate": 4.641045512188249e-06,
|
|
"loss": 1.8212,
|
|
"mean_token_accuracy": 0.6338249921798706,
|
|
"num_tokens": 135284441.0,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"entropy": 1.78203125,
|
|
"epoch": 0.2009020090200902,
|
|
"grad_norm": 0.22653941689334312,
|
|
"learning_rate": 4.6375228969987324e-06,
|
|
"loss": 1.7933,
|
|
"mean_token_accuracy": 0.6370425701141358,
|
|
"num_tokens": 136227270.0,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"entropy": 1.79453125,
|
|
"epoch": 0.2022686893535602,
|
|
"grad_norm": 0.27415532830965855,
|
|
"learning_rate": 4.634000281809216e-06,
|
|
"loss": 1.7973,
|
|
"mean_token_accuracy": 0.6371819317340851,
|
|
"num_tokens": 137171675.0,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"entropy": 1.75703125,
|
|
"epoch": 0.2036353696870302,
|
|
"grad_norm": 0.31238019940673684,
|
|
"learning_rate": 4.630477666619699e-06,
|
|
"loss": 1.7663,
|
|
"mean_token_accuracy": 0.63936807513237,
|
|
"num_tokens": 138076571.0,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"entropy": 1.659375,
|
|
"epoch": 0.2050020500205002,
|
|
"grad_norm": 0.18039704265027046,
|
|
"learning_rate": 4.626955051430182e-06,
|
|
"loss": 1.6661,
|
|
"mean_token_accuracy": 0.6594658136367798,
|
|
"num_tokens": 139031942.0,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"entropy": 1.77578125,
|
|
"epoch": 0.20636873035397021,
|
|
"grad_norm": 0.21114207601872753,
|
|
"learning_rate": 4.623432436240665e-06,
|
|
"loss": 1.7789,
|
|
"mean_token_accuracy": 0.638205099105835,
|
|
"num_tokens": 139895417.0,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"entropy": 1.70546875,
|
|
"epoch": 0.2077354106874402,
|
|
"grad_norm": 0.23190377121317854,
|
|
"learning_rate": 4.619909821051149e-06,
|
|
"loss": 1.7049,
|
|
"mean_token_accuracy": 0.6487535178661347,
|
|
"num_tokens": 140756189.0,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"entropy": 1.77109375,
|
|
"epoch": 0.2091020910209102,
|
|
"grad_norm": 0.2071529936933706,
|
|
"learning_rate": 4.616387205861632e-06,
|
|
"loss": 1.7893,
|
|
"mean_token_accuracy": 0.637857848405838,
|
|
"num_tokens": 141671489.0,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"entropy": 1.7578125,
|
|
"epoch": 0.21046877135438022,
|
|
"grad_norm": 0.3018322856546776,
|
|
"learning_rate": 4.6128645906721156e-06,
|
|
"loss": 1.7618,
|
|
"mean_token_accuracy": 0.6426481187343598,
|
|
"num_tokens": 142551190.0,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"entropy": 1.8515625,
|
|
"epoch": 0.2118354516878502,
|
|
"grad_norm": 0.34370238631005434,
|
|
"learning_rate": 4.609341975482598e-06,
|
|
"loss": 1.8576,
|
|
"mean_token_accuracy": 0.6293798804283142,
|
|
"num_tokens": 143459522.0,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"entropy": 1.7625,
|
|
"epoch": 0.2132021320213202,
|
|
"grad_norm": 0.2757760116917972,
|
|
"learning_rate": 4.605819360293082e-06,
|
|
"loss": 1.7664,
|
|
"mean_token_accuracy": 0.6412102341651916,
|
|
"num_tokens": 144418753.0,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"entropy": 1.715625,
|
|
"epoch": 0.21456881235479022,
|
|
"grad_norm": 0.19986494812494868,
|
|
"learning_rate": 4.602296745103565e-06,
|
|
"loss": 1.7079,
|
|
"mean_token_accuracy": 0.6502198994159698,
|
|
"num_tokens": 145335382.0,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"entropy": 1.784375,
|
|
"epoch": 0.2159354926882602,
|
|
"grad_norm": 0.23374793882477024,
|
|
"learning_rate": 4.598774129914049e-06,
|
|
"loss": 1.8279,
|
|
"mean_token_accuracy": 0.6331926584243774,
|
|
"num_tokens": 146258358.0,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"entropy": 1.7578125,
|
|
"epoch": 0.21730217302173022,
|
|
"grad_norm": 0.21368204712738686,
|
|
"learning_rate": 4.595251514724532e-06,
|
|
"loss": 1.7538,
|
|
"mean_token_accuracy": 0.638098955154419,
|
|
"num_tokens": 147161819.0,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"entropy": 1.74453125,
|
|
"epoch": 0.21866885335520023,
|
|
"grad_norm": 0.2717970527924303,
|
|
"learning_rate": 4.591728899535015e-06,
|
|
"loss": 1.7483,
|
|
"mean_token_accuracy": 0.6428154408931732,
|
|
"num_tokens": 148082916.0,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"entropy": 1.7,
|
|
"epoch": 0.2200355336886702,
|
|
"grad_norm": 0.27125254203471844,
|
|
"learning_rate": 4.588206284345499e-06,
|
|
"loss": 1.7017,
|
|
"mean_token_accuracy": 0.6492776691913604,
|
|
"num_tokens": 149010170.0,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"entropy": 1.8734375,
|
|
"epoch": 0.22140221402214022,
|
|
"grad_norm": 0.2445376987882485,
|
|
"learning_rate": 4.584683669155981e-06,
|
|
"loss": 1.8888,
|
|
"mean_token_accuracy": 0.623607474565506,
|
|
"num_tokens": 149954423.0,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"entropy": 1.79140625,
|
|
"epoch": 0.22276889435561023,
|
|
"grad_norm": 0.34008469535973535,
|
|
"learning_rate": 4.581161053966465e-06,
|
|
"loss": 1.8099,
|
|
"mean_token_accuracy": 0.6367361009120941,
|
|
"num_tokens": 150936716.0,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"entropy": 1.83671875,
|
|
"epoch": 0.2241355746890802,
|
|
"grad_norm": 0.2021773037911737,
|
|
"learning_rate": 4.5776384387769485e-06,
|
|
"loss": 1.873,
|
|
"mean_token_accuracy": 0.6269274353981018,
|
|
"num_tokens": 151871482.0,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"entropy": 1.74453125,
|
|
"epoch": 0.22550225502255022,
|
|
"grad_norm": 0.260085653814476,
|
|
"learning_rate": 4.574115823587432e-06,
|
|
"loss": 1.7259,
|
|
"mean_token_accuracy": 0.6482770264148712,
|
|
"num_tokens": 152802213.0,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"entropy": 1.721875,
|
|
"epoch": 0.22686893535602023,
|
|
"grad_norm": 0.41871251650216545,
|
|
"learning_rate": 4.570593208397915e-06,
|
|
"loss": 1.7434,
|
|
"mean_token_accuracy": 0.6441694617271423,
|
|
"num_tokens": 153727748.0,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"entropy": 1.75859375,
|
|
"epoch": 0.22823561568949022,
|
|
"grad_norm": 0.352259179640089,
|
|
"learning_rate": 4.567070593208398e-06,
|
|
"loss": 1.7552,
|
|
"mean_token_accuracy": 0.643628454208374,
|
|
"num_tokens": 154631311.0,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"entropy": 1.73125,
|
|
"epoch": 0.22960229602296023,
|
|
"grad_norm": 0.2113616753515337,
|
|
"learning_rate": 4.563547978018882e-06,
|
|
"loss": 1.7405,
|
|
"mean_token_accuracy": 0.6419270813465119,
|
|
"num_tokens": 155568942.0,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"entropy": 1.72265625,
|
|
"epoch": 0.23096897635643024,
|
|
"grad_norm": 0.1942210844425174,
|
|
"learning_rate": 4.560025362829365e-06,
|
|
"loss": 1.7175,
|
|
"mean_token_accuracy": 0.6440073072910308,
|
|
"num_tokens": 156478989.0,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"entropy": 1.73828125,
|
|
"epoch": 0.23233565668990022,
|
|
"grad_norm": 0.24764181298041427,
|
|
"learning_rate": 4.556502747639848e-06,
|
|
"loss": 1.7607,
|
|
"mean_token_accuracy": 0.6424190998077393,
|
|
"num_tokens": 157416305.0,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"entropy": 1.83359375,
|
|
"epoch": 0.23370233702337023,
|
|
"grad_norm": 0.19377049706636176,
|
|
"learning_rate": 4.552980132450332e-06,
|
|
"loss": 1.8312,
|
|
"mean_token_accuracy": 0.6290262877941132,
|
|
"num_tokens": 158385743.0,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"entropy": 1.73046875,
|
|
"epoch": 0.23506901735684024,
|
|
"grad_norm": 0.3179514721470347,
|
|
"learning_rate": 4.549457517260814e-06,
|
|
"loss": 1.7364,
|
|
"mean_token_accuracy": 0.6450240135192871,
|
|
"num_tokens": 159331860.0,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"entropy": 1.73984375,
|
|
"epoch": 0.23643569769031023,
|
|
"grad_norm": 0.21464211227093127,
|
|
"learning_rate": 4.545934902071298e-06,
|
|
"loss": 1.7536,
|
|
"mean_token_accuracy": 0.6422495365142822,
|
|
"num_tokens": 160307495.0,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"entropy": 1.77734375,
|
|
"epoch": 0.23780237802378024,
|
|
"grad_norm": 0.2915288412980468,
|
|
"learning_rate": 4.5424122868817814e-06,
|
|
"loss": 1.7913,
|
|
"mean_token_accuracy": 0.6359599351882934,
|
|
"num_tokens": 161200029.0,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"entropy": 1.690625,
|
|
"epoch": 0.23916905835725025,
|
|
"grad_norm": 0.20293410977731624,
|
|
"learning_rate": 4.538889671692265e-06,
|
|
"loss": 1.7011,
|
|
"mean_token_accuracy": 0.6514890372753144,
|
|
"num_tokens": 162102209.0,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"entropy": 1.7890625,
|
|
"epoch": 0.24053573869072023,
|
|
"grad_norm": 0.21263706614013422,
|
|
"learning_rate": 4.535367056502748e-06,
|
|
"loss": 1.7999,
|
|
"mean_token_accuracy": 0.6377551794052124,
|
|
"num_tokens": 163056596.0,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"entropy": 1.77265625,
|
|
"epoch": 0.24190241902419024,
|
|
"grad_norm": 0.19343274752119743,
|
|
"learning_rate": 4.531844441313231e-06,
|
|
"loss": 1.7734,
|
|
"mean_token_accuracy": 0.6383991062641143,
|
|
"num_tokens": 163978075.0,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"entropy": 1.6953125,
|
|
"epoch": 0.24326909935766025,
|
|
"grad_norm": 0.27151957066592686,
|
|
"learning_rate": 4.528321826123715e-06,
|
|
"loss": 1.7139,
|
|
"mean_token_accuracy": 0.6492103636264801,
|
|
"num_tokens": 164881911.0,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"entropy": 1.76875,
|
|
"epoch": 0.24463577969113023,
|
|
"grad_norm": 0.20911274631900187,
|
|
"learning_rate": 4.524799210934198e-06,
|
|
"loss": 1.7727,
|
|
"mean_token_accuracy": 0.6398770570755005,
|
|
"num_tokens": 165810157.0,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"entropy": 1.74375,
|
|
"epoch": 0.24600246002460024,
|
|
"grad_norm": 0.1705835169679151,
|
|
"learning_rate": 4.521276595744681e-06,
|
|
"loss": 1.7327,
|
|
"mean_token_accuracy": 0.6458980083465576,
|
|
"num_tokens": 166703621.0,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"entropy": 1.75625,
|
|
"epoch": 0.24736914035807026,
|
|
"grad_norm": 0.2994369185206251,
|
|
"learning_rate": 4.5177539805551646e-06,
|
|
"loss": 1.7749,
|
|
"mean_token_accuracy": 0.6382310390472412,
|
|
"num_tokens": 167621301.0,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"entropy": 1.709375,
|
|
"epoch": 0.24873582069154024,
|
|
"grad_norm": 0.2579429458883581,
|
|
"learning_rate": 4.514231365365648e-06,
|
|
"loss": 1.7144,
|
|
"mean_token_accuracy": 0.6479700028896331,
|
|
"num_tokens": 168509348.0,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"entropy": 1.72890625,
|
|
"epoch": 0.25010250102501025,
|
|
"grad_norm": 0.19712853028071003,
|
|
"learning_rate": 4.510708750176131e-06,
|
|
"loss": 1.7297,
|
|
"mean_token_accuracy": 0.6444222390651703,
|
|
"num_tokens": 169409425.0,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"entropy": 1.7234375,
|
|
"epoch": 0.25146918135848023,
|
|
"grad_norm": 0.20683806910802435,
|
|
"learning_rate": 4.507186134986614e-06,
|
|
"loss": 1.7204,
|
|
"mean_token_accuracy": 0.6469195187091827,
|
|
"num_tokens": 170375045.0,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"entropy": 1.7765625,
|
|
"epoch": 0.25283586169195027,
|
|
"grad_norm": 0.24867686170009623,
|
|
"learning_rate": 4.503663519797098e-06,
|
|
"loss": 1.7854,
|
|
"mean_token_accuracy": 0.6361985862255096,
|
|
"num_tokens": 171300275.0,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"entropy": 1.73046875,
|
|
"epoch": 0.25420254202542025,
|
|
"grad_norm": 0.2394214019550836,
|
|
"learning_rate": 4.5001409046075814e-06,
|
|
"loss": 1.7308,
|
|
"mean_token_accuracy": 0.6484771728515625,
|
|
"num_tokens": 172222640.0,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"entropy": 1.73515625,
|
|
"epoch": 0.25556922235889024,
|
|
"grad_norm": 0.19103023849692785,
|
|
"learning_rate": 4.496618289418064e-06,
|
|
"loss": 1.7361,
|
|
"mean_token_accuracy": 0.6466102600097656,
|
|
"num_tokens": 173173214.0,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"entropy": 1.7265625,
|
|
"epoch": 0.2569359026923603,
|
|
"grad_norm": 0.20479679028699968,
|
|
"learning_rate": 4.493095674228548e-06,
|
|
"loss": 1.7334,
|
|
"mean_token_accuracy": 0.6474600315093995,
|
|
"num_tokens": 174072703.0,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"entropy": 1.7921875,
|
|
"epoch": 0.25830258302583026,
|
|
"grad_norm": 0.19521016329759583,
|
|
"learning_rate": 4.48957305903903e-06,
|
|
"loss": 1.7848,
|
|
"mean_token_accuracy": 0.635907793045044,
|
|
"num_tokens": 175009721.0,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"entropy": 1.721875,
|
|
"epoch": 0.25966926335930024,
|
|
"grad_norm": 0.28051192780985973,
|
|
"learning_rate": 4.486050443849515e-06,
|
|
"loss": 1.7303,
|
|
"mean_token_accuracy": 0.6483193635940552,
|
|
"num_tokens": 175926287.0,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"entropy": 1.72578125,
|
|
"epoch": 0.2610359436927703,
|
|
"grad_norm": 0.16943526418530247,
|
|
"learning_rate": 4.4825278286599975e-06,
|
|
"loss": 1.7335,
|
|
"mean_token_accuracy": 0.6443354606628418,
|
|
"num_tokens": 176859786.0,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"entropy": 1.68515625,
|
|
"epoch": 0.26240262402624026,
|
|
"grad_norm": 0.327261916570871,
|
|
"learning_rate": 4.479005213470481e-06,
|
|
"loss": 1.6868,
|
|
"mean_token_accuracy": 0.652915996313095,
|
|
"num_tokens": 177776468.0,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"entropy": 1.7734375,
|
|
"epoch": 0.26376930435971024,
|
|
"grad_norm": 0.22196364076707234,
|
|
"learning_rate": 4.475482598280964e-06,
|
|
"loss": 1.7678,
|
|
"mean_token_accuracy": 0.6370668828487396,
|
|
"num_tokens": 178696172.0,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"entropy": 1.70546875,
|
|
"epoch": 0.2651359846931803,
|
|
"grad_norm": 0.24734617458979816,
|
|
"learning_rate": 4.471959983091447e-06,
|
|
"loss": 1.7207,
|
|
"mean_token_accuracy": 0.6473471403121949,
|
|
"num_tokens": 179623016.0,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"entropy": 1.74140625,
|
|
"epoch": 0.26650266502665027,
|
|
"grad_norm": 0.21650794255264053,
|
|
"learning_rate": 4.468437367901931e-06,
|
|
"loss": 1.7568,
|
|
"mean_token_accuracy": 0.6430077612400055,
|
|
"num_tokens": 180550628.0,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"entropy": 1.7796875,
|
|
"epoch": 0.26786934536012025,
|
|
"grad_norm": 0.25786955729258043,
|
|
"learning_rate": 4.464914752712414e-06,
|
|
"loss": 1.7745,
|
|
"mean_token_accuracy": 0.6381547033786774,
|
|
"num_tokens": 181457886.0,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"entropy": 1.7125,
|
|
"epoch": 0.2692360256935903,
|
|
"grad_norm": 0.1721369610628911,
|
|
"learning_rate": 4.461392137522897e-06,
|
|
"loss": 1.7166,
|
|
"mean_token_accuracy": 0.6461134254932404,
|
|
"num_tokens": 182393564.0,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"entropy": 1.67265625,
|
|
"epoch": 0.27060270602706027,
|
|
"grad_norm": 0.22652435309342595,
|
|
"learning_rate": 4.457869522333381e-06,
|
|
"loss": 1.6795,
|
|
"mean_token_accuracy": 0.6526227116584777,
|
|
"num_tokens": 183291119.0,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"entropy": 1.6953125,
|
|
"epoch": 0.27196938636053025,
|
|
"grad_norm": 0.23755256729013183,
|
|
"learning_rate": 4.454346907143864e-06,
|
|
"loss": 1.7046,
|
|
"mean_token_accuracy": 0.6517730414867401,
|
|
"num_tokens": 184205633.0,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"entropy": 1.7703125,
|
|
"epoch": 0.2733360666940003,
|
|
"grad_norm": 0.24549207674703646,
|
|
"learning_rate": 4.450824291954347e-06,
|
|
"loss": 1.7779,
|
|
"mean_token_accuracy": 0.6368072807788849,
|
|
"num_tokens": 185143640.0,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"entropy": 1.675,
|
|
"epoch": 0.2747027470274703,
|
|
"grad_norm": 0.18915376404783596,
|
|
"learning_rate": 4.44730167676483e-06,
|
|
"loss": 1.6957,
|
|
"mean_token_accuracy": 0.6522092461585999,
|
|
"num_tokens": 186076485.0,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"entropy": 1.80703125,
|
|
"epoch": 0.27606942736094026,
|
|
"grad_norm": 0.19999747492231404,
|
|
"learning_rate": 4.443779061575314e-06,
|
|
"loss": 1.8227,
|
|
"mean_token_accuracy": 0.6325264155864716,
|
|
"num_tokens": 186997413.0,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"entropy": 1.653125,
|
|
"epoch": 0.2774361076944103,
|
|
"grad_norm": 0.24070118833414136,
|
|
"learning_rate": 4.4402564463857975e-06,
|
|
"loss": 1.6538,
|
|
"mean_token_accuracy": 0.6532848298549652,
|
|
"num_tokens": 187883323.0,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"entropy": 1.746875,
|
|
"epoch": 0.2788027880278803,
|
|
"grad_norm": 0.2057115095775583,
|
|
"learning_rate": 4.43673383119628e-06,
|
|
"loss": 1.7933,
|
|
"mean_token_accuracy": 0.6361957013607025,
|
|
"num_tokens": 188783958.0,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"entropy": 1.75,
|
|
"epoch": 0.28016946836135026,
|
|
"grad_norm": 0.2195441124915254,
|
|
"learning_rate": 4.433211216006764e-06,
|
|
"loss": 1.7682,
|
|
"mean_token_accuracy": 0.6411130547523498,
|
|
"num_tokens": 189673078.0,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"entropy": 1.671875,
|
|
"epoch": 0.2815361486948203,
|
|
"grad_norm": 0.21603866526278181,
|
|
"learning_rate": 4.4296886008172465e-06,
|
|
"loss": 1.6804,
|
|
"mean_token_accuracy": 0.651447206735611,
|
|
"num_tokens": 190584016.0,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"entropy": 1.6984375,
|
|
"epoch": 0.2829028290282903,
|
|
"grad_norm": 0.20777599705234892,
|
|
"learning_rate": 4.426165985627731e-06,
|
|
"loss": 1.7153,
|
|
"mean_token_accuracy": 0.6482792913913726,
|
|
"num_tokens": 191547560.0,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"entropy": 1.6625,
|
|
"epoch": 0.28426950936176026,
|
|
"grad_norm": 0.18975737041130866,
|
|
"learning_rate": 4.4226433704382136e-06,
|
|
"loss": 1.6571,
|
|
"mean_token_accuracy": 0.6588443338871002,
|
|
"num_tokens": 192439284.0,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"entropy": 1.70546875,
|
|
"epoch": 0.2856361896952303,
|
|
"grad_norm": 0.23294200884378677,
|
|
"learning_rate": 4.419120755248697e-06,
|
|
"loss": 1.6984,
|
|
"mean_token_accuracy": 0.6504812240600586,
|
|
"num_tokens": 193368840.0,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"entropy": 1.8078125,
|
|
"epoch": 0.2870028700287003,
|
|
"grad_norm": 0.2456628536661753,
|
|
"learning_rate": 4.41559814005918e-06,
|
|
"loss": 1.8361,
|
|
"mean_token_accuracy": 0.6299799799919128,
|
|
"num_tokens": 194306336.0,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"entropy": 1.6984375,
|
|
"epoch": 0.28836955036217027,
|
|
"grad_norm": 0.19082211172913924,
|
|
"learning_rate": 4.412075524869663e-06,
|
|
"loss": 1.6992,
|
|
"mean_token_accuracy": 0.6523184239864349,
"num_tokens": 195184867.0,
"step": 2110
},
{
"entropy": 1.73203125,
"epoch": 0.2897362306956403,
"grad_norm": 0.23264738708307428,
"learning_rate": 4.408552909680147e-06,
"loss": 1.7312,
"mean_token_accuracy": 0.6475288927555084,
"num_tokens": 196099384.0,
"step": 2120
},
{
"entropy": 1.75625,
"epoch": 0.2911029110291103,
"grad_norm": 0.193852929712688,
"learning_rate": 4.4050302944906304e-06,
"loss": 1.766,
"mean_token_accuracy": 0.640529602766037,
"num_tokens": 197036393.0,
"step": 2130
},
{
"entropy": 1.75078125,
"epoch": 0.2924695913625803,
"grad_norm": 0.1904914032149668,
"learning_rate": 4.401507679301113e-06,
"loss": 1.731,
"mean_token_accuracy": 0.6443095862865448,
"num_tokens": 197925750.0,
"step": 2140
},
{
"entropy": 1.82890625,
"epoch": 0.2938362716960503,
"grad_norm": 0.19577503142890482,
"learning_rate": 4.397985064111597e-06,
"loss": 1.8503,
"mean_token_accuracy": 0.6278849124908448,
"num_tokens": 198884410.0,
"step": 2150
},
{
"entropy": 1.72890625,
"epoch": 0.2952029520295203,
"grad_norm": 0.27321086988752125,
"learning_rate": 4.39446244892208e-06,
"loss": 1.7319,
"mean_token_accuracy": 0.6442044079303741,
"num_tokens": 199808548.0,
"step": 2160
},
{
"entropy": 1.77109375,
"epoch": 0.2965696323629903,
"grad_norm": 0.18811588961940276,
"learning_rate": 4.390939833732563e-06,
"loss": 1.7777,
"mean_token_accuracy": 0.6405163526535034,
"num_tokens": 200657353.0,
"step": 2170
},
{
"entropy": 1.7046875,
"epoch": 0.2979363126964603,
"grad_norm": 0.22929112162559287,
"learning_rate": 4.387417218543047e-06,
"loss": 1.7156,
"mean_token_accuracy": 0.6471207499504089,
"num_tokens": 201592429.0,
"step": 2180
},
{
"entropy": 1.64921875,
"epoch": 0.2993029930299303,
"grad_norm": 0.24158635671435583,
"learning_rate": 4.38389460335353e-06,
"loss": 1.6323,
"mean_token_accuracy": 0.6594474792480469,
"num_tokens": 202508465.0,
"step": 2190
},
{
"entropy": 1.74921875,
"epoch": 0.3006696733634003,
"grad_norm": 0.23492192108456564,
"learning_rate": 4.380371988164014e-06,
"loss": 1.7657,
"mean_token_accuracy": 0.638015341758728,
"num_tokens": 203486931.0,
"step": 2200
},
{
"entropy": 1.71484375,
"epoch": 0.3020363536968703,
"grad_norm": 0.2390072943148003,
"learning_rate": 4.376849372974496e-06,
"loss": 1.7146,
"mean_token_accuracy": 0.650445181131363,
"num_tokens": 204437891.0,
"step": 2210
},
{
"entropy": 1.6984375,
"epoch": 0.3034030340303403,
"grad_norm": 0.2108515777474216,
"learning_rate": 4.37332675778498e-06,
"loss": 1.7033,
"mean_token_accuracy": 0.6466539919376373,
"num_tokens": 205369076.0,
"step": 2220
},
{
"entropy": 1.6796875,
"epoch": 0.3047697143638103,
"grad_norm": 0.2773741302979524,
"learning_rate": 4.369804142595463e-06,
|
|
"loss": 1.7048,
|
|
"mean_token_accuracy": 0.6493583023548126,
|
|
"num_tokens": 206283201.0,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"entropy": 1.7125,
|
|
"epoch": 0.3061363946972803,
|
|
"grad_norm": 0.22851186792530268,
|
|
"learning_rate": 4.366281527405947e-06,
|
|
"loss": 1.7152,
|
|
"mean_token_accuracy": 0.6469030201435089,
|
|
"num_tokens": 207244813.0,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"entropy": 1.69375,
|
|
"epoch": 0.3075030750307503,
|
|
"grad_norm": 0.17967736209606125,
|
|
"learning_rate": 4.36275891221643e-06,
|
|
"loss": 1.6971,
|
|
"mean_token_accuracy": 0.649112731218338,
|
|
"num_tokens": 208176655.0,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"entropy": 1.6609375,
|
|
"epoch": 0.3088697553642203,
|
|
"grad_norm": 0.2481166759826417,
|
|
"learning_rate": 4.359236297026913e-06,
|
|
"loss": 1.6777,
|
|
"mean_token_accuracy": 0.6526995122432708,
|
|
"num_tokens": 209087949.0,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"entropy": 1.6453125,
|
|
"epoch": 0.3102364356976903,
|
|
"grad_norm": 0.26746060722840287,
|
|
"learning_rate": 4.355713681837396e-06,
|
|
"loss": 1.6518,
|
|
"mean_token_accuracy": 0.6559167802333832,
|
|
"num_tokens": 210003172.0,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"entropy": 1.6921875,
|
|
"epoch": 0.3116031160311603,
|
|
"grad_norm": 0.2844292756481138,
|
|
"learning_rate": 4.352191066647879e-06,
|
|
"loss": 1.7077,
|
|
"mean_token_accuracy": 0.6477632701396943,
|
|
"num_tokens": 210911894.0,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"entropy": 1.64296875,
|
|
"epoch": 0.3129697963646303,
|
|
"grad_norm": 0.23557592067773625,
|
|
"learning_rate": 4.348668451458363e-06,
|
|
"loss": 1.6524,
|
|
"mean_token_accuracy": 0.6558865308761597,
|
|
"num_tokens": 211848726.0,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"entropy": 1.8125,
|
|
"epoch": 0.31433647669810033,
|
|
"grad_norm": 0.29905380685464583,
|
|
"learning_rate": 4.3451458362688465e-06,
|
|
"loss": 1.8065,
|
|
"mean_token_accuracy": 0.6338882982730866,
|
|
"num_tokens": 212762631.0,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"entropy": 1.71875,
|
|
"epoch": 0.3157031570315703,
|
|
"grad_norm": 0.20797523430074452,
|
|
"learning_rate": 4.34162322107933e-06,
|
|
"loss": 1.716,
|
|
"mean_token_accuracy": 0.6484074771404267,
|
|
"num_tokens": 213665764.0,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"entropy": 1.71328125,
|
|
"epoch": 0.3170698373650403,
|
|
"grad_norm": 0.22205176933215592,
|
|
"learning_rate": 4.338100605889813e-06,
|
|
"loss": 1.713,
|
|
"mean_token_accuracy": 0.6481547713279724,
|
|
"num_tokens": 214555420.0,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"entropy": 1.62734375,
|
|
"epoch": 0.31843651769851034,
|
|
"grad_norm": 0.17105162937458773,
|
|
"learning_rate": 4.334577990700296e-06,
|
|
"loss": 1.6315,
|
|
"mean_token_accuracy": 0.6620208263397217,
|
|
"num_tokens": 215457610.0,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"entropy": 1.74375,
|
|
"epoch": 0.3198031980319803,
|
|
"grad_norm": 0.21551289557648465,
|
|
"learning_rate": 4.33105537551078e-06,
|
|
"loss": 1.7511,
|
|
"mean_token_accuracy": 0.639218533039093,
|
|
"num_tokens": 216362573.0,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"entropy": 1.6578125,
|
|
"epoch": 0.3211698783654503,
|
|
"grad_norm": 0.1835759691187967,
|
|
"learning_rate": 4.327532760321263e-06,
|
|
"loss": 1.6463,
|
|
"mean_token_accuracy": 0.6608923017978668,
|
|
"num_tokens": 217247879.0,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"entropy": 1.69375,
|
|
"epoch": 0.32253655869892034,
|
|
"grad_norm": 0.1902702489043468,
|
|
"learning_rate": 4.324010145131746e-06,
|
|
"loss": 1.7012,
|
|
"mean_token_accuracy": 0.6492303788661957,
|
|
"num_tokens": 218159591.0,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"entropy": 1.71875,
|
|
"epoch": 0.3239032390323903,
|
|
"grad_norm": 0.27078123512699753,
|
|
"learning_rate": 4.32048752994223e-06,
|
|
"loss": 1.7406,
|
|
"mean_token_accuracy": 0.6446932435035706,
|
|
"num_tokens": 219122814.0,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"entropy": 1.71484375,
|
|
"epoch": 0.3252699193658603,
|
|
"grad_norm": 0.18362451864191692,
|
|
"learning_rate": 4.316964914752712e-06,
|
|
"loss": 1.7213,
|
|
"mean_token_accuracy": 0.6471914112567901,
|
|
"num_tokens": 220049053.0,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"entropy": 1.71875,
|
|
"epoch": 0.32663659969933034,
|
|
"grad_norm": 0.2532105879580372,
|
|
"learning_rate": 4.313442299563196e-06,
|
|
"loss": 1.7251,
|
|
"mean_token_accuracy": 0.648465770483017,
|
|
"num_tokens": 220929901.0,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"entropy": 1.67734375,
|
|
"epoch": 0.3280032800328003,
|
|
"grad_norm": 0.2286132794571216,
|
|
"learning_rate": 4.3099196843736794e-06,
|
|
"loss": 1.6931,
|
|
"mean_token_accuracy": 0.6497257292270661,
|
|
"num_tokens": 221821547.0,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"entropy": 1.72578125,
|
|
"epoch": 0.3293699603662703,
|
|
"grad_norm": 0.22434507091780212,
|
|
"learning_rate": 4.306397069184163e-06,
|
|
"loss": 1.709,
|
|
"mean_token_accuracy": 0.6469891607761383,
|
|
"num_tokens": 222737003.0,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"entropy": 1.7234375,
|
|
"epoch": 0.33073664069974035,
|
|
"grad_norm": 0.22449501931476712,
|
|
"learning_rate": 4.302874453994646e-06,
|
|
"loss": 1.7248,
|
|
"mean_token_accuracy": 0.6434304356575012,
|
|
"num_tokens": 223642945.0,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"entropy": 1.64296875,
|
|
"epoch": 0.33210332103321033,
|
|
"grad_norm": 0.21872884009879134,
|
|
"learning_rate": 4.299351838805129e-06,
|
|
"loss": 1.6591,
|
|
"mean_token_accuracy": 0.6552774846553803,
|
|
"num_tokens": 224570653.0,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"entropy": 1.6984375,
|
|
"epoch": 0.3334700013666803,
|
|
"grad_norm": 0.2736225390441724,
|
|
"learning_rate": 4.295829223615613e-06,
|
|
"loss": 1.7036,
|
|
"mean_token_accuracy": 0.6492485046386719,
|
|
"num_tokens": 225506210.0,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"entropy": 1.75390625,
|
|
"epoch": 0.33483668170015035,
|
|
"grad_norm": 0.2646688033223781,
|
|
"learning_rate": 4.292306608426096e-06,
|
|
"loss": 1.7376,
|
|
"mean_token_accuracy": 0.6459907948970794,
|
|
"num_tokens": 226416285.0,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"entropy": 1.64609375,
|
|
"epoch": 0.33620336203362033,
|
|
"grad_norm": 0.18705955900726806,
|
|
"learning_rate": 4.288783993236579e-06,
|
|
"loss": 1.6375,
|
|
"mean_token_accuracy": 0.6587654113769531,
|
|
"num_tokens": 227353350.0,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"entropy": 1.71953125,
|
|
"epoch": 0.3375700423670903,
|
|
"grad_norm": 0.18573348573592038,
|
|
"learning_rate": 4.285261378047063e-06,
|
|
"loss": 1.7041,
|
|
"mean_token_accuracy": 0.6493724524974823,
|
|
"num_tokens": 228287350.0,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"entropy": 1.73671875,
|
|
"epoch": 0.33893672270056036,
|
|
"grad_norm": 0.2902999685271366,
|
|
"learning_rate": 4.281738762857546e-06,
|
|
"loss": 1.7481,
|
|
"mean_token_accuracy": 0.6390805602073669,
|
|
"num_tokens": 229217110.0,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"entropy": 1.771875,
|
|
"epoch": 0.34030340303403034,
|
|
"grad_norm": 0.18945387415344742,
|
|
"learning_rate": 4.278216147668029e-06,
|
|
"loss": 1.7773,
|
|
"mean_token_accuracy": 0.6371896862983704,
|
|
"num_tokens": 230150503.0,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"entropy": 1.70859375,
|
|
"epoch": 0.3416700833675003,
|
|
"grad_norm": 0.3168070829721579,
|
|
"learning_rate": 4.274693532478512e-06,
|
|
"loss": 1.7145,
|
|
"mean_token_accuracy": 0.6458973288536072,
|
|
"num_tokens": 231075522.0,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"entropy": 1.6578125,
|
|
"epoch": 0.34303676370097036,
|
|
"grad_norm": 0.23822102003314233,
|
|
"learning_rate": 4.271170917288996e-06,
|
|
"loss": 1.6454,
|
|
"mean_token_accuracy": 0.6590349793434143,
|
|
"num_tokens": 231990306.0,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"entropy": 1.66953125,
|
|
"epoch": 0.34440344403444034,
|
|
"grad_norm": 0.20216723303033204,
|
|
"learning_rate": 4.2676483020994795e-06,
|
|
"loss": 1.6613,
|
|
"mean_token_accuracy": 0.6562923491001129,
|
|
"num_tokens": 232926651.0,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"entropy": 1.74921875,
|
|
"epoch": 0.3457701243679103,
|
|
"grad_norm": 0.18614235380952437,
|
|
"learning_rate": 4.264125686909962e-06,
|
|
"loss": 1.7703,
|
|
"mean_token_accuracy": 0.6400592982769012,
|
|
"num_tokens": 233888329.0,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"entropy": 1.71875,
|
|
"epoch": 0.34713680470138036,
|
|
"grad_norm": 0.2127885628074766,
|
|
"learning_rate": 4.260603071720446e-06,
|
|
"loss": 1.732,
|
|
"mean_token_accuracy": 0.6459634721279144,
|
|
"num_tokens": 234794267.0,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"entropy": 1.734375,
|
|
"epoch": 0.34850348503485035,
|
|
"grad_norm": 0.21109718974142025,
|
|
"learning_rate": 4.257080456530928e-06,
|
|
"loss": 1.738,
|
|
"mean_token_accuracy": 0.644263106584549,
|
|
"num_tokens": 235717957.0,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"entropy": 1.64296875,
|
|
"epoch": 0.34987016536832033,
|
|
"grad_norm": 0.17957188582469347,
|
|
"learning_rate": 4.253557841341413e-06,
|
|
"loss": 1.6388,
|
|
"mean_token_accuracy": 0.6575391530990601,
|
|
"num_tokens": 236653926.0,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"entropy": 1.70390625,
|
|
"epoch": 0.35123684570179037,
|
|
"grad_norm": 0.2736369629669195,
|
|
"learning_rate": 4.2500352261518955e-06,
|
|
"loss": 1.7043,
|
|
"mean_token_accuracy": 0.6475358068943023,
|
|
"num_tokens": 237581142.0,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"entropy": 1.62890625,
|
|
"epoch": 0.35260352603526035,
|
|
"grad_norm": 0.18422621339856982,
|
|
"learning_rate": 4.246512610962379e-06,
|
|
"loss": 1.6246,
|
|
"mean_token_accuracy": 0.6617096304893494,
|
|
"num_tokens": 238506030.0,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"entropy": 1.68984375,
|
|
"epoch": 0.35397020636873033,
|
|
"grad_norm": 0.2500460170540877,
|
|
"learning_rate": 4.242989995772862e-06,
|
|
"loss": 1.698,
|
|
"mean_token_accuracy": 0.6518735647201538,
|
|
"num_tokens": 239394159.0,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"entropy": 1.7546875,
|
|
"epoch": 0.35533688670220037,
|
|
"grad_norm": 0.2980121829826058,
|
|
"learning_rate": 4.239467380583345e-06,
|
|
"loss": 1.7568,
|
|
"mean_token_accuracy": 0.640753835439682,
|
|
"num_tokens": 240365934.0,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"entropy": 1.665625,
|
|
"epoch": 0.35670356703567035,
|
|
"grad_norm": 0.1888221935053662,
|
|
"learning_rate": 4.235944765393829e-06,
|
|
"loss": 1.6509,
|
|
"mean_token_accuracy": 0.6564042508602143,
|
|
"num_tokens": 241282098.0,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"entropy": 1.74765625,
|
|
"epoch": 0.35807024736914034,
|
|
"grad_norm": 0.22135235922870142,
|
|
"learning_rate": 4.232422150204312e-06,
|
|
"loss": 1.7589,
|
|
"mean_token_accuracy": 0.6391521453857422,
|
|
"num_tokens": 242168645.0,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"entropy": 1.78984375,
|
|
"epoch": 0.3594369277026104,
|
|
"grad_norm": 0.18138146163092764,
|
|
"learning_rate": 4.228899535014795e-06,
|
|
"loss": 1.8024,
|
|
"mean_token_accuracy": 0.632556676864624,
|
|
"num_tokens": 243057052.0,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"entropy": 1.70078125,
|
|
"epoch": 0.36080360803608036,
|
|
"grad_norm": 0.2920299117743663,
|
|
"learning_rate": 4.225376919825279e-06,
|
|
"loss": 1.7074,
|
|
"mean_token_accuracy": 0.646614956855774,
|
|
"num_tokens": 243972544.0,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"entropy": 1.71015625,
|
|
"epoch": 0.36217028836955034,
|
|
"grad_norm": 0.22325330365409513,
|
|
"learning_rate": 4.221854304635762e-06,
|
|
"loss": 1.7173,
|
|
"mean_token_accuracy": 0.6465919077396393,
|
|
"num_tokens": 244888369.0,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"entropy": 1.7609375,
|
|
"epoch": 0.3635369687030204,
|
|
"grad_norm": 0.1726358053043753,
|
|
"learning_rate": 4.218331689446245e-06,
|
|
"loss": 1.7765,
|
|
"mean_token_accuracy": 0.6375112771987915,
|
|
"num_tokens": 245817917.0,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"entropy": 1.6640625,
|
|
"epoch": 0.36490364903649036,
|
|
"grad_norm": 0.28797865174722215,
|
|
"learning_rate": 4.2148090742567284e-06,
|
|
"loss": 1.658,
|
|
"mean_token_accuracy": 0.6562651097774506,
|
|
"num_tokens": 246732590.0,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"entropy": 1.7109375,
|
|
"epoch": 0.36627032936996035,
|
|
"grad_norm": 0.1898427981275681,
|
|
"learning_rate": 4.211286459067212e-06,
|
|
"loss": 1.7517,
|
|
"mean_token_accuracy": 0.6375582277774811,
|
|
"num_tokens": 247706752.0,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"entropy": 1.6359375,
|
|
"epoch": 0.3676370097034304,
|
|
"grad_norm": 0.2055423963611234,
|
|
"learning_rate": 4.2077638438776955e-06,
|
|
"loss": 1.6227,
|
|
"mean_token_accuracy": 0.6643299043178559,
|
|
"num_tokens": 248601054.0,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"entropy": 1.65078125,
|
|
"epoch": 0.36900369003690037,
|
|
"grad_norm": 0.17233809275810327,
|
|
"learning_rate": 4.204241228688178e-06,
|
|
"loss": 1.6684,
|
|
"mean_token_accuracy": 0.6558837234973908,
|
|
"num_tokens": 249607891.0,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"entropy": 1.6390625,
|
|
"epoch": 0.37037037037037035,
|
|
"grad_norm": 0.1856855562082745,
|
|
"learning_rate": 4.200718613498662e-06,
|
|
"loss": 1.6484,
|
|
"mean_token_accuracy": 0.657769525051117,
|
|
"num_tokens": 250541309.0,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"entropy": 1.71171875,
|
|
"epoch": 0.3717370507038404,
|
|
"grad_norm": 0.22034479786883557,
|
|
"learning_rate": 4.1971959983091445e-06,
|
|
"loss": 1.7116,
|
|
"mean_token_accuracy": 0.6499764025211334,
|
|
"num_tokens": 251444553.0,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"entropy": 1.64765625,
|
|
"epoch": 0.37310373103731037,
|
|
"grad_norm": 0.1801013870582494,
|
|
"learning_rate": 4.193673383119629e-06,
|
|
"loss": 1.635,
|
|
"mean_token_accuracy": 0.6582135915756225,
|
|
"num_tokens": 252393510.0,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"entropy": 1.80703125,
|
|
"epoch": 0.37447041137078035,
|
|
"grad_norm": 0.20164769222875084,
|
|
"learning_rate": 4.190150767930112e-06,
|
|
"loss": 1.8129,
|
|
"mean_token_accuracy": 0.6337982535362243,
|
|
"num_tokens": 253385785.0,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"entropy": 1.75,
|
|
"epoch": 0.3758370917042504,
|
|
"grad_norm": 0.2064809157401824,
|
|
"learning_rate": 4.186628152740595e-06,
|
|
"loss": 1.7557,
|
|
"mean_token_accuracy": 0.6417210280895234,
|
|
"num_tokens": 254323532.0,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"entropy": 1.71328125,
|
|
"epoch": 0.3772037720377204,
|
|
"grad_norm": 0.18201827880921484,
|
|
"learning_rate": 4.183105537551078e-06,
|
|
"loss": 1.7094,
|
|
"mean_token_accuracy": 0.6471146523952485,
|
|
"num_tokens": 255248478.0,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"entropy": 1.77265625,
|
|
"epoch": 0.37857045237119036,
|
|
"grad_norm": 0.25614670045132565,
|
|
"learning_rate": 4.179582922361561e-06,
|
|
"loss": 1.7855,
|
|
"mean_token_accuracy": 0.6355153620243073,
|
|
"num_tokens": 256139816.0,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"entropy": 1.746875,
|
|
"epoch": 0.3799371327046604,
|
|
"grad_norm": 0.18256580998356528,
|
|
"learning_rate": 4.176060307172045e-06,
|
|
"loss": 1.7496,
|
|
"mean_token_accuracy": 0.641632866859436,
|
|
"num_tokens": 257089962.0,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"entropy": 1.678125,
|
|
"epoch": 0.3813038130381304,
|
|
"grad_norm": 0.19750575279820728,
|
|
"learning_rate": 4.1725376919825285e-06,
|
|
"loss": 1.6768,
|
|
"mean_token_accuracy": 0.65319983959198,
|
|
"num_tokens": 257982271.0,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"entropy": 1.6625,
|
|
"epoch": 0.38267049337160036,
|
|
"grad_norm": 0.1716327266648324,
|
|
"learning_rate": 4.169015076793011e-06,
|
|
"loss": 1.656,
|
|
"mean_token_accuracy": 0.654665720462799,
|
|
"num_tokens": 258874378.0,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"entropy": 1.64453125,
|
|
"epoch": 0.3840371737050704,
|
|
"grad_norm": 0.19988567003967067,
|
|
"learning_rate": 4.165492461603495e-06,
|
|
"loss": 1.6472,
|
|
"mean_token_accuracy": 0.6554565608501435,
|
|
"num_tokens": 259811179.0,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"entropy": 1.61875,
|
|
"epoch": 0.3854038540385404,
|
|
"grad_norm": 0.22179932123386548,
|
|
"learning_rate": 4.161969846413978e-06,
|
|
"loss": 1.6229,
|
|
"mean_token_accuracy": 0.6586724817752838,
|
|
"num_tokens": 260707214.0,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"entropy": 1.628125,
|
|
"epoch": 0.38677053437201037,
|
|
"grad_norm": 0.2279006296209549,
|
|
"learning_rate": 4.158447231224461e-06,
|
|
"loss": 1.6418,
|
|
"mean_token_accuracy": 0.6583502531051636,
|
|
"num_tokens": 261685457.0,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"entropy": 1.71875,
|
|
"epoch": 0.3881372147054804,
|
|
"grad_norm": 0.3729755968267986,
|
|
"learning_rate": 4.1549246160349445e-06,
|
|
"loss": 1.7179,
|
|
"mean_token_accuracy": 0.6456328153610229,
|
|
"num_tokens": 262649912.0,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"entropy": 1.7390625,
|
|
"epoch": 0.3895038950389504,
|
|
"grad_norm": 0.24691229300095363,
|
|
"learning_rate": 4.151402000845428e-06,
|
|
"loss": 1.757,
|
|
"mean_token_accuracy": 0.642482602596283,
|
|
"num_tokens": 263592418.0,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"entropy": 1.6234375,
|
|
"epoch": 0.39087057537242037,
|
|
"grad_norm": 0.19388663387653168,
|
|
"learning_rate": 4.147879385655912e-06,
|
|
"loss": 1.6114,
|
|
"mean_token_accuracy": 0.6632375657558441,
|
|
"num_tokens": 264512514.0,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"entropy": 1.69140625,
|
|
"epoch": 0.3922372557058904,
|
|
"grad_norm": 0.20482480325044292,
|
|
"learning_rate": 4.144356770466394e-06,
|
|
"loss": 1.6822,
|
|
"mean_token_accuracy": 0.6500252187252045,
|
|
"num_tokens": 265400467.0,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"entropy": 1.76640625,
|
|
"epoch": 0.3936039360393604,
|
|
"grad_norm": 0.21618360063727718,
|
|
"learning_rate": 4.140834155276878e-06,
|
|
"loss": 1.7703,
|
|
"mean_token_accuracy": 0.6373282372951508,
|
|
"num_tokens": 266341579.0,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"entropy": 1.69453125,
|
|
"epoch": 0.3949706163728304,
|
|
"grad_norm": 0.2725363245339085,
|
|
"learning_rate": 4.137311540087361e-06,
|
|
"loss": 1.7068,
|
|
"mean_token_accuracy": 0.6508552551269531,
|
|
"num_tokens": 267319860.0,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"entropy": 1.734375,
|
|
"epoch": 0.3963372967063004,
|
|
"grad_norm": 0.23298630199376136,
|
|
"learning_rate": 4.133788924897845e-06,
|
|
"loss": 1.7195,
|
|
"mean_token_accuracy": 0.6465118229389191,
|
|
"num_tokens": 268252351.0,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"entropy": 1.62109375,
|
|
"epoch": 0.3977039770397704,
|
|
"grad_norm": 0.2576379315427296,
|
|
"learning_rate": 4.130266309708328e-06,
|
|
"loss": 1.6166,
|
|
"mean_token_accuracy": 0.6640608131885528,
|
|
"num_tokens": 269167877.0,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"entropy": 1.7515625,
|
|
"epoch": 0.3990706573732404,
|
|
"grad_norm": 0.16346975801405084,
|
|
"learning_rate": 4.126743694518811e-06,
|
|
"loss": 1.7521,
|
|
"mean_token_accuracy": 0.6424897074699402,
|
|
"num_tokens": 270118341.0,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"entropy": 1.6984375,
|
|
"epoch": 0.4004373377067104,
|
|
"grad_norm": 0.35890152890453436,
|
|
"learning_rate": 4.123221079329294e-06,
|
|
"loss": 1.7013,
|
|
"mean_token_accuracy": 0.649168872833252,
|
|
"num_tokens": 270984717.0,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"entropy": 1.7671875,
|
|
"epoch": 0.4018040180401804,
|
|
"grad_norm": 0.3995959085509587,
|
|
"learning_rate": 4.1196984641397774e-06,
|
|
"loss": 1.7653,
|
|
"mean_token_accuracy": 0.6392790257930756,
|
|
"num_tokens": 271888370.0,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"entropy": 1.61015625,
|
|
"epoch": 0.4031706983736504,
|
|
"grad_norm": 0.17473931533083698,
|
|
"learning_rate": 4.116175848950261e-06,
|
|
"loss": 1.6059,
|
|
"mean_token_accuracy": 0.6662253022193909,
|
|
"num_tokens": 272802935.0,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 0.4045373787071204,
|
|
"grad_norm": 0.22907087124597017,
|
|
"learning_rate": 4.1126532337607445e-06,
|
|
"loss": 1.6423,
|
|
"mean_token_accuracy": 0.6574507415294647,
|
|
"num_tokens": 273652069.0,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"entropy": 1.73359375,
|
|
"epoch": 0.4059040590405904,
|
|
"grad_norm": 0.2018285675461123,
|
|
"learning_rate": 4.109130618571227e-06,
|
|
"loss": 1.7407,
|
|
"mean_token_accuracy": 0.643599557876587,
|
|
"num_tokens": 274592161.0,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"entropy": 1.72109375,
|
|
"epoch": 0.4072707393740604,
|
|
"grad_norm": 0.2940864075031202,
|
|
"learning_rate": 4.105608003381711e-06,
|
|
"loss": 1.7395,
|
|
"mean_token_accuracy": 0.6379079282283783,
|
|
"num_tokens": 275499518.0,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"entropy": 1.6265625,
|
|
"epoch": 0.4086374197075304,
|
|
"grad_norm": 0.20016442951673244,
|
|
"learning_rate": 4.102085388192194e-06,
|
|
"loss": 1.6108,
|
|
"mean_token_accuracy": 0.6639392614364624,
|
|
"num_tokens": 276414342.0,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"entropy": 1.7453125,
|
|
"epoch": 0.4100041000410004,
|
|
"grad_norm": 0.2108336124269069,
|
|
"learning_rate": 4.098562773002678e-06,
|
|
"loss": 1.7366,
|
|
"mean_token_accuracy": 0.6405311107635498,
|
|
"num_tokens": 277314373.0,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"entropy": 1.67421875,
|
|
"epoch": 0.4113707803744704,
|
|
"grad_norm": 0.1811958505394723,
|
|
"learning_rate": 4.095040157813161e-06,
|
|
"loss": 1.6681,
|
|
"mean_token_accuracy": 0.6547752916812897,
|
|
"num_tokens": 278278531.0,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"entropy": 1.72265625,
|
|
"epoch": 0.41273746070794043,
|
|
"grad_norm": 0.22993548151798257,
|
|
"learning_rate": 4.091517542623644e-06,
|
|
"loss": 1.7289,
|
|
"mean_token_accuracy": 0.6450952172279358,
|
|
"num_tokens": 279188171.0,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"entropy": 1.74609375,
|
|
"epoch": 0.4141041410414104,
|
|
"grad_norm": 0.2710431173296056,
|
|
"learning_rate": 4.087994927434128e-06,
|
|
"loss": 1.7787,
|
|
"mean_token_accuracy": 0.637441486120224,
|
|
"num_tokens": 280103778.0,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"entropy": 1.71953125,
|
|
"epoch": 0.4154708213748804,
|
|
"grad_norm": 0.15340730582944356,
|
|
"learning_rate": 4.08447231224461e-06,
|
|
"loss": 1.7306,
|
|
"mean_token_accuracy": 0.6434753656387329,
|
|
"num_tokens": 280995165.0,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"entropy": 1.64296875,
|
|
"epoch": 0.41683750170835043,
|
|
"grad_norm": 0.19876229969899625,
|
|
"learning_rate": 4.080949697055094e-06,
|
|
"loss": 1.6375,
|
|
"mean_token_accuracy": 0.6586462140083313,
|
|
"num_tokens": 281875412.0,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"entropy": 1.67265625,
|
|
"epoch": 0.4182041820418204,
|
|
"grad_norm": 0.23124714298290078,
|
|
"learning_rate": 4.0774270818655775e-06,
|
|
"loss": 1.6629,
|
|
"mean_token_accuracy": 0.6564308822154998,
|
|
"num_tokens": 282821823.0,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"entropy": 1.73125,
|
|
"epoch": 0.4195708623752904,
|
|
"grad_norm": 0.19346563735185862,
|
|
"learning_rate": 4.073904466676061e-06,
|
|
"loss": 1.7314,
|
|
"mean_token_accuracy": 0.6413521528244018,
|
|
"num_tokens": 283723959.0,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"entropy": 1.6796875,
|
|
"epoch": 0.42093754270876044,
|
|
"grad_norm": 0.1979846231913686,
|
|
"learning_rate": 4.070381851486544e-06,
|
|
"loss": 1.6776,
|
|
"mean_token_accuracy": 0.6550431907176971,
|
|
"num_tokens": 284689910.0,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"entropy": 1.671875,
|
|
"epoch": 0.4223042230422304,
|
|
"grad_norm": 0.2132291862393037,
|
|
"learning_rate": 4.066859236297027e-06,
|
|
"loss": 1.6949,
|
|
"mean_token_accuracy": 0.6513552844524384,
|
|
"num_tokens": 285637557.0,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"entropy": 1.6890625,
|
|
"epoch": 0.4236709033757004,
|
|
"grad_norm": 0.2332028603798789,
|
|
"learning_rate": 4.06333662110751e-06,
|
|
"loss": 1.6779,
|
|
"mean_token_accuracy": 0.6539150774478912,
|
|
"num_tokens": 286530037.0,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"entropy": 1.7703125,
|
|
"epoch": 0.42503758370917044,
|
|
"grad_norm": 0.21578427982600545,
|
|
"learning_rate": 4.059814005917994e-06,
|
|
"loss": 1.7902,
|
|
"mean_token_accuracy": 0.6379691660404205,
|
|
"num_tokens": 287493768.0,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"entropy": 1.7125,
|
|
"epoch": 0.4264042640426404,
|
|
"grad_norm": 0.25407797411120486,
|
|
"learning_rate": 4.056291390728477e-06,
|
|
"loss": 1.7121,
|
|
"mean_token_accuracy": 0.6498547255992889,
|
|
"num_tokens": 288416157.0,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"entropy": 1.73984375,
|
|
"epoch": 0.4277709443761104,
|
|
"grad_norm": 0.21351048931609193,
|
|
"learning_rate": 4.052768775538961e-06,
|
|
"loss": 1.7508,
|
|
"mean_token_accuracy": 0.6444311738014221,
|
|
"num_tokens": 289364035.0,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"entropy": 1.68359375,
|
|
"epoch": 0.42913762470958045,
|
|
"grad_norm": 0.26809958104392956,
|
|
"learning_rate": 4.049246160349444e-06,
|
|
"loss": 1.6757,
|
|
"mean_token_accuracy": 0.6541439354419708,
|
|
"num_tokens": 290322007.0,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"entropy": 1.64609375,
|
|
"epoch": 0.43050430504305043,
|
|
"grad_norm": 0.28387291871779646,
|
|
"learning_rate": 4.045723545159927e-06,
|
|
"loss": 1.6526,
|
|
"mean_token_accuracy": 0.657026720046997,
|
|
"num_tokens": 291207867.0,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"entropy": 1.61484375,
|
|
"epoch": 0.4318709853765204,
|
|
"grad_norm": 0.18640772227036623,
|
|
"learning_rate": 4.04220092997041e-06,
|
|
"loss": 1.6197,
|
|
"mean_token_accuracy": 0.6637424826622009,
|
|
"num_tokens": 292155007.0,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"entropy": 1.6796875,
|
|
"epoch": 0.43323766570999045,
|
|
"grad_norm": 0.24282630668067398,
|
|
"learning_rate": 4.038678314780894e-06,
|
|
"loss": 1.657,
|
|
"mean_token_accuracy": 0.6566802382469177,
|
|
"num_tokens": 293128868.0,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"entropy": 1.6703125,
|
|
"epoch": 0.43460434604346043,
|
|
"grad_norm": 0.18366822004942465,
|
|
"learning_rate": 4.0351556995913775e-06,
|
|
"loss": 1.6887,
|
|
"mean_token_accuracy": 0.649800980091095,
|
|
"num_tokens": 294102263.0,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"entropy": 1.6703125,
|
|
"epoch": 0.4359710263769304,
|
|
"grad_norm": 0.19321101079467315,
|
|
"learning_rate": 4.03163308440186e-06,
|
|
"loss": 1.6899,
|
|
"mean_token_accuracy": 0.6504279434680938,
|
|
"num_tokens": 295011738.0,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"entropy": 1.67109375,
|
|
"epoch": 0.43733770671040045,
|
|
"grad_norm": 0.21580511551379092,
|
|
"learning_rate": 4.028110469212344e-06,
|
|
"loss": 1.6803,
|
|
"mean_token_accuracy": 0.6554789364337921,
|
|
"num_tokens": 295896162.0,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"entropy": 1.6765625,
|
|
"epoch": 0.43870438704387044,
|
|
"grad_norm": 0.2053666353393879,
|
|
"learning_rate": 4.0245878540228264e-06,
|
|
"loss": 1.6873,
|
|
"mean_token_accuracy": 0.6517736852169037,
|
|
"num_tokens": 296809501.0,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"entropy": 1.68359375,
|
|
"epoch": 0.4400710673773404,
|
|
"grad_norm": 0.1991280632652847,
|
|
"learning_rate": 4.02106523883331e-06,
|
|
"loss": 1.691,
|
|
"mean_token_accuracy": 0.6517205595970154,
|
|
"num_tokens": 297704136.0,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"entropy": 1.6109375,
|
|
"epoch": 0.44143774771081046,
|
|
"grad_norm": 0.18384362485558225,
|
|
"learning_rate": 4.0175426236437935e-06,
|
|
"loss": 1.608,
|
|
"mean_token_accuracy": 0.6663908660411835,
|
|
"num_tokens": 298596335.0,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"entropy": 1.640625,
|
|
"epoch": 0.44280442804428044,
|
|
"grad_norm": 0.3108586251137939,
|
|
"learning_rate": 4.014020008454277e-06,
|
|
"loss": 1.6391,
|
|
"mean_token_accuracy": 0.6587963402271271,
|
|
"num_tokens": 299524819.0,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"entropy": 1.6453125,
|
|
"epoch": 0.4441711083777504,
|
|
"grad_norm": 0.16455099434724024,
|
|
"learning_rate": 4.01049739326476e-06,
|
|
"loss": 1.6437,
|
|
"mean_token_accuracy": 0.6566988289356231,
|
|
"num_tokens": 300391859.0,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"entropy": 1.71171875,
|
|
"epoch": 0.44553778871122046,
|
|
"grad_norm": 0.20662199478613588,
|
|
"learning_rate": 4.006974778075243e-06,
|
|
"loss": 1.7152,
|
|
"mean_token_accuracy": 0.6482463181018829,
|
|
"num_tokens": 301330588.0,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"entropy": 1.7328125,
|
|
"epoch": 0.44690446904469044,
|
|
"grad_norm": 0.18986128144413164,
|
|
"learning_rate": 4.003452162885727e-06,
|
|
"loss": 1.7463,
|
|
"mean_token_accuracy": 0.6388202369213104,
|
|
"num_tokens": 302282695.0,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"entropy": 1.65078125,
|
|
"epoch": 0.4482711493781604,
|
|
"grad_norm": 0.2376113633159274,
|
|
"learning_rate": 3.99992954769621e-06,
|
|
"loss": 1.6595,
|
|
"mean_token_accuracy": 0.655811321735382,
|
|
"num_tokens": 303205540.0,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"entropy": 1.66328125,
|
|
"epoch": 0.44963782971163047,
|
|
"grad_norm": 0.22282506235610722,
|
|
"learning_rate": 3.996406932506693e-06,
|
|
"loss": 1.6649,
|
|
"mean_token_accuracy": 0.6539288938045502,
|
|
"num_tokens": 304170107.0,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"entropy": 1.66640625,
|
|
"epoch": 0.45100451004510045,
|
|
"grad_norm": 0.21158301561715612,
|
|
"learning_rate": 3.992884317317177e-06,
|
|
"loss": 1.6918,
|
|
"mean_token_accuracy": 0.6514376282691956,
|
|
"num_tokens": 305140054.0,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"entropy": 1.725,
|
|
"epoch": 0.45237119037857043,
|
|
"grad_norm": 0.2437410374999052,
|
|
"learning_rate": 3.98936170212766e-06,
|
|
"loss": 1.7467,
|
|
"mean_token_accuracy": 0.643176156282425,
|
|
"num_tokens": 306091179.0,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"entropy": 1.67890625,
|
|
"epoch": 0.45373787071204047,
|
|
"grad_norm": 0.2771516619062115,
|
|
"learning_rate": 3.985839086938143e-06,
|
|
"loss": 1.6855,
|
|
"mean_token_accuracy": 0.6507757723331451,
|
|
"num_tokens": 307003615.0,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"entropy": 1.709375,
|
|
"epoch": 0.45510455104551045,
|
|
"grad_norm": 0.17281003419676658,
|
|
"learning_rate": 3.9823164717486265e-06,
|
|
"loss": 1.7286,
|
|
"mean_token_accuracy": 0.6453676402568818,
|
|
"num_tokens": 307967132.0,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"entropy": 1.66171875,
|
|
"epoch": 0.45647123137898044,
|
|
"grad_norm": 0.25922797826025246,
|
|
"learning_rate": 3.97879385655911e-06,
|
|
"loss": 1.667,
|
|
"mean_token_accuracy": 0.6537936806678772,
|
|
"num_tokens": 308874003.0,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"entropy": 1.76328125,
|
|
"epoch": 0.4578379117124505,
|
|
"grad_norm": 0.2740951246632703,
|
|
"learning_rate": 3.9752712413695936e-06,
|
|
"loss": 1.7624,
|
|
"mean_token_accuracy": 0.6402164757251739,
|
|
"num_tokens": 309787534.0,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"entropy": 1.809375,
|
|
"epoch": 0.45920459204592046,
|
|
"grad_norm": 0.15291288725213084,
|
|
"learning_rate": 3.971748626180076e-06,
|
|
"loss": 1.8146,
|
|
"mean_token_accuracy": 0.6302819132804871,
|
|
"num_tokens": 310715349.0,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"entropy": 1.7078125,
|
|
"epoch": 0.46057127237939044,
|
|
"grad_norm": 0.22642650675231296,
|
|
"learning_rate": 3.96822601099056e-06,
|
|
"loss": 1.6991,
|
|
"mean_token_accuracy": 0.6487354397773742,
|
|
"num_tokens": 311674302.0,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"entropy": 1.66484375,
|
|
"epoch": 0.4619379527128605,
|
|
"grad_norm": 0.23381790402143576,
|
|
"learning_rate": 3.9647033958010425e-06,
|
|
"loss": 1.6797,
|
|
"mean_token_accuracy": 0.6507147312164306,
|
|
"num_tokens": 312578919.0,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"entropy": 1.68671875,
|
|
"epoch": 0.46330463304633046,
|
|
"grad_norm": 0.19603796697019105,
|
|
"learning_rate": 3.961180780611527e-06,
|
|
"loss": 1.6981,
|
|
"mean_token_accuracy": 0.645148515701294,
|
|
"num_tokens": 313496088.0,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"entropy": 1.6875,
|
|
"epoch": 0.46467131337980044,
|
|
"grad_norm": 0.17985394440493588,
|
|
"learning_rate": 3.95765816542201e-06,
|
|
"loss": 1.692,
|
|
"mean_token_accuracy": 0.6516962647438049,
|
|
"num_tokens": 314472732.0,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"entropy": 1.6890625,
|
|
"epoch": 0.4660379937132705,
|
|
"grad_norm": 0.27203596986060535,
|
|
"learning_rate": 3.954135550232493e-06,
|
|
"loss": 1.6764,
|
|
"mean_token_accuracy": 0.6532367110252381,
|
|
"num_tokens": 315371999.0,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"entropy": 1.71875,
|
|
"epoch": 0.46740467404674046,
|
|
"grad_norm": 0.25816052083401947,
|
|
"learning_rate": 3.950612935042976e-06,
|
|
"loss": 1.7069,
|
|
"mean_token_accuracy": 0.6487165689468384,
|
|
"num_tokens": 316296642.0,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"entropy": 1.65859375,
|
|
"epoch": 0.46877135438021045,
|
|
"grad_norm": 0.1907434479417283,
|
|
"learning_rate": 3.947090319853459e-06,
|
|
"loss": 1.664,
|
|
"mean_token_accuracy": 0.6553165972232818,
|
|
"num_tokens": 317209234.0,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"entropy": 1.7328125,
|
|
"epoch": 0.4701380347136805,
|
|
"grad_norm": 0.32982236429957595,
|
|
"learning_rate": 3.943567704663943e-06,
|
|
"loss": 1.7229,
|
|
"mean_token_accuracy": 0.6461262106895447,
|
|
"num_tokens": 318111927.0,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"entropy": 1.65703125,
|
|
"epoch": 0.47150471504715047,
|
|
"grad_norm": 0.24557438748243085,
|
|
"learning_rate": 3.9400450894744265e-06,
|
|
"loss": 1.6704,
|
|
"mean_token_accuracy": 0.6533187210559845,
|
|
"num_tokens": 318994603.0,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"entropy": 1.69140625,
|
|
"epoch": 0.47287139538062045,
|
|
"grad_norm": 0.22910768272902507,
|
|
"learning_rate": 3.936522474284909e-06,
|
|
"loss": 1.7221,
|
|
"mean_token_accuracy": 0.6460629940032959,
|
|
"num_tokens": 319941225.0,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"entropy": 1.6859375,
|
|
"epoch": 0.4742380757140905,
|
|
"grad_norm": 0.30398869655159294,
|
|
"learning_rate": 3.932999859095393e-06,
|
|
"loss": 1.7078,
|
|
"mean_token_accuracy": 0.6495672345161438,
|
|
"num_tokens": 320934740.0,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"entropy": 1.64609375,
|
|
"epoch": 0.4756047560475605,
|
|
"grad_norm": 0.2589232189538314,
|
|
"learning_rate": 3.929477243905876e-06,
|
|
"loss": 1.6455,
|
|
"mean_token_accuracy": 0.6585084497928619,
|
|
"num_tokens": 321857524.0,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"entropy": 1.709375,
|
|
"epoch": 0.47697143638103046,
|
|
"grad_norm": 0.28050434455061846,
|
|
"learning_rate": 3.925954628716359e-06,
|
|
"loss": 1.7011,
|
|
"mean_token_accuracy": 0.6507501900196075,
|
|
"num_tokens": 322775094.0,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"entropy": 1.6828125,
|
|
"epoch": 0.4783381167145005,
|
|
"grad_norm": 0.26576207214525194,
|
|
"learning_rate": 3.9224320135268425e-06,
|
|
"loss": 1.6795,
|
|
"mean_token_accuracy": 0.6513454437255859,
|
|
"num_tokens": 323676615.0,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"entropy": 1.71328125,
|
|
"epoch": 0.4797047970479705,
|
|
"grad_norm": 0.19192486631029423,
|
|
"learning_rate": 3.918909398337326e-06,
|
|
"loss": 1.7189,
|
|
"mean_token_accuracy": 0.6456737875938415,
|
|
"num_tokens": 324594262.0,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"entropy": 1.66171875,
|
|
"epoch": 0.48107147738144046,
|
|
"grad_norm": 0.2064251040514733,
|
|
"learning_rate": 3.91538678314781e-06,
|
|
"loss": 1.6593,
|
|
"mean_token_accuracy": 0.6561750650405884,
|
|
"num_tokens": 325536821.0,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"entropy": 1.71875,
|
|
"epoch": 0.4824381577149105,
|
|
"grad_norm": 0.2668489099351852,
|
|
"learning_rate": 3.911864167958292e-06,
|
|
"loss": 1.7142,
|
|
"mean_token_accuracy": 0.6487360119819641,
|
|
"num_tokens": 326472989.0,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"entropy": 1.65390625,
|
|
"epoch": 0.4838048380483805,
|
|
"grad_norm": 0.18404329209393774,
|
|
"learning_rate": 3.908341552768776e-06,
|
|
"loss": 1.6481,
|
|
"mean_token_accuracy": 0.6574573218822479,
|
|
"num_tokens": 327397345.0,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"entropy": 1.66875,
|
|
"epoch": 0.48517151838185046,
|
|
"grad_norm": 0.15732924149277266,
|
|
"learning_rate": 3.9048189375792586e-06,
|
|
"loss": 1.6565,
|
|
"mean_token_accuracy": 0.6525897264480591,
|
|
"num_tokens": 328289294.0,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"entropy": 1.71015625,
|
|
"epoch": 0.4865381987153205,
|
|
"grad_norm": 0.25366279123422036,
|
|
"learning_rate": 3.901296322389743e-06,
|
|
"loss": 1.7163,
|
|
"mean_token_accuracy": 0.645934396982193,
|
|
"num_tokens": 329184464.0,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"entropy": 1.715625,
|
|
"epoch": 0.4879048790487905,
|
|
"grad_norm": 0.2181917205860579,
|
|
"learning_rate": 3.897773707200226e-06,
|
|
"loss": 1.7078,
|
|
"mean_token_accuracy": 0.6466405630111695,
|
|
"num_tokens": 330093070.0,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"entropy": 1.6859375,
|
|
"epoch": 0.48927155938226047,
|
|
"grad_norm": 0.20588968594923218,
|
|
"learning_rate": 3.894251092010709e-06,
|
|
"loss": 1.6806,
|
|
"mean_token_accuracy": 0.649715393781662,
|
|
"num_tokens": 331040559.0,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"entropy": 1.625,
|
|
"epoch": 0.4906382397157305,
|
|
"grad_norm": 0.2981299687960081,
|
|
"learning_rate": 3.890728476821192e-06,
|
|
"loss": 1.6266,
|
|
"mean_token_accuracy": 0.6614166796207428,
|
|
"num_tokens": 331938859.0,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"entropy": 1.55,
|
|
"epoch": 0.4920049200492005,
|
|
"grad_norm": 0.15935284922623738,
|
|
"learning_rate": 3.8872058616316755e-06,
|
|
"loss": 1.5557,
|
|
"mean_token_accuracy": 0.6738888561725617,
|
|
"num_tokens": 332830313.0,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"entropy": 1.66484375,
|
|
"epoch": 0.49337160038267047,
|
|
"grad_norm": 0.18153782087004458,
|
|
"learning_rate": 3.883683246442159e-06,
|
|
"loss": 1.6535,
|
|
"mean_token_accuracy": 0.6549560844898223,
|
|
"num_tokens": 333744547.0,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"entropy": 1.67265625,
|
|
"epoch": 0.4947382807161405,
|
|
"grad_norm": 0.21929048431774537,
|
|
"learning_rate": 3.8801606312526426e-06,
|
|
"loss": 1.6704,
|
|
"mean_token_accuracy": 0.6537654757499695,
|
|
"num_tokens": 334714223.0,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"entropy": 1.596875,
|
|
"epoch": 0.4961049610496105,
|
|
"grad_norm": 0.24240391590808857,
|
|
"learning_rate": 3.876638016063125e-06,
|
|
"loss": 1.59,
|
|
"mean_token_accuracy": 0.6683739304542542,
|
|
"num_tokens": 335654019.0,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"entropy": 1.62734375,
|
|
"epoch": 0.4974716413830805,
|
|
"grad_norm": 0.21273092985460185,
|
|
"learning_rate": 3.873115400873609e-06,
|
|
"loss": 1.6309,
|
|
"mean_token_accuracy": 0.6601285755634307,
|
|
"num_tokens": 336536137.0,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"entropy": 1.73046875,
|
|
"epoch": 0.4988383217165505,
|
|
"grad_norm": 0.25784265532574685,
|
|
"learning_rate": 3.869592785684092e-06,
|
|
"loss": 1.7288,
|
|
"mean_token_accuracy": 0.6473394572734833,
|
|
"num_tokens": 337464769.0,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"entropy": 1.64375,
|
|
"epoch": 0.5002050020500205,
|
|
"grad_norm": 0.2241695491823867,
|
|
"learning_rate": 3.866070170494575e-06,
|
|
"loss": 1.6471,
|
|
"mean_token_accuracy": 0.6586786866188049,
|
|
"num_tokens": 338365103.0,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"entropy": 1.68984375,
|
|
"epoch": 0.5015716823834905,
|
|
"grad_norm": 0.3099782659184527,
|
|
"learning_rate": 3.862547555305059e-06,
|
|
"loss": 1.7088,
|
|
"mean_token_accuracy": 0.6488170504570008,
|
|
"num_tokens": 339329615.0,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"entropy": 1.67890625,
|
|
"epoch": 0.5029383627169605,
|
|
"grad_norm": 0.5053101251689461,
|
|
"learning_rate": 3.859024940115542e-06,
|
|
"loss": 1.7004,
|
|
"mean_token_accuracy": 0.649234163761139,
|
|
"num_tokens": 340262930.0,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"entropy": 1.6078125,
|
|
"epoch": 0.5043050430504306,
|
|
"grad_norm": 0.17004027933743457,
|
|
"learning_rate": 3.855502324926026e-06,
|
|
"loss": 1.6097,
|
|
"mean_token_accuracy": 0.6651304602622986,
|
|
"num_tokens": 341234135.0,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"entropy": 1.653125,
|
|
"epoch": 0.5056717233839005,
|
|
"grad_norm": 0.21296181507808432,
|
|
"learning_rate": 3.851979709736508e-06,
|
|
"loss": 1.6751,
|
|
"mean_token_accuracy": 0.654943197965622,
|
|
"num_tokens": 342141097.0,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"entropy": 1.66796875,
|
|
"epoch": 0.5070384037173705,
|
|
"grad_norm": 0.1528686281257018,
|
|
"learning_rate": 3.848457094546992e-06,
|
|
"loss": 1.6589,
|
|
"mean_token_accuracy": 0.6574116885662079,
|
|
"num_tokens": 343060170.0,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"entropy": 1.64296875,
|
|
"epoch": 0.5084050840508405,
|
|
"grad_norm": 0.2287380339305285,
|
|
"learning_rate": 3.8449344793574755e-06,
|
|
"loss": 1.6411,
|
|
"mean_token_accuracy": 0.6595912039279938,
|
|
"num_tokens": 344016737.0,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"entropy": 1.66171875,
|
|
"epoch": 0.5097717643843105,
|
|
"grad_norm": 0.2045735808428984,
|
|
"learning_rate": 3.841411864167959e-06,
|
|
"loss": 1.6701,
|
|
"mean_token_accuracy": 0.6537386178970337,
|
|
"num_tokens": 344927549.0,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"entropy": 1.68671875,
|
|
"epoch": 0.5111384447177805,
|
|
"grad_norm": 0.2237249075093675,
|
|
"learning_rate": 3.837889248978442e-06,
|
|
"loss": 1.7024,
|
|
"mean_token_accuracy": 0.6474375188350677,
|
|
"num_tokens": 345834686.0,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"entropy": 1.6234375,
|
|
"epoch": 0.5125051250512506,
|
|
"grad_norm": 0.28933725102379715,
|
|
"learning_rate": 3.834366633788925e-06,
|
|
"loss": 1.6348,
|
|
"mean_token_accuracy": 0.659646087884903,
|
|
"num_tokens": 346764499.0,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"entropy": 1.6796875,
|
|
"epoch": 0.5138718053847205,
|
|
"grad_norm": 0.2540999730139384,
|
|
"learning_rate": 3.830844018599408e-06,
|
|
"loss": 1.6772,
|
|
"mean_token_accuracy": 0.651112574338913,
|
|
"num_tokens": 347629135.0,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"entropy": 1.690625,
|
|
"epoch": 0.5152384857181905,
|
|
"grad_norm": 0.1811831753643154,
|
|
"learning_rate": 3.8273214034098915e-06,
|
|
"loss": 1.6841,
|
|
"mean_token_accuracy": 0.6522369027137757,
|
|
"num_tokens": 348554455.0,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"entropy": 1.75234375,
|
|
"epoch": 0.5166051660516605,
|
|
"grad_norm": 0.2684927006951954,
|
|
"learning_rate": 3.823798788220375e-06,
|
|
"loss": 1.7479,
|
|
"mean_token_accuracy": 0.6428231656551361,
|
|
"num_tokens": 349451212.0,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"entropy": 1.63515625,
|
|
"epoch": 0.5179718463851305,
|
|
"grad_norm": 0.19995291737567153,
|
|
"learning_rate": 3.820276173030859e-06,
|
|
"loss": 1.6468,
|
|
"mean_token_accuracy": 0.6596734344959259,
|
|
"num_tokens": 350354045.0,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"entropy": 1.6671875,
|
|
"epoch": 0.5193385267186005,
|
|
"grad_norm": 0.30557508258940264,
|
|
"learning_rate": 3.816753557841341e-06,
|
|
"loss": 1.6649,
|
|
"mean_token_accuracy": 0.6545178115367889,
|
|
"num_tokens": 351249612.0,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"entropy": 1.61875,
|
|
"epoch": 0.5207052070520706,
|
|
"grad_norm": 0.2471540650088555,
|
|
"learning_rate": 3.813230942651825e-06,
|
|
"loss": 1.6268,
|
|
"mean_token_accuracy": 0.659563934803009,
|
|
"num_tokens": 352120517.0,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"entropy": 1.659375,
|
|
"epoch": 0.5220718873855406,
|
|
"grad_norm": 0.1708595702723927,
|
|
"learning_rate": 3.809708327462308e-06,
|
|
"loss": 1.6659,
|
|
"mean_token_accuracy": 0.6510150551795959,
|
|
"num_tokens": 353020280.0,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"entropy": 1.70234375,
|
|
"epoch": 0.5234385677190105,
|
|
"grad_norm": 0.24194686968557427,
|
|
"learning_rate": 3.806185712272792e-06,
|
|
"loss": 1.721,
|
|
"mean_token_accuracy": 0.6481595039367676,
|
|
"num_tokens": 353962058.0,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"entropy": 1.7375,
|
|
"epoch": 0.5248052480524805,
|
|
"grad_norm": 0.16449855168928632,
|
|
"learning_rate": 3.802663097083275e-06,
|
|
"loss": 1.7489,
|
|
"mean_token_accuracy": 0.639789617061615,
|
|
"num_tokens": 354830861.0,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"entropy": 1.69140625,
|
|
"epoch": 0.5261719283859505,
|
|
"grad_norm": 0.22521274498027544,
|
|
"learning_rate": 3.799140481893758e-06,
|
|
"loss": 1.6939,
|
|
"mean_token_accuracy": 0.6508757054805756,
|
|
"num_tokens": 355723792.0,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 0.5275386087194205,
|
|
"grad_norm": 0.21145329448362576,
|
|
"learning_rate": 3.7956178667042413e-06,
|
|
"loss": 1.633,
|
|
"mean_token_accuracy": 0.662147045135498,
|
|
"num_tokens": 356608340.0,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"entropy": 1.58046875,
|
|
"epoch": 0.5289052890528906,
|
|
"grad_norm": 0.24643124032445554,
|
|
"learning_rate": 3.792095251514725e-06,
|
|
"loss": 1.5782,
|
|
"mean_token_accuracy": 0.6658946812152863,
|
|
"num_tokens": 357544707.0,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"entropy": 1.63125,
|
|
"epoch": 0.5302719693863606,
|
|
"grad_norm": 0.19733843409559623,
|
|
"learning_rate": 3.788572636325208e-06,
|
|
"loss": 1.6523,
|
|
"mean_token_accuracy": 0.6601674139499665,
|
|
"num_tokens": 358509164.0,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"entropy": 1.58828125,
|
|
"epoch": 0.5316386497198305,
|
|
"grad_norm": 0.29160163609915474,
|
|
"learning_rate": 3.7850500211356916e-06,
|
|
"loss": 1.586,
|
|
"mean_token_accuracy": 0.6661149859428406,
|
|
"num_tokens": 359444834.0,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"entropy": 1.6828125,
|
|
"epoch": 0.5330053300533005,
|
|
"grad_norm": 0.25803896305733953,
|
|
"learning_rate": 3.7815274059461747e-06,
|
|
"loss": 1.6868,
|
|
"mean_token_accuracy": 0.6517741024494171,
|
|
"num_tokens": 360362734.0,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"entropy": 1.65234375,
|
|
"epoch": 0.5343720103867705,
|
|
"grad_norm": 0.2645148043230131,
|
|
"learning_rate": 3.7780047907566582e-06,
|
|
"loss": 1.6778,
|
|
"mean_token_accuracy": 0.6516787827014923,
|
|
"num_tokens": 361307485.0,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"entropy": 1.63828125,
|
|
"epoch": 0.5357386907202405,
|
|
"grad_norm": 0.15527737758062368,
|
|
"learning_rate": 3.7744821755671413e-06,
|
|
"loss": 1.6491,
|
|
"mean_token_accuracy": 0.6607016623020172,
|
|
"num_tokens": 362256997.0,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"entropy": 1.71328125,
|
|
"epoch": 0.5371053710537106,
|
|
"grad_norm": 0.2512237436079019,
|
|
"learning_rate": 3.7709595603776245e-06,
|
|
"loss": 1.7585,
|
|
"mean_token_accuracy": 0.6436333239078522,
|
|
"num_tokens": 363189445.0,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"entropy": 1.69921875,
|
|
"epoch": 0.5384720513871806,
|
|
"grad_norm": 0.19741682544380015,
|
|
"learning_rate": 3.767436945188108e-06,
|
|
"loss": 1.7087,
|
|
"mean_token_accuracy": 0.6459274351596832,
|
|
"num_tokens": 364095793.0,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"entropy": 1.62421875,
|
|
"epoch": 0.5398387317206506,
|
|
"grad_norm": 0.26290621431275274,
|
|
"learning_rate": 3.7639143299985916e-06,
|
|
"loss": 1.6273,
|
|
"mean_token_accuracy": 0.6606081128120422,
|
|
"num_tokens": 365015404.0,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"entropy": 1.6734375,
|
|
"epoch": 0.5412054120541205,
|
|
"grad_norm": 0.18941065506474034,
|
|
"learning_rate": 3.7603917148090747e-06,
|
|
"loss": 1.6858,
|
|
"mean_token_accuracy": 0.6511177361011505,
|
|
"num_tokens": 365939650.0,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 0.5425720923875905,
|
|
"grad_norm": 0.17974375077790267,
|
|
"learning_rate": 3.756869099619558e-06,
|
|
"loss": 1.6478,
|
|
"mean_token_accuracy": 0.6589507341384888,
|
|
"num_tokens": 366874776.0,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"entropy": 1.640625,
|
|
"epoch": 0.5439387727210605,
|
|
"grad_norm": 0.18336748344501613,
|
|
"learning_rate": 3.753346484430041e-06,
|
|
"loss": 1.6535,
|
|
"mean_token_accuracy": 0.6572046995162963,
|
|
"num_tokens": 367790549.0,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"entropy": 1.59921875,
|
|
"epoch": 0.5453054530545306,
|
|
"grad_norm": 0.20846379625959024,
|
|
"learning_rate": 3.749823869240524e-06,
|
|
"loss": 1.5989,
|
|
"mean_token_accuracy": 0.6658934652805328,
|
|
"num_tokens": 368700792.0,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"entropy": 1.62734375,
|
|
"epoch": 0.5466721333880006,
|
|
"grad_norm": 0.2486414329500364,
|
|
"learning_rate": 3.746301254051008e-06,
|
|
"loss": 1.6292,
|
|
"mean_token_accuracy": 0.6615051388740539,
|
|
"num_tokens": 369616202.0,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"entropy": 1.60234375,
|
|
"epoch": 0.5480388137214706,
|
|
"grad_norm": 0.24005626931657958,
|
|
"learning_rate": 3.742778638861491e-06,
|
|
"loss": 1.6001,
|
|
"mean_token_accuracy": 0.6612799882888794,
|
|
"num_tokens": 370524876.0,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"entropy": 1.6875,
|
|
"epoch": 0.5494054940549405,
|
|
"grad_norm": 0.3123109823359765,
|
|
"learning_rate": 3.7392560236719743e-06,
|
|
"loss": 1.6844,
|
|
"mean_token_accuracy": 0.6514267683029175,
|
|
"num_tokens": 371404894.0,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"entropy": 1.5578125,
|
|
"epoch": 0.5507721743884105,
|
|
"grad_norm": 0.1643981941689114,
|
|
"learning_rate": 3.7357334084824574e-06,
|
|
"loss": 1.5585,
|
|
"mean_token_accuracy": 0.671681421995163,
|
|
"num_tokens": 372321759.0,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"entropy": 1.63828125,
|
|
"epoch": 0.5521388547218805,
|
|
"grad_norm": 0.3317462757167787,
|
|
"learning_rate": 3.732210793292941e-06,
|
|
"loss": 1.6375,
|
|
"mean_token_accuracy": 0.6593451857566833,
|
|
"num_tokens": 373250468.0,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"entropy": 1.72734375,
|
|
"epoch": 0.5535055350553506,
|
|
"grad_norm": 0.19075606866690867,
|
|
"learning_rate": 3.7286881781034245e-06,
|
|
"loss": 1.7317,
|
|
"mean_token_accuracy": 0.6436935007572174,
|
|
"num_tokens": 374193358.0,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"entropy": 1.65546875,
|
|
"epoch": 0.5548722153888206,
|
|
"grad_norm": 0.20140829324857198,
|
|
"learning_rate": 3.7251655629139076e-06,
|
|
"loss": 1.6731,
|
|
"mean_token_accuracy": 0.6550421178340912,
|
|
"num_tokens": 375146859.0,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"entropy": 1.675,
|
|
"epoch": 0.5562388957222906,
|
|
"grad_norm": 0.2170296843256562,
|
|
"learning_rate": 3.7216429477243907e-06,
|
|
"loss": 1.6734,
|
|
"mean_token_accuracy": 0.6509766280651093,
|
|
"num_tokens": 376072121.0,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"entropy": 1.66171875,
|
|
"epoch": 0.5576055760557606,
|
|
"grad_norm": 0.18816481235424165,
|
|
"learning_rate": 3.7181203325348743e-06,
|
|
"loss": 1.6734,
|
|
"mean_token_accuracy": 0.6544120907783508,
|
|
"num_tokens": 377038524.0,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"entropy": 1.77109375,
|
|
"epoch": 0.5589722563892305,
|
|
"grad_norm": 0.23019474363211556,
|
|
"learning_rate": 3.7145977173453574e-06,
|
|
"loss": 1.7681,
|
|
"mean_token_accuracy": 0.6388165116310119,
|
|
"num_tokens": 377934801.0,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"entropy": 1.6515625,
|
|
"epoch": 0.5603389367227005,
|
|
"grad_norm": 0.2110746735280876,
|
|
"learning_rate": 3.7110751021558405e-06,
|
|
"loss": 1.6549,
|
|
"mean_token_accuracy": 0.6555174589157104,
|
|
"num_tokens": 378834064.0,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"entropy": 1.61953125,
|
|
"epoch": 0.5617056170561706,
|
|
"grad_norm": 0.2304705406739752,
|
|
"learning_rate": 3.7075524869663245e-06,
|
|
"loss": 1.6266,
|
|
"mean_token_accuracy": 0.6572926700115204,
|
|
"num_tokens": 379788042.0,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 0.5630722973896406,
|
|
"grad_norm": 0.15061014389814953,
|
|
"learning_rate": 3.7040298717768076e-06,
|
|
"loss": 1.6473,
|
|
"mean_token_accuracy": 0.6579816997051239,
|
|
"num_tokens": 380736699.0,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"entropy": 1.678125,
|
|
"epoch": 0.5644389777231106,
|
|
"grad_norm": 0.2846668677306685,
|
|
"learning_rate": 3.7005072565872908e-06,
|
|
"loss": 1.6681,
|
|
"mean_token_accuracy": 0.6524290621280671,
|
|
"num_tokens": 381695330.0,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"entropy": 1.6421875,
|
|
"epoch": 0.5658056580565806,
|
|
"grad_norm": 0.2111695213146071,
|
|
"learning_rate": 3.696984641397774e-06,
|
|
"loss": 1.6437,
|
|
"mean_token_accuracy": 0.6567857503890991,
|
|
"num_tokens": 382611594.0,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"entropy": 1.59140625,
|
|
"epoch": 0.5671723383900505,
|
|
"grad_norm": 0.33056971991335293,
|
|
"learning_rate": 3.693462026208257e-06,
|
|
"loss": 1.5984,
|
|
"mean_token_accuracy": 0.6648770630359649,
|
|
"num_tokens": 383533751.0,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"entropy": 1.6671875,
|
|
"epoch": 0.5685390187235205,
|
|
"grad_norm": 0.2743036452178273,
|
|
"learning_rate": 3.689939411018741e-06,
|
|
"loss": 1.6949,
|
|
"mean_token_accuracy": 0.6541567146778107,
|
|
"num_tokens": 384499544.0,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"entropy": 1.58671875,
|
|
"epoch": 0.5699056990569906,
|
|
"grad_norm": 0.19281321569097268,
|
|
"learning_rate": 3.686416795829224e-06,
|
|
"loss": 1.5792,
|
|
"mean_token_accuracy": 0.6671958446502686,
|
|
"num_tokens": 385411247.0,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"entropy": 1.67578125,
|
|
"epoch": 0.5712723793904606,
|
|
"grad_norm": 0.19416174227609442,
|
|
"learning_rate": 3.6828941806397072e-06,
|
|
"loss": 1.6802,
|
|
"mean_token_accuracy": 0.6533600330352783,
|
|
"num_tokens": 386303069.0,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"entropy": 1.66015625,
|
|
"epoch": 0.5726390597239306,
|
|
"grad_norm": 0.17877806288697692,
|
|
"learning_rate": 3.6793715654501903e-06,
|
|
"loss": 1.6653,
|
|
"mean_token_accuracy": 0.6542647123336792,
|
|
"num_tokens": 387245106.0,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"entropy": 1.6484375,
|
|
"epoch": 0.5740057400574006,
|
|
"grad_norm": 0.23520125010236165,
|
|
"learning_rate": 3.6758489502606735e-06,
|
|
"loss": 1.667,
|
|
"mean_token_accuracy": 0.6581320583820343,
|
|
"num_tokens": 388221203.0,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 0.5753724203908706,
|
|
"grad_norm": 0.16989531221761256,
|
|
"learning_rate": 3.672326335071157e-06,
|
|
"loss": 1.6368,
|
|
"mean_token_accuracy": 0.6592749536037446,
|
|
"num_tokens": 389119645.0,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"entropy": 1.628125,
|
|
"epoch": 0.5767391007243405,
|
|
"grad_norm": 0.2281349991856975,
|
|
"learning_rate": 3.6688037198816406e-06,
|
|
"loss": 1.6501,
|
|
"mean_token_accuracy": 0.659129387140274,
|
|
"num_tokens": 390035268.0,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"entropy": 1.63671875,
|
|
"epoch": 0.5781057810578106,
|
|
"grad_norm": 0.23886862523545843,
"learning_rate": 3.6652811046921237e-06,
"loss": 1.6379,
"mean_token_accuracy": 0.6606041729450226,
"num_tokens": 390926344.0,
"step": 4230
},
{
"entropy": 1.6140625,
"epoch": 0.5794724613912806,
"grad_norm": 0.17197237920086447,
"learning_rate": 3.6617584895026072e-06,
"loss": 1.6187,
"mean_token_accuracy": 0.6619129717350006,
"num_tokens": 391814160.0,
"step": 4240
},
{
"entropy": 1.7375,
"epoch": 0.5808391417247506,
"grad_norm": 0.20044410036671537,
"learning_rate": 3.6582358743130904e-06,
"loss": 1.743,
"mean_token_accuracy": 0.6392210423946381,
"num_tokens": 392739541.0,
"step": 4250
},
{
|
|
"entropy": 1.6515625,
|
|
"epoch": 0.5822058220582206,
|
|
"grad_norm": 0.234478587717973,
|
|
"learning_rate": 3.6547132591235735e-06,
|
|
"loss": 1.6494,
|
|
"mean_token_accuracy": 0.6560069978237152,
|
|
"num_tokens": 393627770.0,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"entropy": 1.634375,
|
|
"epoch": 0.5835725023916906,
|
|
"grad_norm": 0.20739522775289165,
|
|
"learning_rate": 3.651190643934057e-06,
|
|
"loss": 1.639,
|
|
"mean_token_accuracy": 0.6585087239742279,
|
|
"num_tokens": 394490257.0,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"entropy": 1.6390625,
|
|
"epoch": 0.5849391827251605,
|
|
"grad_norm": 0.23573921385985488,
|
|
"learning_rate": 3.6476680287445406e-06,
|
|
"loss": 1.6494,
|
|
"mean_token_accuracy": 0.6587025821208954,
|
|
"num_tokens": 395437787.0,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"entropy": 1.58671875,
|
|
"epoch": 0.5863058630586306,
|
|
"grad_norm": 0.18527869020606189,
|
|
"learning_rate": 3.6441454135550237e-06,
|
|
"loss": 1.5727,
|
|
"mean_token_accuracy": 0.6699006855487823,
|
|
"num_tokens": 396384905.0,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"entropy": 1.653125,
|
|
"epoch": 0.5876725433921006,
|
|
"grad_norm": 0.23255751919041817,
|
|
"learning_rate": 3.640622798365507e-06,
|
|
"loss": 1.6699,
|
|
"mean_token_accuracy": 0.6523589253425598,
|
|
"num_tokens": 397315080.0,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"entropy": 1.7171875,
|
|
"epoch": 0.5890392237255706,
|
|
"grad_norm": 0.16598230117249796,
|
|
"learning_rate": 3.63710018317599e-06,
|
|
"loss": 1.7154,
|
|
"mean_token_accuracy": 0.6448399722576141,
|
|
"num_tokens": 398245261.0,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"entropy": 1.74921875,
|
|
"epoch": 0.5904059040590406,
|
|
"grad_norm": 0.18104337291428105,
|
|
"learning_rate": 3.633577567986473e-06,
|
|
"loss": 1.7533,
|
|
"mean_token_accuracy": 0.6413149118423462,
|
|
"num_tokens": 399147190.0,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"entropy": 1.65078125,
|
|
"epoch": 0.5917725843925106,
|
|
"grad_norm": 0.21679820897976643,
|
|
"learning_rate": 3.630054952796957e-06,
|
|
"loss": 1.651,
|
|
"mean_token_accuracy": 0.6576019763946533,
|
|
"num_tokens": 400040469.0,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"entropy": 1.721875,
|
|
"epoch": 0.5931392647259806,
|
|
"grad_norm": 0.19561803952663084,
|
|
"learning_rate": 3.62653233760744e-06,
|
|
"loss": 1.7267,
|
|
"mean_token_accuracy": 0.6444593131542206,
|
|
"num_tokens": 400965704.0,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 0.5945059450594506,
|
|
"grad_norm": 0.192946306810251,
|
|
"learning_rate": 3.6230097224179233e-06,
|
|
"loss": 1.6248,
|
|
"mean_token_accuracy": 0.6631713151931763,
|
|
"num_tokens": 401779462.0,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"entropy": 1.65234375,
|
|
"epoch": 0.5958726253929206,
|
|
"grad_norm": 0.20741351190685572,
|
|
"learning_rate": 3.6194871072284064e-06,
|
|
"loss": 1.6579,
|
|
"mean_token_accuracy": 0.6557525336742401,
|
|
"num_tokens": 402719415.0,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"entropy": 1.68203125,
|
|
"epoch": 0.5972393057263906,
|
|
"grad_norm": 0.19259466843087938,
|
|
"learning_rate": 3.61596449203889e-06,
|
|
"loss": 1.683,
|
|
"mean_token_accuracy": 0.6521771252155304,
|
|
"num_tokens": 403638976.0,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"entropy": 1.56171875,
|
|
"epoch": 0.5986059860598606,
|
|
"grad_norm": 0.22358247326502587,
|
|
"learning_rate": 3.6124418768493735e-06,
|
|
"loss": 1.5563,
|
|
"mean_token_accuracy": 0.6721363186836242,
|
|
"num_tokens": 404548427.0,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"entropy": 1.71640625,
|
|
"epoch": 0.5999726663933306,
|
|
"grad_norm": 0.2269508225506426,
|
|
"learning_rate": 3.6089192616598566e-06,
|
|
"loss": 1.7302,
|
|
"mean_token_accuracy": 0.645664519071579,
|
|
"num_tokens": 405533901.0,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"entropy": 1.69921875,
|
|
"epoch": 0.6013393467268006,
|
|
"grad_norm": 0.21076367430212703,
|
|
"learning_rate": 3.6053966464703398e-06,
|
|
"loss": 1.7115,
|
|
"mean_token_accuracy": 0.6481589138507843,
|
|
"num_tokens": 406430118.0,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"entropy": 1.73125,
|
|
"epoch": 0.6027060270602707,
|
|
"grad_norm": 0.2252991955438223,
|
|
"learning_rate": 3.6018740312808233e-06,
|
|
"loss": 1.7427,
|
|
"mean_token_accuracy": 0.6420607984066009,
|
|
"num_tokens": 407418450.0,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"entropy": 1.66328125,
|
|
"epoch": 0.6040727073937406,
|
|
"grad_norm": 0.20728664513610615,
|
|
"learning_rate": 3.5983514160913064e-06,
|
|
"loss": 1.6752,
|
|
"mean_token_accuracy": 0.6496328294277192,
|
|
"num_tokens": 408358905.0,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"entropy": 1.671875,
|
|
"epoch": 0.6054393877272106,
|
|
"grad_norm": 0.167747688558747,
|
|
"learning_rate": 3.5948288009017895e-06,
|
|
"loss": 1.6731,
|
|
"mean_token_accuracy": 0.6509340167045593,
|
|
"num_tokens": 409287265.0,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"entropy": 1.665625,
|
|
"epoch": 0.6068060680606806,
|
|
"grad_norm": 0.19657976411482558,
|
|
"learning_rate": 3.591306185712273e-06,
|
|
"loss": 1.6631,
|
|
"mean_token_accuracy": 0.6542129218578339,
|
|
"num_tokens": 410205143.0,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"entropy": 1.76015625,
|
|
"epoch": 0.6081727483941506,
|
|
"grad_norm": 0.2118672984021604,
|
|
"learning_rate": 3.5877835705227566e-06,
|
|
"loss": 1.7522,
|
|
"mean_token_accuracy": 0.641113555431366,
|
|
"num_tokens": 411105235.0,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"entropy": 1.6421875,
|
|
"epoch": 0.6095394287276206,
|
|
"grad_norm": 0.28802968361134224,
|
|
"learning_rate": 3.5842609553332398e-06,
|
|
"loss": 1.6553,
|
|
"mean_token_accuracy": 0.6568327248096466,
|
|
"num_tokens": 412024641.0,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"entropy": 1.7,
|
|
"epoch": 0.6109061090610907,
|
|
"grad_norm": 0.24300715852187554,
|
|
"learning_rate": 3.580738340143723e-06,
|
|
"loss": 1.7098,
|
|
"mean_token_accuracy": 0.6475836753845214,
|
|
"num_tokens": 412940185.0,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"entropy": 1.65625,
|
|
"epoch": 0.6122727893945606,
|
|
"grad_norm": 0.21638835043346813,
|
|
"learning_rate": 3.577215724954206e-06,
|
|
"loss": 1.6694,
|
|
"mean_token_accuracy": 0.6550139427185059,
|
|
"num_tokens": 413896416.0,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"entropy": 1.71171875,
|
|
"epoch": 0.6136394697280306,
|
|
"grad_norm": 0.21076833140710655,
|
|
"learning_rate": 3.57369310976469e-06,
|
|
"loss": 1.7195,
|
|
"mean_token_accuracy": 0.6436967372894287,
|
|
"num_tokens": 414778446.0,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"entropy": 1.6578125,
|
|
"epoch": 0.6150061500615006,
|
|
"grad_norm": 0.1530450920508333,
|
|
"learning_rate": 3.570170494575173e-06,
|
|
"loss": 1.6656,
|
|
"mean_token_accuracy": 0.6567609906196594,
|
|
"num_tokens": 415726052.0,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"entropy": 1.759375,
|
|
"epoch": 0.6163728303949706,
|
|
"grad_norm": 0.22947045310976946,
|
|
"learning_rate": 3.5666478793856562e-06,
|
|
"loss": 1.7842,
|
|
"mean_token_accuracy": 0.6364096999168396,
|
|
"num_tokens": 416632160.0,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"entropy": 1.65,
|
|
"epoch": 0.6177395107284406,
|
|
"grad_norm": 0.20188578155128611,
|
|
"learning_rate": 3.5631252641961394e-06,
|
|
"loss": 1.6557,
|
|
"mean_token_accuracy": 0.6571161091327667,
|
|
"num_tokens": 417520417.0,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"entropy": 1.590625,
|
|
"epoch": 0.6191061910619107,
|
|
"grad_norm": 0.23795111313423412,
|
|
"learning_rate": 3.5596026490066225e-06,
|
|
"loss": 1.6095,
|
|
"mean_token_accuracy": 0.662272572517395,
|
|
"num_tokens": 418468254.0,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"entropy": 1.61875,
|
|
"epoch": 0.6204728713953807,
|
|
"grad_norm": 0.17696178773894927,
|
|
"learning_rate": 3.556080033817106e-06,
|
|
"loss": 1.6455,
|
|
"mean_token_accuracy": 0.6585144519805908,
|
|
"num_tokens": 419362345.0,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"entropy": 1.7265625,
|
|
"epoch": 0.6218395517288506,
|
|
"grad_norm": 0.20486774160561483,
|
|
"learning_rate": 3.5525574186275896e-06,
|
|
"loss": 1.7202,
|
|
"mean_token_accuracy": 0.6456796526908875,
|
|
"num_tokens": 420251807.0,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"entropy": 1.69765625,
|
|
"epoch": 0.6232062320623206,
|
|
"grad_norm": 0.19876896043303036,
|
|
"learning_rate": 3.5490348034380727e-06,
|
|
"loss": 1.7188,
|
|
"mean_token_accuracy": 0.6455161273479462,
|
|
"num_tokens": 421183075.0,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"entropy": 1.6796875,
|
|
"epoch": 0.6245729123957906,
|
|
"grad_norm": 0.22917459127644327,
|
|
"learning_rate": 3.545512188248556e-06,
|
|
"loss": 1.68,
|
|
"mean_token_accuracy": 0.6511856973171234,
|
|
"num_tokens": 422078836.0,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"entropy": 1.60703125,
|
|
"epoch": 0.6259395927292606,
|
|
"grad_norm": 0.2179088266100347,
|
|
"learning_rate": 3.5419895730590394e-06,
|
|
"loss": 1.6183,
|
|
"mean_token_accuracy": 0.6595957577228546,
|
|
"num_tokens": 422966154.0,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"entropy": 1.67109375,
|
|
"epoch": 0.6273062730627307,
|
|
"grad_norm": 0.21805921331822678,
|
|
"learning_rate": 3.5384669578695225e-06,
|
|
"loss": 1.6688,
|
|
"mean_token_accuracy": 0.6532413601875305,
|
|
"num_tokens": 423878014.0,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"entropy": 1.71328125,
|
|
"epoch": 0.6286729533962007,
|
|
"grad_norm": 0.20847137953036965,
|
|
"learning_rate": 3.534944342680006e-06,
|
|
"loss": 1.7115,
|
|
"mean_token_accuracy": 0.6483781933784485,
|
|
"num_tokens": 424831235.0,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"entropy": 1.61484375,
|
|
"epoch": 0.6300396337296706,
|
|
"grad_norm": 0.19265327376292984,
|
|
"learning_rate": 3.5314217274904896e-06,
|
|
"loss": 1.6247,
|
|
"mean_token_accuracy": 0.6574327886104584,
|
|
"num_tokens": 425757426.0,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"entropy": 1.68203125,
|
|
"epoch": 0.6314063140631406,
|
|
"grad_norm": 0.2962812684931281,
|
|
"learning_rate": 3.5278991123009727e-06,
|
|
"loss": 1.6936,
|
|
"mean_token_accuracy": 0.651179188489914,
|
|
"num_tokens": 426670047.0,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"entropy": 1.69296875,
|
|
"epoch": 0.6327729943966106,
|
|
"grad_norm": 0.19794268059991654,
|
|
"learning_rate": 3.524376497111456e-06,
|
|
"loss": 1.7142,
|
|
"mean_token_accuracy": 0.6434573948383331,
|
|
"num_tokens": 427615118.0,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"entropy": 1.6765625,
|
|
"epoch": 0.6341396747300806,
|
|
"grad_norm": 0.23245895102037525,
|
|
"learning_rate": 3.520853881921939e-06,
|
|
"loss": 1.673,
|
|
"mean_token_accuracy": 0.6545408129692077,
|
|
"num_tokens": 428530022.0,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"entropy": 1.68515625,
|
|
"epoch": 0.6355063550635507,
|
|
"grad_norm": 0.19832535479948477,
|
|
"learning_rate": 3.517331266732422e-06,
|
|
"loss": 1.6953,
|
|
"mean_token_accuracy": 0.6520774722099304,
|
|
"num_tokens": 429487912.0,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"entropy": 1.74140625,
|
|
"epoch": 0.6368730353970207,
|
|
"grad_norm": 0.20540506384717855,
|
|
"learning_rate": 3.513808651542906e-06,
|
|
"loss": 1.7567,
|
|
"mean_token_accuracy": 0.6438647329807281,
|
|
"num_tokens": 430398559.0,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"entropy": 1.62109375,
|
|
"epoch": 0.6382397157304907,
|
|
"grad_norm": 0.24618671792335373,
|
|
"learning_rate": 3.510286036353389e-06,
|
|
"loss": 1.6181,
|
|
"mean_token_accuracy": 0.6616644740104676,
|
|
"num_tokens": 431265815.0,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 0.6396063960639606,
|
|
"grad_norm": 0.18687189260492004,
|
|
"learning_rate": 3.5067634211638723e-06,
|
|
"loss": 1.6482,
|
|
"mean_token_accuracy": 0.6573707282543182,
|
|
"num_tokens": 432227360.0,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 0.6409730763974306,
|
|
"grad_norm": 0.1984220567698712,
|
|
"learning_rate": 3.5032408059743554e-06,
|
|
"loss": 1.6485,
|
|
"mean_token_accuracy": 0.6554848730564118,
|
|
"num_tokens": 433164830.0,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"entropy": 1.653125,
|
|
"epoch": 0.6423397567309006,
|
|
"grad_norm": 0.19294644218207155,
|
|
"learning_rate": 3.499718190784839e-06,
|
|
"loss": 1.6723,
|
|
"mean_token_accuracy": 0.6523234486579895,
|
|
"num_tokens": 434040729.0,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"entropy": 1.68125,
|
|
"epoch": 0.6437064370643707,
|
|
"grad_norm": 0.22143174940322394,
|
|
"learning_rate": 3.4961955755953225e-06,
|
|
"loss": 1.6844,
|
|
"mean_token_accuracy": 0.6519472539424896,
|
|
"num_tokens": 434968171.0,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"entropy": 1.7171875,
|
|
"epoch": 0.6450731173978407,
|
|
"grad_norm": 0.26712944027234947,
|
|
"learning_rate": 3.4926729604058056e-06,
|
|
"loss": 1.7131,
|
|
"mean_token_accuracy": 0.6484714627265931,
|
|
"num_tokens": 435882982.0,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"entropy": 1.66484375,
|
|
"epoch": 0.6464397977313107,
|
|
"grad_norm": 0.1941348285960509,
|
|
"learning_rate": 3.4891503452162888e-06,
|
|
"loss": 1.6533,
|
|
"mean_token_accuracy": 0.6586045563220978,
|
|
"num_tokens": 436755207.0,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"entropy": 1.61640625,
|
|
"epoch": 0.6478064780647806,
|
|
"grad_norm": 0.20998759301924633,
|
|
"learning_rate": 3.4856277300267723e-06,
|
|
"loss": 1.6268,
|
|
"mean_token_accuracy": 0.6603692710399628,
|
|
"num_tokens": 437650644.0,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"entropy": 1.6515625,
|
|
"epoch": 0.6491731583982506,
|
|
"grad_norm": 0.20112996455069587,
|
|
"learning_rate": 3.4821051148372554e-06,
|
|
"loss": 1.6572,
|
|
"mean_token_accuracy": 0.6544272065162658,
|
|
"num_tokens": 438548806.0,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"entropy": 1.584375,
|
|
"epoch": 0.6505398387317206,
|
|
"grad_norm": 0.19242033539709882,
|
|
"learning_rate": 3.4785824996477386e-06,
|
|
"loss": 1.5769,
|
|
"mean_token_accuracy": 0.6659819543361664,
|
|
"num_tokens": 439428891.0,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"entropy": 1.56875,
|
|
"epoch": 0.6519065190651907,
|
|
"grad_norm": 0.16292328065823433,
|
|
"learning_rate": 3.475059884458222e-06,
|
|
"loss": 1.5877,
|
|
"mean_token_accuracy": 0.6646440148353576,
|
|
"num_tokens": 440339921.0,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"entropy": 1.596875,
|
|
"epoch": 0.6532731993986607,
|
|
"grad_norm": 0.2658043620004993,
|
|
"learning_rate": 3.4715372692687057e-06,
|
|
"loss": 1.6042,
|
|
"mean_token_accuracy": 0.6633561670780181,
|
|
"num_tokens": 441219354.0,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"entropy": 1.68828125,
|
|
"epoch": 0.6546398797321307,
|
|
"grad_norm": 0.1820011605581968,
|
|
"learning_rate": 3.4680146540791888e-06,
|
|
"loss": 1.6952,
|
|
"mean_token_accuracy": 0.6489320158958435,
|
|
"num_tokens": 442157810.0,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"entropy": 1.68046875,
|
|
"epoch": 0.6560065600656007,
|
|
"grad_norm": 0.20402494718445974,
|
|
"learning_rate": 3.464492038889672e-06,
|
|
"loss": 1.6742,
|
|
"mean_token_accuracy": 0.6539379000663758,
|
|
"num_tokens": 443030410.0,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"entropy": 1.6828125,
|
|
"epoch": 0.6573732403990706,
|
|
"grad_norm": 0.19224377395104286,
|
|
"learning_rate": 3.460969423700155e-06,
|
|
"loss": 1.6977,
|
|
"mean_token_accuracy": 0.6471102356910705,
|
|
"num_tokens": 444000582.0,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"entropy": 1.665625,
|
|
"epoch": 0.6587399207325406,
|
|
"grad_norm": 0.1554137395705988,
|
|
"learning_rate": 3.457446808510639e-06,
|
|
"loss": 1.6535,
|
|
"mean_token_accuracy": 0.6570299148559571,
|
|
"num_tokens": 444938892.0,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"entropy": 1.6265625,
|
|
"epoch": 0.6601066010660107,
|
|
"grad_norm": 0.22921451076597885,
|
|
"learning_rate": 3.453924193321122e-06,
|
|
"loss": 1.6493,
|
|
"mean_token_accuracy": 0.6562624573707581,
|
|
"num_tokens": 445901641.0,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"entropy": 1.6140625,
|
|
"epoch": 0.6614732813994807,
|
|
"grad_norm": 0.23841879620388756,
|
|
"learning_rate": 3.4504015781316052e-06,
|
|
"loss": 1.5994,
|
|
"mean_token_accuracy": 0.6656190454959869,
|
|
"num_tokens": 446853273.0,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"entropy": 1.7203125,
|
|
"epoch": 0.6628399617329507,
|
|
"grad_norm": 0.2192624900239513,
|
|
"learning_rate": 3.4468789629420884e-06,
|
|
"loss": 1.7335,
|
|
"mean_token_accuracy": 0.6448606073856353,
|
|
"num_tokens": 447838189.0,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"entropy": 1.60859375,
|
|
"epoch": 0.6642066420664207,
|
|
"grad_norm": 0.25485390827705806,
|
|
"learning_rate": 3.4433563477525715e-06,
|
|
"loss": 1.6246,
|
|
"mean_token_accuracy": 0.6624457478523255,
|
|
"num_tokens": 448755053.0,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"entropy": 1.63125,
|
|
"epoch": 0.6655733223998906,
|
|
"grad_norm": 0.1900297139146924,
|
|
"learning_rate": 3.439833732563055e-06,
|
|
"loss": 1.6358,
|
|
"mean_token_accuracy": 0.6578743159770966,
|
|
"num_tokens": 449667742.0,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"entropy": 1.68828125,
|
|
"epoch": 0.6669400027333606,
|
|
"grad_norm": 0.27473309189551964,
|
|
"learning_rate": 3.4363111173735386e-06,
|
|
"loss": 1.7047,
|
|
"mean_token_accuracy": 0.6475667953491211,
|
|
"num_tokens": 450632268.0,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"entropy": 1.6703125,
|
|
"epoch": 0.6683066830668307,
|
|
"grad_norm": 0.23047268900753726,
|
|
"learning_rate": 3.4327885021840217e-06,
|
|
"loss": 1.6802,
|
|
"mean_token_accuracy": 0.6540661036968232,
|
|
"num_tokens": 451577145.0,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"entropy": 1.66484375,
|
|
"epoch": 0.6696733634003007,
|
|
"grad_norm": 0.1847664852961655,
|
|
"learning_rate": 3.429265886994505e-06,
|
|
"loss": 1.6831,
|
|
"mean_token_accuracy": 0.6519534885883331,
|
|
"num_tokens": 452536825.0,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"entropy": 1.6484375,
|
|
"epoch": 0.6710400437337707,
|
|
"grad_norm": 0.2603694908454456,
|
|
"learning_rate": 3.4257432718049884e-06,
|
|
"loss": 1.6444,
|
|
"mean_token_accuracy": 0.6579820215702057,
|
|
"num_tokens": 453483269.0,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 0.6724067240672407,
|
|
"grad_norm": 0.21653181663350327,
|
|
"learning_rate": 3.4222206566154715e-06,
|
|
"loss": 1.6668,
|
|
"mean_token_accuracy": 0.6559320509433746,
|
|
"num_tokens": 454442072.0,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"entropy": 1.703125,
|
|
"epoch": 0.6737734044007107,
|
|
"grad_norm": 0.20728096178321426,
|
|
"learning_rate": 3.418698041425955e-06,
|
|
"loss": 1.7105,
|
|
"mean_token_accuracy": 0.6445403814315795,
|
|
"num_tokens": 455350410.0,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"entropy": 1.6046875,
|
|
"epoch": 0.6751400847341806,
|
|
"grad_norm": 0.1889194133100524,
|
|
"learning_rate": 3.415175426236438e-06,
|
|
"loss": 1.6133,
|
|
"mean_token_accuracy": 0.6634906947612762,
|
|
"num_tokens": 456300176.0,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"entropy": 1.55078125,
|
|
"epoch": 0.6765067650676507,
|
|
"grad_norm": 0.22593232593749235,
|
|
"learning_rate": 3.4116528110469217e-06,
|
|
"loss": 1.5498,
|
|
"mean_token_accuracy": 0.6766405522823333,
|
|
"num_tokens": 457228439.0,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"entropy": 1.60390625,
|
|
"epoch": 0.6778734454011207,
|
|
"grad_norm": 0.2281538727729559,
|
|
"learning_rate": 3.408130195857405e-06,
|
|
"loss": 1.5933,
|
|
"mean_token_accuracy": 0.6665524125099183,
|
|
"num_tokens": 458107783.0,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"entropy": 1.6453125,
|
|
"epoch": 0.6792401257345907,
|
|
"grad_norm": 0.21504753411742764,
|
|
"learning_rate": 3.404607580667888e-06,
|
|
"loss": 1.6465,
|
|
"mean_token_accuracy": 0.6576745390892029,
|
|
"num_tokens": 459041724.0,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"entropy": 1.5703125,
|
|
"epoch": 0.6806068060680607,
|
|
"grad_norm": 0.19826386338388483,
|
|
"learning_rate": 3.401084965478371e-06,
|
|
"loss": 1.5774,
|
|
"mean_token_accuracy": 0.6697787880897522,
|
|
"num_tokens": 459944785.0,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"entropy": 1.6703125,
|
|
"epoch": 0.6819734864015307,
|
|
"grad_norm": 0.2204719453662286,
|
|
"learning_rate": 3.397562350288855e-06,
|
|
"loss": 1.685,
|
|
"mean_token_accuracy": 0.6524436175823212,
|
|
"num_tokens": 460873882.0,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"entropy": 1.65078125,
|
|
"epoch": 0.6833401667350006,
|
|
"grad_norm": 0.1868308606836345,
|
|
"learning_rate": 3.394039735099338e-06,
|
|
"loss": 1.6528,
|
|
"mean_token_accuracy": 0.6544591426849365,
|
|
"num_tokens": 461798029.0,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"entropy": 1.56015625,
|
|
"epoch": 0.6847068470684707,
|
|
"grad_norm": 0.3357950021824975,
|
|
"learning_rate": 3.3905171199098213e-06,
|
|
"loss": 1.5726,
|
|
"mean_token_accuracy": 0.6670619726181031,
|
|
"num_tokens": 462707021.0,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"entropy": 1.60078125,
|
|
"epoch": 0.6860735274019407,
|
|
"grad_norm": 0.15014604154206565,
|
|
"learning_rate": 3.3869945047203044e-06,
|
|
"loss": 1.6095,
|
|
"mean_token_accuracy": 0.6602976977825165,
|
|
"num_tokens": 463652610.0,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"entropy": 1.68984375,
|
|
"epoch": 0.6874402077354107,
|
|
"grad_norm": 0.23087675365342303,
|
|
"learning_rate": 3.3834718895307876e-06,
|
|
"loss": 1.6761,
|
|
"mean_token_accuracy": 0.6523682773113251,
|
|
"num_tokens": 464558445.0,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"entropy": 1.6421875,
|
|
"epoch": 0.6888068880688807,
|
|
"grad_norm": 0.21499535824675625,
|
|
"learning_rate": 3.3799492743412715e-06,
|
|
"loss": 1.6524,
|
|
"mean_token_accuracy": 0.6575279772281647,
|
|
"num_tokens": 465486955.0,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"entropy": 1.584375,
|
|
"epoch": 0.6901735684023507,
|
|
"grad_norm": 0.19504225486539917,
|
|
"learning_rate": 3.3764266591517547e-06,
|
|
"loss": 1.5921,
|
|
"mean_token_accuracy": 0.6649239003658295,
|
|
"num_tokens": 466423234.0,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"entropy": 1.6234375,
|
|
"epoch": 0.6915402487358207,
|
|
"grad_norm": 0.20905929936231046,
|
|
"learning_rate": 3.3729040439622378e-06,
|
|
"loss": 1.6226,
|
|
"mean_token_accuracy": 0.658042597770691,
|
|
"num_tokens": 467353745.0,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"entropy": 1.6984375,
|
|
"epoch": 0.6929069290692907,
|
|
"grad_norm": 0.20165771680320302,
|
|
"learning_rate": 3.3693814287727213e-06,
|
|
"loss": 1.7056,
|
|
"mean_token_accuracy": 0.6477044939994812,
|
|
"num_tokens": 468250412.0,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"entropy": 1.6796875,
|
|
"epoch": 0.6942736094027607,
|
|
"grad_norm": 0.22182760720758687,
|
|
"learning_rate": 3.3658588135832044e-06,
|
|
"loss": 1.6758,
|
|
"mean_token_accuracy": 0.652918666601181,
|
|
"num_tokens": 469238775.0,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"entropy": 1.6140625,
|
|
"epoch": 0.6956402897362307,
|
|
"grad_norm": 0.1722018616442895,
|
|
"learning_rate": 3.3623361983936876e-06,
|
|
"loss": 1.6254,
|
|
"mean_token_accuracy": 0.6620585262775421,
|
|
"num_tokens": 470233087.0,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"entropy": 1.6671875,
|
|
"epoch": 0.6970069700697007,
|
|
"grad_norm": 0.1623241106091593,
|
|
"learning_rate": 3.358813583204171e-06,
|
|
"loss": 1.6715,
|
|
"mean_token_accuracy": 0.6535182356834411,
|
|
"num_tokens": 471210093.0,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"entropy": 1.74296875,
|
|
"epoch": 0.6983736504031707,
|
|
"grad_norm": 0.31393373051933227,
|
|
"learning_rate": 3.3552909680146547e-06,
|
|
"loss": 1.7416,
|
|
"mean_token_accuracy": 0.6426091432571411,
|
|
"num_tokens": 472133544.0,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"entropy": 1.59765625,
|
|
"epoch": 0.6997403307366407,
|
|
"grad_norm": 0.34504457312679865,
|
|
"learning_rate": 3.3517683528251378e-06,
|
|
"loss": 1.6017,
|
|
"mean_token_accuracy": 0.6639619708061218,
|
|
"num_tokens": 473063980.0,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"entropy": 1.621875,
|
|
"epoch": 0.7011070110701108,
|
|
"grad_norm": 0.18332422006763222,
|
|
"learning_rate": 3.348245737635621e-06,
|
|
"loss": 1.6124,
|
|
"mean_token_accuracy": 0.6615500807762146,
|
|
"num_tokens": 474009544.0,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"entropy": 1.67421875,
|
|
"epoch": 0.7024736914035807,
|
|
"grad_norm": 0.16132679700200236,
|
|
"learning_rate": 3.344723122446104e-06,
|
|
"loss": 1.6619,
|
|
"mean_token_accuracy": 0.6533381044864655,
|
|
"num_tokens": 474900166.0,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"entropy": 1.67578125,
|
|
"epoch": 0.7038403717370507,
|
|
"grad_norm": 0.2577280052224606,
|
|
"learning_rate": 3.341200507256587e-06,
|
|
"loss": 1.6807,
|
|
"mean_token_accuracy": 0.6547457754611969,
|
|
"num_tokens": 475818307.0,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"entropy": 1.70625,
|
|
"epoch": 0.7052070520705207,
|
|
"grad_norm": 0.1843746594521599,
|
|
"learning_rate": 3.337677892067071e-06,
|
|
"loss": 1.7053,
|
|
"mean_token_accuracy": 0.648986566066742,
|
|
"num_tokens": 476726028.0,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"entropy": 1.69921875,
|
|
"epoch": 0.7065737324039907,
|
|
"grad_norm": 0.22206719095256103,
|
|
"learning_rate": 3.3341552768775543e-06,
|
|
"loss": 1.6983,
|
|
"mean_token_accuracy": 0.6468342304229736,
|
|
"num_tokens": 477653405.0,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"entropy": 1.6609375,
|
|
"epoch": 0.7079404127374607,
|
|
"grad_norm": 0.2122206729485613,
|
|
"learning_rate": 3.3306326616880374e-06,
|
|
"loss": 1.6725,
|
|
"mean_token_accuracy": 0.653656417131424,
|
|
"num_tokens": 478579206.0,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"entropy": 1.72109375,
|
|
"epoch": 0.7093070930709308,
|
|
"grad_norm": 0.15935703103144105,
|
|
"learning_rate": 3.3271100464985205e-06,
|
|
"loss": 1.7165,
|
|
"mean_token_accuracy": 0.6467913150787353,
|
|
"num_tokens": 479517206.0,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"entropy": 1.61875,
|
|
"epoch": 0.7106737734044007,
|
|
"grad_norm": 0.21689286574839256,
|
|
"learning_rate": 3.323587431309004e-06,
|
|
"loss": 1.6217,
|
|
"mean_token_accuracy": 0.6626726806163787,
|
|
"num_tokens": 480480482.0,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"entropy": 1.70859375,
|
|
"epoch": 0.7120404537378707,
|
|
"grad_norm": 0.22045208667254365,
|
|
"learning_rate": 3.3200648161194876e-06,
|
|
"loss": 1.6909,
|
|
"mean_token_accuracy": 0.6484004259109497,
|
|
"num_tokens": 481412687.0,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"entropy": 1.61953125,
|
|
"epoch": 0.7134071340713407,
|
|
"grad_norm": 0.2597179529891814,
|
|
"learning_rate": 3.3165422009299707e-06,
|
|
"loss": 1.6259,
|
|
"mean_token_accuracy": 0.659461772441864,
|
|
"num_tokens": 482340800.0,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"entropy": 1.6484375,
|
|
"epoch": 0.7147738144048107,
|
|
"grad_norm": 0.20227441796276022,
|
|
"learning_rate": 3.313019585740454e-06,
|
|
"loss": 1.6508,
|
|
"mean_token_accuracy": 0.6555399298667908,
|
|
"num_tokens": 483262961.0,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"entropy": 1.715625,
|
|
"epoch": 0.7161404947382807,
|
|
"grad_norm": 0.18657960178600633,
|
|
"learning_rate": 3.3094969705509374e-06,
|
|
"loss": 1.7192,
|
|
"mean_token_accuracy": 0.645880526304245,
|
|
"num_tokens": 484188351.0,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"entropy": 1.61015625,
|
|
"epoch": 0.7175071750717508,
|
|
"grad_norm": 0.21855899856605412,
|
|
"learning_rate": 3.3059743553614205e-06,
|
|
"loss": 1.6259,
|
|
"mean_token_accuracy": 0.6579080641269683,
|
|
"num_tokens": 485098871.0,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"entropy": 1.62734375,
|
|
"epoch": 0.7188738554052208,
|
|
"grad_norm": 0.2994050915517825,
|
|
"learning_rate": 3.3024517401719036e-06,
|
|
"loss": 1.6242,
|
|
"mean_token_accuracy": 0.6588131427764893,
|
|
"num_tokens": 486001297.0,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"entropy": 1.62109375,
|
|
"epoch": 0.7202405357386907,
|
|
"grad_norm": 0.2595715275184205,
|
|
"learning_rate": 3.298929124982387e-06,
|
|
"loss": 1.6423,
|
|
"mean_token_accuracy": 0.6601145148277283,
|
|
"num_tokens": 486936941.0,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"entropy": 1.6796875,
|
|
"epoch": 0.7216072160721607,
|
|
"grad_norm": 0.22004065819572022,
|
|
"learning_rate": 3.2954065097928707e-06,
|
|
"loss": 1.7019,
|
|
"mean_token_accuracy": 0.6508915781974792,
|
|
"num_tokens": 487881903.0,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"entropy": 1.65234375,
|
|
"epoch": 0.7229738964056307,
|
|
"grad_norm": 0.18930498007376284,
|
|
"learning_rate": 3.291883894603354e-06,
|
|
"loss": 1.6563,
|
|
"mean_token_accuracy": 0.6583892285823822,
|
|
"num_tokens": 488802929.0,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"entropy": 1.57734375,
|
|
"epoch": 0.7243405767391007,
|
|
"grad_norm": 0.21704102980878215,
|
|
"learning_rate": 3.288361279413837e-06,
|
|
"loss": 1.5896,
|
|
"mean_token_accuracy": 0.6637836515903472,
|
|
"num_tokens": 489715698.0,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"entropy": 1.6140625,
|
|
"epoch": 0.7257072570725708,
|
|
"grad_norm": 0.3014382270994431,
|
|
"learning_rate": 3.28483866422432e-06,
|
|
"loss": 1.619,
|
|
"mean_token_accuracy": 0.6630760729312897,
|
|
"num_tokens": 490628664.0,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"entropy": 1.71796875,
|
|
"epoch": 0.7270739374060408,
|
|
"grad_norm": 0.21795699064197996,
|
|
"learning_rate": 3.281316049034804e-06,
|
|
"loss": 1.7397,
|
|
"mean_token_accuracy": 0.6391644954681397,
|
|
"num_tokens": 491605350.0,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"entropy": 1.665625,
|
|
"epoch": 0.7284406177395107,
|
|
"grad_norm": 0.35695925410086926,
|
|
"learning_rate": 3.277793433845287e-06,
|
|
"loss": 1.6586,
|
|
"mean_token_accuracy": 0.6512080729007721,
|
|
"num_tokens": 492532412.0,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"entropy": 1.5921875,
|
|
"epoch": 0.7298072980729807,
|
|
"grad_norm": 0.2691660597772992,
|
|
"learning_rate": 3.2742708186557703e-06,
|
|
"loss": 1.6058,
|
|
"mean_token_accuracy": 0.6651232659816741,
|
|
"num_tokens": 493480931.0,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"entropy": 1.68359375,
|
|
"epoch": 0.7311739784064507,
|
|
"grad_norm": 0.20607015836645884,
|
|
"learning_rate": 3.2707482034662534e-06,
|
|
"loss": 1.6874,
|
|
"mean_token_accuracy": 0.6484021723270417,
|
|
"num_tokens": 494410045.0,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"entropy": 1.6203125,
|
|
"epoch": 0.7325406587399207,
|
|
"grad_norm": 0.14992256926848663,
|
|
"learning_rate": 3.2672255882767366e-06,
|
|
"loss": 1.6178,
|
|
"mean_token_accuracy": 0.6640694677829743,
|
|
"num_tokens": 495342442.0,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"entropy": 1.64296875,
|
|
"epoch": 0.7339073390733908,
|
|
"grad_norm": 0.2295238811494555,
|
|
"learning_rate": 3.26370297308722e-06,
|
|
"loss": 1.6424,
|
|
"mean_token_accuracy": 0.6586408495903016,
|
|
"num_tokens": 496297308.0,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"entropy": 1.6328125,
|
|
"epoch": 0.7352740194068608,
|
|
"grad_norm": 0.2022824878347535,
|
|
"learning_rate": 3.2601803578977037e-06,
|
|
"loss": 1.6428,
|
|
"mean_token_accuracy": 0.6557801425457,
|
|
"num_tokens": 497217134.0,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"entropy": 1.575,
|
|
"epoch": 0.7366406997403308,
|
|
"grad_norm": 0.15048481296567529,
|
|
"learning_rate": 3.2566577427081868e-06,
|
|
"loss": 1.58,
|
|
"mean_token_accuracy": 0.6665792882442474,
|
|
"num_tokens": 498177355.0,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"entropy": 1.6109375,
|
|
"epoch": 0.7380073800738007,
|
|
"grad_norm": 0.19304499597060448,
|
|
"learning_rate": 3.25313512751867e-06,
|
|
"loss": 1.6292,
|
|
"mean_token_accuracy": 0.6598256707191468,
|
|
"num_tokens": 499082608.0,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"entropy": 1.68515625,
|
|
"epoch": 0.7393740604072707,
|
|
"grad_norm": 0.18659368782239766,
|
|
"learning_rate": 3.2496125123291535e-06,
|
|
"loss": 1.7024,
|
|
"mean_token_accuracy": 0.6461093962192536,
|
|
"num_tokens": 500003509.0,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"entropy": 1.59609375,
|
|
"epoch": 0.7407407407407407,
|
|
"grad_norm": 0.18497014785943747,
|
|
"learning_rate": 3.2460898971396366e-06,
|
|
"loss": 1.5969,
|
|
"mean_token_accuracy": 0.666548216342926,
|
|
"num_tokens": 500935377.0,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 0.7421074210742108,
|
|
"grad_norm": 0.18100454249163686,
|
|
"learning_rate": 3.24256728195012e-06,
|
|
"loss": 1.6429,
|
|
"mean_token_accuracy": 0.6578535795211792,
|
|
"num_tokens": 501845168.0,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"entropy": 1.62265625,
|
|
"epoch": 0.7434741014076808,
|
|
"grad_norm": 0.18825067184082417,
|
|
"learning_rate": 3.2390446667606037e-06,
|
|
"loss": 1.6415,
|
|
"mean_token_accuracy": 0.6588270485401153,
|
|
"num_tokens": 502800482.0,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"entropy": 1.63515625,
|
|
"epoch": 0.7448407817411508,
|
|
"grad_norm": 0.23765671913151534,
|
|
"learning_rate": 3.235522051571087e-06,
|
|
"loss": 1.6321,
|
|
"mean_token_accuracy": 0.6594701111316681,
|
|
"num_tokens": 503721018.0,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"entropy": 1.65390625,
|
|
"epoch": 0.7462074620746207,
|
|
"grad_norm": 0.19097417971296302,
|
|
"learning_rate": 3.23199943638157e-06,
|
|
"loss": 1.6598,
|
|
"mean_token_accuracy": 0.6547618091106415,
|
|
"num_tokens": 504604731.0,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"entropy": 1.64921875,
|
|
"epoch": 0.7475741424080907,
|
|
"grad_norm": 0.20213339102253905,
|
|
"learning_rate": 3.228476821192053e-06,
|
|
"loss": 1.6802,
|
|
"mean_token_accuracy": 0.6508907556533814,
|
|
"num_tokens": 505532774.0,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"entropy": 1.65078125,
|
|
"epoch": 0.7489408227415607,
|
|
"grad_norm": 0.25605754595817776,
|
|
"learning_rate": 3.224954206002536e-06,
|
|
"loss": 1.6686,
|
|
"mean_token_accuracy": 0.6527039110660553,
|
|
"num_tokens": 506489139.0,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"entropy": 1.62890625,
|
|
"epoch": 0.7503075030750308,
|
|
"grad_norm": 0.246705534398532,
|
|
"learning_rate": 3.22143159081302e-06,
|
|
"loss": 1.629,
|
|
"mean_token_accuracy": 0.6627198576927185,
|
|
"num_tokens": 507415937.0,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"entropy": 1.72890625,
|
|
"epoch": 0.7516741834085008,
|
|
"grad_norm": 0.23016096643317233,
|
|
"learning_rate": 3.2179089756235033e-06,
|
|
"loss": 1.7169,
|
|
"mean_token_accuracy": 0.6434590816497803,
|
|
"num_tokens": 508329740.0,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"entropy": 1.7046875,
|
|
"epoch": 0.7530408637419708,
|
|
"grad_norm": 0.16599283684816005,
|
|
"learning_rate": 3.2143863604339864e-06,
|
|
"loss": 1.7103,
|
|
"mean_token_accuracy": 0.6473401188850403,
|
|
"num_tokens": 509270978.0,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"entropy": 1.6546875,
|
|
"epoch": 0.7544075440754408,
|
|
"grad_norm": 0.18844675405403122,
|
|
"learning_rate": 3.2108637452444695e-06,
|
|
"loss": 1.6688,
|
|
"mean_token_accuracy": 0.6541977107524872,
|
|
"num_tokens": 510187255.0,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"entropy": 1.69765625,
|
|
"epoch": 0.7557742244089107,
|
|
"grad_norm": 0.2395331050456403,
|
|
"learning_rate": 3.2073411300549526e-06,
|
|
"loss": 1.6974,
|
|
"mean_token_accuracy": 0.6516661047935486,
|
|
"num_tokens": 511095184.0,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"entropy": 1.6046875,
|
|
"epoch": 0.7571409047423807,
|
|
"grad_norm": 0.16200111073098677,
|
|
"learning_rate": 3.2038185148654366e-06,
|
|
"loss": 1.6071,
|
|
"mean_token_accuracy": 0.6642232477664948,
|
|
"num_tokens": 512008907.0,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"entropy": 1.68046875,
|
|
"epoch": 0.7585075850758508,
|
|
"grad_norm": 0.22148641032364863,
|
|
"learning_rate": 3.2002958996759197e-06,
|
|
"loss": 1.6943,
|
|
"mean_token_accuracy": 0.6497397601604462,
|
|
"num_tokens": 512898088.0,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"entropy": 1.67265625,
|
|
"epoch": 0.7598742654093208,
|
|
"grad_norm": 0.2758011456634575,
|
|
"learning_rate": 3.196773284486403e-06,
|
|
"loss": 1.6841,
|
|
"mean_token_accuracy": 0.6523485720157624,
|
|
"num_tokens": 513824464.0,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"entropy": 1.71328125,
|
|
"epoch": 0.7612409457427908,
|
|
"grad_norm": 0.19521332254171903,
|
|
"learning_rate": 3.1932506692968864e-06,
|
|
"loss": 1.7071,
|
|
"mean_token_accuracy": 0.6473329901695252,
|
|
"num_tokens": 514761252.0,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 0.7626076260762608,
|
|
"grad_norm": 0.18964944167398232,
|
|
"learning_rate": 3.1897280541073695e-06,
|
|
"loss": 1.6522,
|
|
"mean_token_accuracy": 0.6544399201869965,
|
|
"num_tokens": 515703955.0,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"entropy": 1.6921875,
|
|
"epoch": 0.7639743064097307,
|
|
"grad_norm": 0.26397333088511404,
|
|
"learning_rate": 3.1862054389178526e-06,
|
|
"loss": 1.6986,
|
|
"mean_token_accuracy": 0.649600750207901,
|
|
"num_tokens": 516679923.0,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"entropy": 1.6203125,
|
|
"epoch": 0.7653409867432007,
|
|
"grad_norm": 0.19486577309957745,
|
|
"learning_rate": 3.182682823728336e-06,
|
|
"loss": 1.6241,
|
|
"mean_token_accuracy": 0.6591627061367035,
|
|
"num_tokens": 517614323.0,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"entropy": 1.60859375,
|
|
"epoch": 0.7667076670766708,
|
|
"grad_norm": 0.20030879449205866,
|
|
"learning_rate": 3.1791602085388197e-06,
|
|
"loss": 1.6134,
|
|
"mean_token_accuracy": 0.6621118068695069,
|
|
"num_tokens": 518549985.0,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"entropy": 1.67578125,
|
|
"epoch": 0.7680743474101408,
|
|
"grad_norm": 0.22097489531211548,
|
|
"learning_rate": 3.175637593349303e-06,
|
|
"loss": 1.689,
|
|
"mean_token_accuracy": 0.6504874885082245,
|
|
"num_tokens": 519458349.0,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"entropy": 1.68203125,
|
|
"epoch": 0.7694410277436108,
|
|
"grad_norm": 0.2085765297489567,
|
|
"learning_rate": 3.172114978159786e-06,
|
|
"loss": 1.7013,
|
|
"mean_token_accuracy": 0.6482382774353027,
|
|
"num_tokens": 520405581.0,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"entropy": 1.66875,
|
|
"epoch": 0.7708077080770808,
|
|
"grad_norm": 0.17459035794950536,
|
|
"learning_rate": 3.168592362970269e-06,
|
|
"loss": 1.689,
|
|
"mean_token_accuracy": 0.6529590129852295,
|
|
"num_tokens": 521293063.0,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"entropy": 1.65078125,
|
|
"epoch": 0.7721743884105507,
|
|
"grad_norm": 0.22046626598832647,
|
|
"learning_rate": 3.165069747780753e-06,
|
|
"loss": 1.649,
|
|
"mean_token_accuracy": 0.6573987245559693,
|
|
"num_tokens": 522187313.0,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"entropy": 1.69296875,
|
|
"epoch": 0.7735410687440207,
|
|
"grad_norm": 0.178711796483335,
|
|
"learning_rate": 3.161547132591236e-06,
|
|
"loss": 1.7113,
|
|
"mean_token_accuracy": 0.6467081308364868,
|
|
"num_tokens": 523133069.0,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"entropy": 1.67578125,
|
|
"epoch": 0.7749077490774908,
|
|
"grad_norm": 0.27975262080736324,
|
|
"learning_rate": 3.1580245174017193e-06,
|
|
"loss": 1.6897,
|
|
"mean_token_accuracy": 0.6494198918342591,
|
|
"num_tokens": 524072850.0,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"entropy": 1.67265625,
|
|
"epoch": 0.7762744294109608,
|
|
"grad_norm": 0.19433844116812984,
|
|
"learning_rate": 3.1545019022122025e-06,
|
|
"loss": 1.6844,
|
|
"mean_token_accuracy": 0.6526411235332489,
|
|
"num_tokens": 525002521.0,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"entropy": 1.71796875,
|
|
"epoch": 0.7776411097444308,
|
|
"grad_norm": 0.2822619154642704,
|
|
"learning_rate": 3.1509792870226856e-06,
|
|
"loss": 1.7365,
|
|
"mean_token_accuracy": 0.6418895840644836,
|
|
"num_tokens": 525933255.0,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"entropy": 1.60625,
|
|
"epoch": 0.7790077900779008,
|
|
"grad_norm": 0.26520024019323474,
|
|
"learning_rate": 3.147456671833169e-06,
|
|
"loss": 1.6229,
|
|
"mean_token_accuracy": 0.6603316068649292,
|
|
"num_tokens": 526844629.0,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"entropy": 1.6359375,
|
|
"epoch": 0.7803744704113708,
|
|
"grad_norm": 0.20385395387412902,
|
|
"learning_rate": 3.1439340566436527e-06,
|
|
"loss": 1.6269,
|
|
"mean_token_accuracy": 0.659045523405075,
|
|
"num_tokens": 527768450.0,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"entropy": 1.65625,
|
|
"epoch": 0.7817411507448407,
|
|
"grad_norm": 0.3087418496774897,
|
|
"learning_rate": 3.140411441454136e-06,
|
|
"loss": 1.6685,
|
|
"mean_token_accuracy": 0.6545985639095306,
|
|
"num_tokens": 528686390.0,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"entropy": 1.65390625,
|
|
"epoch": 0.7831078310783108,
|
|
"grad_norm": 0.2198632382584689,
|
|
"learning_rate": 3.136888826264619e-06,
|
|
"loss": 1.6581,
|
|
"mean_token_accuracy": 0.6533248662948609,
|
|
"num_tokens": 529607532.0,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"entropy": 1.63046875,
|
|
"epoch": 0.7844745114117808,
|
|
"grad_norm": 0.21631539452258344,
|
|
"learning_rate": 3.1333662110751025e-06,
|
|
"loss": 1.6426,
|
|
"mean_token_accuracy": 0.6559727191925049,
|
|
"num_tokens": 530507947.0,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"entropy": 1.578125,
|
|
"epoch": 0.7858411917452508,
|
|
"grad_norm": 0.17732669385822253,
|
|
"learning_rate": 3.1298435958855856e-06,
|
|
"loss": 1.5783,
|
|
"mean_token_accuracy": 0.6674174427986145,
|
|
"num_tokens": 531389343.0,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"entropy": 1.6796875,
|
|
"epoch": 0.7872078720787208,
|
|
"grad_norm": 0.2569031216421856,
|
|
"learning_rate": 3.126320980696069e-06,
|
|
"loss": 1.6853,
|
|
"mean_token_accuracy": 0.6513855457305908,
|
|
"num_tokens": 532289213.0,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"entropy": 1.66796875,
|
|
"epoch": 0.7885745524121908,
|
|
"grad_norm": 0.1600170247714624,
|
|
"learning_rate": 3.1227983655065523e-06,
|
|
"loss": 1.6645,
|
|
"mean_token_accuracy": 0.6525399804115295,
|
|
"num_tokens": 533193956.0,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"entropy": 1.565625,
|
|
"epoch": 0.7899412327456607,
|
|
"grad_norm": 0.20477823394996145,
|
|
"learning_rate": 3.119275750317036e-06,
|
|
"loss": 1.5658,
|
|
"mean_token_accuracy": 0.6663733720779419,
|
|
"num_tokens": 534115322.0,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"entropy": 1.7078125,
|
|
"epoch": 0.7913079130791308,
|
|
"grad_norm": 0.17940713431078514,
|
|
"learning_rate": 3.115753135127519e-06,
|
|
"loss": 1.7245,
|
|
"mean_token_accuracy": 0.6440808415412903,
|
|
"num_tokens": 535044829.0,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"entropy": 1.64140625,
|
|
"epoch": 0.7926745934126008,
|
|
"grad_norm": 0.23192023197896883,
|
|
"learning_rate": 3.112230519938002e-06,
|
|
"loss": 1.6316,
|
|
"mean_token_accuracy": 0.6597498834133149,
|
|
"num_tokens": 536000610.0,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"entropy": 1.65234375,
|
|
"epoch": 0.7940412737460708,
|
|
"grad_norm": 0.39280162052733514,
|
|
"learning_rate": 3.108707904748485e-06,
|
|
"loss": 1.6494,
|
|
"mean_token_accuracy": 0.6587268650531769,
|
|
"num_tokens": 536860607.0,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"entropy": 1.7078125,
|
|
"epoch": 0.7954079540795408,
|
|
"grad_norm": 0.26092386087941466,
|
|
"learning_rate": 3.105185289558969e-06,
|
|
"loss": 1.7457,
|
|
"mean_token_accuracy": 0.6449077665805817,
|
|
"num_tokens": 537829266.0,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"entropy": 1.66484375,
|
|
"epoch": 0.7967746344130108,
|
|
"grad_norm": 0.2885618724751559,
|
|
"learning_rate": 3.1016626743694523e-06,
|
|
"loss": 1.6628,
|
|
"mean_token_accuracy": 0.654271811246872,
|
|
"num_tokens": 538752893.0,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 0.7981413147464808,
|
|
"grad_norm": 0.23672867648695503,
|
|
"learning_rate": 3.0981400591799354e-06,
|
|
"loss": 1.647,
|
|
"mean_token_accuracy": 0.6566824376583099,
|
|
"num_tokens": 539729795.0,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"entropy": 1.625,
|
|
"epoch": 0.7995079950799509,
|
|
"grad_norm": 0.1613797826845674,
|
|
"learning_rate": 3.0946174439904185e-06,
|
|
"loss": 1.6272,
|
|
"mean_token_accuracy": 0.6613822162151337,
|
|
"num_tokens": 540671382.0,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"entropy": 1.58203125,
|
|
"epoch": 0.8008746754134208,
|
|
"grad_norm": 0.30974515811755415,
|
|
"learning_rate": 3.0910948288009016e-06,
|
|
"loss": 1.5818,
|
|
"mean_token_accuracy": 0.6679625809192657,
|
|
"num_tokens": 541572488.0,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"entropy": 1.6421875,
|
|
"epoch": 0.8022413557468908,
|
|
"grad_norm": 0.22859979455254179,
|
|
"learning_rate": 3.0875722136113856e-06,
|
|
"loss": 1.6495,
|
|
"mean_token_accuracy": 0.6546528935432434,
|
|
"num_tokens": 542488727.0,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"entropy": 1.65859375,
|
|
"epoch": 0.8036080360803608,
|
|
"grad_norm": 0.17733279329520643,
|
|
"learning_rate": 3.0840495984218687e-06,
|
|
"loss": 1.6435,
|
|
"mean_token_accuracy": 0.6588581144809723,
|
|
"num_tokens": 543432938.0,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"entropy": 1.6734375,
|
|
"epoch": 0.8049747164138308,
|
|
"grad_norm": 0.26405035502989244,
|
|
"learning_rate": 3.080526983232352e-06,
|
|
"loss": 1.6801,
|
|
"mean_token_accuracy": 0.6506463646888733,
|
|
"num_tokens": 544345146.0,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"entropy": 1.65,
|
|
"epoch": 0.8063413967473008,
|
|
"grad_norm": 0.2418645816039286,
|
|
"learning_rate": 3.0770043680428354e-06,
|
|
"loss": 1.658,
|
|
"mean_token_accuracy": 0.6524530947208405,
|
|
"num_tokens": 545276992.0,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"entropy": 1.61484375,
|
|
"epoch": 0.8077080770807709,
|
|
"grad_norm": 0.23340876516768194,
|
|
"learning_rate": 3.0734817528533185e-06,
|
|
"loss": 1.6311,
|
|
"mean_token_accuracy": 0.6618912816047668,
|
|
"num_tokens": 546182552.0,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"entropy": 1.6546875,
|
|
"epoch": 0.8090747574142408,
|
|
"grad_norm": 0.19979860071926556,
|
|
"learning_rate": 3.0699591376638017e-06,
|
|
"loss": 1.6613,
|
|
"mean_token_accuracy": 0.6549089789390564,
|
|
"num_tokens": 547099892.0,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"entropy": 1.5859375,
|
|
"epoch": 0.8104414377477108,
|
|
"grad_norm": 0.16943549240012648,
|
|
"learning_rate": 3.066436522474285e-06,
|
|
"loss": 1.5876,
|
|
"mean_token_accuracy": 0.6674472570419312,
|
|
"num_tokens": 548025247.0,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"entropy": 1.575,
|
|
"epoch": 0.8118081180811808,
|
|
"grad_norm": 0.18917023262172727,
|
|
"learning_rate": 3.0629139072847688e-06,
|
|
"loss": 1.5887,
|
|
"mean_token_accuracy": 0.6647202610969544,
|
|
"num_tokens": 548995794.0,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"entropy": 1.64609375,
|
|
"epoch": 0.8131747984146508,
|
|
"grad_norm": 0.2158613113475277,
|
|
"learning_rate": 3.059391292095252e-06,
|
|
"loss": 1.6459,
|
|
"mean_token_accuracy": 0.6537158131599426,
|
|
"num_tokens": 549898838.0,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"entropy": 1.6703125,
|
|
"epoch": 0.8145414787481208,
|
|
"grad_norm": 0.1761180246790986,
|
|
"learning_rate": 3.055868676905735e-06,
|
|
"loss": 1.694,
|
|
"mean_token_accuracy": 0.6509846031665802,
|
|
"num_tokens": 550874300.0,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"entropy": 1.62578125,
|
|
"epoch": 0.8159081590815909,
|
|
"grad_norm": 0.33232329064395527,
|
|
"learning_rate": 3.052346061716218e-06,
|
|
"loss": 1.6449,
|
|
"mean_token_accuracy": 0.6562651932239533,
|
|
"num_tokens": 551740344.0,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"entropy": 1.6703125,
|
|
"epoch": 0.8172748394150608,
|
|
"grad_norm": 0.24580627054010268,
|
|
"learning_rate": 3.048823446526702e-06,
|
|
"loss": 1.678,
|
|
"mean_token_accuracy": 0.6531108319759369,
|
|
"num_tokens": 552675987.0,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"entropy": 1.67734375,
|
|
"epoch": 0.8186415197485308,
|
|
"grad_norm": 0.224733308358228,
|
|
"learning_rate": 3.0453008313371852e-06,
|
|
"loss": 1.6823,
|
|
"mean_token_accuracy": 0.6511002957820893,
|
|
"num_tokens": 553645439.0,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"entropy": 1.5859375,
|
|
"epoch": 0.8200082000820008,
|
|
"grad_norm": 0.16162336045869505,
|
|
"learning_rate": 3.0417782161476683e-06,
|
|
"loss": 1.6089,
|
|
"mean_token_accuracy": 0.660329419374466,
|
|
"num_tokens": 554562360.0,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"entropy": 1.7203125,
|
|
"epoch": 0.8213748804154708,
|
|
"grad_norm": 0.19157612768852617,
|
|
"learning_rate": 3.0382556009581515e-06,
|
|
"loss": 1.7437,
|
|
"mean_token_accuracy": 0.6424703776836396,
|
|
"num_tokens": 555473906.0,
|
|
"step": 6010
|
|
},
|
|
{
|
|
"entropy": 1.6125,
|
|
"epoch": 0.8227415607489408,
|
|
"grad_norm": 0.22231981558762132,
|
|
"learning_rate": 3.0347329857686346e-06,
|
|
"loss": 1.5962,
|
|
"mean_token_accuracy": 0.6634117305278778,
|
|
"num_tokens": 556369135.0,
|
|
"step": 6020
|
|
},
|
|
{
|
|
"entropy": 1.603125,
|
|
"epoch": 0.8241082410824109,
|
|
"grad_norm": 0.2250031275773851,
|
|
"learning_rate": 3.031210370579118e-06,
|
|
"loss": 1.6055,
|
|
"mean_token_accuracy": 0.6665997207164764,
|
|
"num_tokens": 557299065.0,
|
|
"step": 6030
|
|
},
|
|
{
|
|
"entropy": 1.69609375,
|
|
"epoch": 0.8254749214158809,
|
|
"grad_norm": 0.20690871220773527,
|
|
"learning_rate": 3.0276877553896017e-06,
|
|
"loss": 1.6816,
|
|
"mean_token_accuracy": 0.6491065680980682,
|
|
"num_tokens": 558215659.0,
|
|
"step": 6040
|
|
},
|
|
{
|
|
"entropy": 1.634375,
|
|
"epoch": 0.8268416017493508,
|
|
"grad_norm": 0.3132880970016886,
|
|
"learning_rate": 3.024165140200085e-06,
|
|
"loss": 1.6386,
|
|
"mean_token_accuracy": 0.6578512132167816,
|
|
"num_tokens": 559131274.0,
|
|
"step": 6050
|
|
},
|
|
{
|
|
"entropy": 1.659375,
|
|
"epoch": 0.8282082820828208,
|
|
"grad_norm": 0.1736624691107361,
|
|
"learning_rate": 3.020642525010568e-06,
|
|
"loss": 1.685,
|
|
"mean_token_accuracy": 0.6518297553062439,
|
|
"num_tokens": 560041641.0,
|
|
"step": 6060
|
|
},
|
|
{
|
|
"entropy": 1.628125,
|
|
"epoch": 0.8295749624162908,
|
|
"grad_norm": 0.1959256902747601,
|
|
"learning_rate": 3.0171199098210515e-06,
|
|
"loss": 1.6408,
|
|
"mean_token_accuracy": 0.6586882293224334,
|
|
"num_tokens": 560940156.0,
|
|
"step": 6070
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 0.8309416427497608,
|
|
"grad_norm": 0.27352378038972536,
|
|
"learning_rate": 3.0135972946315346e-06,
|
|
"loss": 1.6436,
|
|
"mean_token_accuracy": 0.659467077255249,
|
|
"num_tokens": 561871654.0,
|
|
"step": 6080
|
|
},
|
|
{
|
|
"entropy": 1.6484375,
|
|
"epoch": 0.8323083230832309,
|
|
"grad_norm": 0.15647038941192723,
|
|
"learning_rate": 3.010074679442018e-06,
|
|
"loss": 1.6597,
|
|
"mean_token_accuracy": 0.6554698526859284,
|
|
"num_tokens": 562771917.0,
|
|
"step": 6090
|
|
},
|
|
{
|
|
"entropy": 1.6671875,
|
|
"epoch": 0.8336750034167009,
|
|
"grad_norm": 0.2193355632717742,
|
|
"learning_rate": 3.0065520642525013e-06,
|
|
"loss": 1.6816,
|
|
"mean_token_accuracy": 0.6533270239830017,
|
|
"num_tokens": 563679562.0,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"entropy": 1.7015625,
|
|
"epoch": 0.8350416837501708,
|
|
"grad_norm": 0.2779585457617374,
|
|
"learning_rate": 3.003029449062985e-06,
|
|
"loss": 1.7049,
|
|
"mean_token_accuracy": 0.6483532786369324,
|
|
"num_tokens": 564620968.0,
|
|
"step": 6110
|
|
},
|
|
{
|
|
"entropy": 1.65234375,
|
|
"epoch": 0.8364083640836408,
|
|
"grad_norm": 0.26376255089275324,
|
|
"learning_rate": 2.999506833873468e-06,
|
|
"loss": 1.6454,
|
|
"mean_token_accuracy": 0.6556403994560241,
|
|
"num_tokens": 565495071.0,
|
|
"step": 6120
|
|
},
|
|
{
|
|
"entropy": 1.68984375,
|
|
"epoch": 0.8377750444171108,
|
|
"grad_norm": 0.19420121258129153,
|
|
"learning_rate": 2.995984218683951e-06,
|
|
"loss": 1.6856,
|
|
"mean_token_accuracy": 0.6472577333450318,
|
|
"num_tokens": 566408104.0,
|
|
"step": 6130
|
|
},
|
|
{
|
|
"entropy": 1.6109375,
|
|
"epoch": 0.8391417247505808,
|
|
"grad_norm": 0.18462400743835097,
|
|
"learning_rate": 2.992461603494434e-06,
|
|
"loss": 1.62,
|
|
"mean_token_accuracy": 0.6582315564155579,
|
|
"num_tokens": 567352396.0,
|
|
"step": 6140
|
|
},
|
|
{
|
|
"entropy": 1.65,
|
|
"epoch": 0.8405084050840509,
|
|
"grad_norm": 0.17519016542174842,
|
|
"learning_rate": 2.988938988304918e-06,
|
|
"loss": 1.6463,
|
|
"mean_token_accuracy": 0.6561886072158813,
|
|
"num_tokens": 568297995.0,
|
|
"step": 6150
|
|
},
|
|
{
|
|
"entropy": 1.64375,
|
|
"epoch": 0.8418750854175209,
|
|
"grad_norm": 0.20393833336984066,
|
|
"learning_rate": 2.9854163731154013e-06,
|
|
"loss": 1.6355,
|
|
"mean_token_accuracy": 0.6607475638389587,
|
|
"num_tokens": 569208445.0,
|
|
"step": 6160
|
|
},
|
|
{
|
|
"entropy": 1.6265625,
|
|
"epoch": 0.8432417657509909,
|
|
"grad_norm": 0.3174880990861806,
|
|
"learning_rate": 2.9818937579258844e-06,
|
|
"loss": 1.6397,
|
|
"mean_token_accuracy": 0.6574244499206543,
|
|
"num_tokens": 570085233.0,
|
|
"step": 6170
|
|
},
|
|
{
|
|
"entropy": 1.621875,
|
|
"epoch": 0.8446084460844608,
|
|
"grad_norm": 0.2612923982992373,
|
|
"learning_rate": 2.9783711427363675e-06,
|
|
"loss": 1.6508,
|
|
"mean_token_accuracy": 0.6547590970993042,
|
|
"num_tokens": 571011008.0,
|
|
"step": 6180
|
|
},
|
|
{
|
|
"entropy": 1.65625,
|
|
"epoch": 0.8459751264179308,
|
|
"grad_norm": 0.20844142861240952,
|
|
"learning_rate": 2.9748485275468507e-06,
|
|
"loss": 1.6639,
|
|
"mean_token_accuracy": 0.6555649816989899,
|
|
"num_tokens": 571923781.0,
|
|
"step": 6190
|
|
},
|
|
{
|
|
"entropy": 1.6296875,
|
|
"epoch": 0.8473418067514008,
|
|
"grad_norm": 0.1940789618376814,
|
|
"learning_rate": 2.9713259123573346e-06,
|
|
"loss": 1.6378,
|
|
"mean_token_accuracy": 0.6573352694511414,
|
|
"num_tokens": 572801884.0,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"entropy": 1.61015625,
|
|
"epoch": 0.8487084870848709,
|
|
"grad_norm": 0.21811190893705598,
|
|
"learning_rate": 2.9678032971678177e-06,
|
|
"loss": 1.614,
|
|
"mean_token_accuracy": 0.6603352904319764,
|
|
"num_tokens": 573677678.0,
|
|
"step": 6210
|
|
},
|
|
{
|
|
"entropy": 1.65546875,
|
|
"epoch": 0.8500751674183409,
|
|
"grad_norm": 0.18587990665234483,
|
|
"learning_rate": 2.964280681978301e-06,
|
|
"loss": 1.6695,
|
|
"mean_token_accuracy": 0.6529141366481781,
|
|
"num_tokens": 574630729.0,
|
|
"step": 6220
|
|
},
|
|
{
|
|
"entropy": 1.61953125,
|
|
"epoch": 0.8514418477518109,
|
|
"grad_norm": 0.24019011655009928,
|
|
"learning_rate": 2.960758066788784e-06,
|
|
"loss": 1.6162,
|
|
"mean_token_accuracy": 0.6593936920166016,
|
|
"num_tokens": 575508305.0,
|
|
"step": 6230
|
|
},
|
|
{
|
|
"entropy": 1.68125,
|
|
"epoch": 0.8528085280852808,
|
|
"grad_norm": 0.2327638583866374,
|
|
"learning_rate": 2.9572354515992675e-06,
|
|
"loss": 1.682,
|
|
"mean_token_accuracy": 0.6486114501953125,
|
|
"num_tokens": 576396827.0,
|
|
"step": 6240
|
|
},
|
|
{
|
|
"entropy": 1.54765625,
|
|
"epoch": 0.8541752084187508,
|
|
"grad_norm": 0.2150820065335058,
|
|
"learning_rate": 2.9537128364097507e-06,
|
|
"loss": 1.5422,
|
|
"mean_token_accuracy": 0.6754782617092132,
|
|
"num_tokens": 577326302.0,
|
|
"step": 6250
|
|
},
|
|
{
|
|
"entropy": 1.64609375,
|
|
"epoch": 0.8555418887522208,
|
|
"grad_norm": 0.19839035164526123,
|
|
"learning_rate": 2.9501902212202342e-06,
|
|
"loss": 1.6512,
|
|
"mean_token_accuracy": 0.6560932517051696,
|
|
"num_tokens": 578288853.0,
|
|
"step": 6260
|
|
},
|
|
{
|
|
"entropy": 1.6328125,
|
|
"epoch": 0.8569085690856909,
|
|
"grad_norm": 0.2299176888329866,
|
|
"learning_rate": 2.9466676060307178e-06,
|
|
"loss": 1.6549,
|
|
"mean_token_accuracy": 0.6538411855697632,
|
|
"num_tokens": 579222491.0,
|
|
"step": 6270
|
|
},
|
|
{
|
|
"entropy": 1.6609375,
|
|
"epoch": 0.8582752494191609,
|
|
"grad_norm": 0.16086771698163546,
|
|
"learning_rate": 2.943144990841201e-06,
|
|
"loss": 1.6687,
|
|
"mean_token_accuracy": 0.6531603753566741,
|
|
"num_tokens": 580163477.0,
|
|
"step": 6280
|
|
},
|
|
{
|
|
"entropy": 1.634375,
|
|
"epoch": 0.8596419297526309,
|
|
"grad_norm": 0.18201058549001595,
|
|
"learning_rate": 2.939622375651684e-06,
|
|
"loss": 1.64,
|
|
"mean_token_accuracy": 0.6605691134929657,
|
|
"num_tokens": 581088929.0,
|
|
"step": 6290
|
|
},
|
|
{
|
|
"entropy": 1.66015625,
|
|
"epoch": 0.8610086100861009,
|
|
"grad_norm": 0.1768628541742815,
|
|
"learning_rate": 2.936099760462167e-06,
|
|
"loss": 1.6648,
|
|
"mean_token_accuracy": 0.6530806422233582,
|
|
"num_tokens": 582058943.0,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"entropy": 1.63046875,
|
|
"epoch": 0.8623752904195708,
|
|
"grad_norm": 0.1861385731788424,
|
|
"learning_rate": 2.932577145272651e-06,
|
|
"loss": 1.6421,
|
|
"mean_token_accuracy": 0.6592398345470428,
|
|
"num_tokens": 582956520.0,
|
|
"step": 6310
|
|
},
|
|
{
|
|
"entropy": 1.6640625,
|
|
"epoch": 0.8637419707530408,
|
|
"grad_norm": 0.2018530170855684,
|
|
"learning_rate": 2.9290545300831342e-06,
|
|
"loss": 1.6701,
|
|
"mean_token_accuracy": 0.6543229520320892,
|
|
"num_tokens": 583904961.0,
|
|
"step": 6320
|
|
},
|
|
{
|
|
"entropy": 1.68828125,
|
|
"epoch": 0.8651086510865109,
|
|
"grad_norm": 0.2576903432700064,
|
|
"learning_rate": 2.9255319148936174e-06,
|
|
"loss": 1.689,
|
|
"mean_token_accuracy": 0.6527678489685058,
|
|
"num_tokens": 584829274.0,
|
|
"step": 6330
|
|
},
|
|
{
|
|
"entropy": 1.5515625,
|
|
"epoch": 0.8664753314199809,
|
|
"grad_norm": 0.1770064975148639,
|
|
"learning_rate": 2.9220092997041005e-06,
|
|
"loss": 1.5411,
|
|
"mean_token_accuracy": 0.6758271217346191,
|
|
"num_tokens": 585768968.0,
"step": 6340
},
{
"entropy": 1.64921875,
"epoch": 0.8678420117534509,
"grad_norm": 0.1716264064134498,
"learning_rate": 2.9184866845145836e-06,
"loss": 1.6557,
"mean_token_accuracy": 0.6535046875476838,
"num_tokens": 586690235.0,
"step": 6350
},
{
"entropy": 1.6453125,
"epoch": 0.8692086920869209,
"grad_norm": 0.23819462327010477,
"learning_rate": 2.9149640693250667e-06,
"loss": 1.6553,
"mean_token_accuracy": 0.6557229161262512,
"num_tokens": 587646609.0,
"step": 6360
},
{
"entropy": 1.64375,
"epoch": 0.8705753724203908,
"grad_norm": 0.19752812648061135,
"learning_rate": 2.9114414541355507e-06,
"loss": 1.6567,
"mean_token_accuracy": 0.6554705619812011,
"num_tokens": 588551254.0,
"step": 6370
},
{
"entropy": 1.6578125,
"epoch": 0.8719420527538608,
"grad_norm": 0.19172469117978985,
"learning_rate": 2.907918838946034e-06,
"loss": 1.6522,
"mean_token_accuracy": 0.657931125164032,
"num_tokens": 589478223.0,
"step": 6380
},
{
"entropy": 1.65234375,
"epoch": 0.8733087330873309,
"grad_norm": 0.20977006865758682,
"learning_rate": 2.904396223756517e-06,
"loss": 1.6518,
"mean_token_accuracy": 0.6562906742095947,
"num_tokens": 590400258.0,
"step": 6390
},
{
"entropy": 1.68828125,
"epoch": 0.8746754134208009,
"grad_norm": 0.2154190357854447,
"learning_rate": 2.9008736085670005e-06,
"loss": 1.6954,
"mean_token_accuracy": 0.6523117065429688,
"num_tokens": 591320318.0,
"step": 6400
},
{
"entropy": 1.5828125,
"epoch": 0.8760420937542709,
"grad_norm": 0.2136423670499421,
"learning_rate": 2.8973509933774836e-06,
"loss": 1.5706,
"mean_token_accuracy": 0.6683948338031769,
|
|
"num_tokens": 592215893.0,
|
|
"step": 6410
|
|
},
|
|
{
|
|
"entropy": 1.6515625,
|
|
"epoch": 0.8774087740877409,
|
|
"grad_norm": 0.2012289946501684,
|
|
"learning_rate": 2.893828378187967e-06,
|
|
"loss": 1.663,
|
|
"mean_token_accuracy": 0.6542115330696106,
|
|
"num_tokens": 593158145.0,
|
|
"step": 6420
|
|
},
|
|
{
|
|
"entropy": 1.584375,
|
|
"epoch": 0.8787754544212109,
|
|
"grad_norm": 0.1955263764693573,
|
|
"learning_rate": 2.8903057629984503e-06,
|
|
"loss": 1.5915,
|
|
"mean_token_accuracy": 0.6643987476825715,
|
|
"num_tokens": 594059637.0,
|
|
"step": 6430
|
|
},
|
|
{
|
|
"entropy": 1.5765625,
|
|
"epoch": 0.8801421347546808,
|
|
"grad_norm": 0.3813669621447526,
|
|
"learning_rate": 2.886783147808934e-06,
|
|
"loss": 1.5766,
|
|
"mean_token_accuracy": 0.6689460217952728,
|
|
"num_tokens": 594999747.0,
|
|
"step": 6440
|
|
},
|
|
{
|
|
"entropy": 1.6484375,
|
|
"epoch": 0.8815088150881509,
|
|
"grad_norm": 0.16188770423360374,
|
|
"learning_rate": 2.883260532619417e-06,
|
|
"loss": 1.665,
|
|
"mean_token_accuracy": 0.6517450630664825,
|
|
"num_tokens": 595898176.0,
|
|
"step": 6450
|
|
},
|
|
{
|
|
"entropy": 1.64453125,
|
|
"epoch": 0.8828754954216209,
|
|
"grad_norm": 0.2303694892774742,
|
|
"learning_rate": 2.8797379174299e-06,
|
|
"loss": 1.6262,
|
|
"mean_token_accuracy": 0.6598759055137634,
|
|
"num_tokens": 596769839.0,
|
|
"step": 6460
|
|
},
|
|
{
|
|
"entropy": 1.6078125,
|
|
"epoch": 0.8842421757550909,
|
|
"grad_norm": 0.18610760477857732,
|
|
"learning_rate": 2.876215302240383e-06,
|
|
"loss": 1.614,
|
|
"mean_token_accuracy": 0.6659195959568024,
|
|
"num_tokens": 597679610.0,
|
|
"step": 6470
|
|
},
|
|
{
|
|
"entropy": 1.67109375,
|
|
"epoch": 0.8856088560885609,
|
|
"grad_norm": 0.23247914902869732,
|
|
"learning_rate": 2.872692687050867e-06,
|
|
"loss": 1.6661,
|
|
"mean_token_accuracy": 0.6540320217609406,
|
|
"num_tokens": 598637955.0,
|
|
"step": 6480
|
|
},
|
|
{
|
|
"entropy": 1.66328125,
|
|
"epoch": 0.8869755364220309,
|
|
"grad_norm": 0.20493250102041494,
|
|
"learning_rate": 2.8691700718613503e-06,
|
|
"loss": 1.6814,
|
|
"mean_token_accuracy": 0.6497736692428588,
|
|
"num_tokens": 599548219.0,
|
|
"step": 6490
|
|
},
|
|
{
|
|
"entropy": 1.62421875,
|
|
"epoch": 0.8883422167555008,
|
|
"grad_norm": 0.20506201661535015,
|
|
"learning_rate": 2.8656474566718334e-06,
|
|
"loss": 1.6331,
|
|
"mean_token_accuracy": 0.6567708909511566,
|
|
"num_tokens": 600502050.0,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"entropy": 1.6140625,
|
|
"epoch": 0.8897088970889709,
|
|
"grad_norm": 0.2291630337692197,
|
|
"learning_rate": 2.8621248414823165e-06,
|
|
"loss": 1.6181,
|
|
"mean_token_accuracy": 0.6608550429344178,
|
|
"num_tokens": 601426213.0,
|
|
"step": 6510
|
|
},
|
|
{
|
|
"entropy": 1.65,
|
|
"epoch": 0.8910755774224409,
|
|
"grad_norm": 0.23428996830340923,
|
|
"learning_rate": 2.8586022262927997e-06,
|
|
"loss": 1.6396,
|
|
"mean_token_accuracy": 0.657115364074707,
|
|
"num_tokens": 602278828.0,
|
|
"step": 6520
|
|
},
|
|
{
|
|
"entropy": 1.74296875,
|
|
"epoch": 0.8924422577559109,
|
|
"grad_norm": 0.15774768385428364,
|
|
"learning_rate": 2.8550796111032836e-06,
|
|
"loss": 1.7662,
|
|
"mean_token_accuracy": 0.6386707246303558,
|
|
"num_tokens": 603211600.0,
|
|
"step": 6530
|
|
},
|
|
{
|
|
"entropy": 1.62890625,
|
|
"epoch": 0.8938089380893809,
|
|
"grad_norm": 0.24256334045156544,
|
|
"learning_rate": 2.8515569959137668e-06,
|
|
"loss": 1.6604,
|
|
"mean_token_accuracy": 0.6539735674858094,
|
|
"num_tokens": 604125831.0,
|
|
"step": 6540
|
|
},
|
|
{
|
|
"entropy": 1.6296875,
|
|
"epoch": 0.8951756184228509,
|
|
"grad_norm": 0.35819918825513747,
|
|
"learning_rate": 2.84803438072425e-06,
|
|
"loss": 1.6383,
|
|
"mean_token_accuracy": 0.6611362516880035,
|
|
"num_tokens": 605043581.0,
|
|
"step": 6550
|
|
},
|
|
{
|
|
"entropy": 1.62109375,
|
|
"epoch": 0.8965422987563209,
|
|
"grad_norm": 0.17326821949490717,
|
|
"learning_rate": 2.844511765534733e-06,
|
|
"loss": 1.6314,
|
|
"mean_token_accuracy": 0.659745067358017,
|
|
"num_tokens": 605953064.0,
|
|
"step": 6560
|
|
},
|
|
{
|
|
"entropy": 1.59375,
|
|
"epoch": 0.897908979089791,
|
|
"grad_norm": 0.239737969075233,
|
|
"learning_rate": 2.8409891503452166e-06,
|
|
"loss": 1.5889,
|
|
"mean_token_accuracy": 0.6664168655872345,
|
|
"num_tokens": 606893731.0,
|
|
"step": 6570
|
|
},
|
|
{
|
|
"entropy": 1.62734375,
|
|
"epoch": 0.8992756594232609,
|
|
"grad_norm": 0.18654927310807834,
|
|
"learning_rate": 2.8374665351556997e-06,
|
|
"loss": 1.6379,
|
|
"mean_token_accuracy": 0.6569889724254608,
|
|
"num_tokens": 607790775.0,
|
|
"step": 6580
|
|
},
|
|
{
|
|
"entropy": 1.6171875,
|
|
"epoch": 0.9006423397567309,
|
|
"grad_norm": 0.17372054454657573,
|
|
"learning_rate": 2.8339439199661832e-06,
|
|
"loss": 1.6145,
|
|
"mean_token_accuracy": 0.663248461484909,
|
|
"num_tokens": 608758974.0,
|
|
"step": 6590
|
|
},
|
|
{
|
|
"entropy": 1.6625,
|
|
"epoch": 0.9020090200902009,
|
|
"grad_norm": 0.22774234490633838,
|
|
"learning_rate": 2.8304213047766663e-06,
|
|
"loss": 1.6757,
|
|
"mean_token_accuracy": 0.6516222476959228,
|
|
"num_tokens": 609649758.0,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"entropy": 1.58046875,
|
|
"epoch": 0.9033757004236709,
|
|
"grad_norm": 0.2173914862648114,
|
|
"learning_rate": 2.82689868958715e-06,
|
|
"loss": 1.5959,
|
|
"mean_token_accuracy": 0.6650044441223144,
|
|
"num_tokens": 610585015.0,
|
|
"step": 6610
|
|
},
|
|
{
|
|
"entropy": 1.634375,
|
|
"epoch": 0.9047423807571409,
|
|
"grad_norm": 0.19354823025664894,
|
|
"learning_rate": 2.823376074397633e-06,
|
|
"loss": 1.632,
|
|
"mean_token_accuracy": 0.6581346392631531,
|
|
"num_tokens": 611507159.0,
|
|
"step": 6620
|
|
},
|
|
{
|
|
"entropy": 1.5984375,
|
|
"epoch": 0.906109061090611,
|
|
"grad_norm": 0.22216552528086983,
|
|
"learning_rate": 2.819853459208116e-06,
|
|
"loss": 1.5964,
|
|
"mean_token_accuracy": 0.6641504764556885,
|
|
"num_tokens": 612393740.0,
|
|
"step": 6630
|
|
},
|
|
{
|
|
"entropy": 1.68359375,
|
|
"epoch": 0.9074757414240809,
|
|
"grad_norm": 0.17790780922965987,
|
|
"learning_rate": 2.8163308440186e-06,
|
|
"loss": 1.7064,
|
|
"mean_token_accuracy": 0.6463433921337127,
|
|
"num_tokens": 613315051.0,
|
|
"step": 6640
|
|
},
|
|
{
|
|
"entropy": 1.61796875,
|
|
"epoch": 0.9088424217575509,
|
|
"grad_norm": 0.21964843396407838,
|
|
"learning_rate": 2.8128082288290832e-06,
|
|
"loss": 1.6235,
|
|
"mean_token_accuracy": 0.6612190067768097,
|
|
"num_tokens": 614223391.0,
|
|
"step": 6650
|
|
},
|
|
{
|
|
"entropy": 1.68671875,
|
|
"epoch": 0.9102091020910209,
|
|
"grad_norm": 0.2066574995047673,
|
|
"learning_rate": 2.8092856136395664e-06,
|
|
"loss": 1.6809,
|
|
"mean_token_accuracy": 0.650900810956955,
|
|
"num_tokens": 615153192.0,
|
|
"step": 6660
|
|
},
|
|
{
|
|
"entropy": 1.653125,
|
|
"epoch": 0.9115757824244909,
|
|
"grad_norm": 0.18466056835795833,
|
|
"learning_rate": 2.8057629984500495e-06,
|
|
"loss": 1.6687,
|
|
"mean_token_accuracy": 0.6487333655357361,
|
|
"num_tokens": 616121054.0,
|
|
"step": 6670
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 0.9129424627579609,
|
|
"grad_norm": 0.2651938472405402,
|
|
"learning_rate": 2.8022403832605326e-06,
|
|
"loss": 1.6376,
|
|
"mean_token_accuracy": 0.6596436381340027,
|
|
"num_tokens": 617060251.0,
|
|
"step": 6680
|
|
},
|
|
{
|
|
"entropy": 1.6328125,
|
|
"epoch": 0.914309143091431,
|
|
"grad_norm": 0.18355895599923344,
|
|
"learning_rate": 2.7987177680710157e-06,
|
|
"loss": 1.6426,
|
|
"mean_token_accuracy": 0.6603686392307282,
|
|
"num_tokens": 618006096.0,
|
|
"step": 6690
|
|
},
|
|
{
|
|
"entropy": 1.6578125,
|
|
"epoch": 0.915675823424901,
|
|
"grad_norm": 0.1900721664623405,
|
|
"learning_rate": 2.7951951528814997e-06,
|
|
"loss": 1.6663,
|
|
"mean_token_accuracy": 0.653769713640213,
|
|
"num_tokens": 618954624.0,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"entropy": 1.6296875,
|
|
"epoch": 0.9170425037583709,
|
|
"grad_norm": 0.17400738230148025,
|
|
"learning_rate": 2.791672537691983e-06,
|
|
"loss": 1.6578,
|
|
"mean_token_accuracy": 0.65167937874794,
|
|
"num_tokens": 619928406.0,
|
|
"step": 6710
|
|
},
|
|
{
|
|
"entropy": 1.61484375,
|
|
"epoch": 0.9184091840918409,
|
|
"grad_norm": 0.20518025049439798,
|
|
"learning_rate": 2.788149922502466e-06,
|
|
"loss": 1.616,
|
|
"mean_token_accuracy": 0.6638425767421723,
|
|
"num_tokens": 620837251.0,
|
|
"step": 6720
|
|
},
|
|
{
|
|
"entropy": 1.5890625,
|
|
"epoch": 0.9197758644253109,
|
|
"grad_norm": 0.24548698130145452,
|
|
"learning_rate": 2.784627307312949e-06,
|
|
"loss": 1.5963,
|
|
"mean_token_accuracy": 0.6668699204921722,
|
|
"num_tokens": 621786056.0,
|
|
"step": 6730
|
|
},
|
|
{
|
|
"entropy": 1.60234375,
|
|
"epoch": 0.9211425447587809,
|
|
"grad_norm": 0.17705570008580124,
|
|
"learning_rate": 2.7811046921234326e-06,
|
|
"loss": 1.5965,
|
|
"mean_token_accuracy": 0.6636301755905152,
|
|
"num_tokens": 622713332.0,
|
|
"step": 6740
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 0.922509225092251,
|
|
"grad_norm": 0.1731774604979562,
|
|
"learning_rate": 2.777582076933916e-06,
|
|
"loss": 1.6381,
|
|
"mean_token_accuracy": 0.6581340432167053,
|
|
"num_tokens": 623619628.0,
|
|
"step": 6750
|
|
},
|
|
{
|
|
"entropy": 1.64765625,
|
|
"epoch": 0.923875905425721,
|
|
"grad_norm": 0.23948274688359117,
|
|
"learning_rate": 2.7740594617443993e-06,
|
|
"loss": 1.6484,
|
|
"mean_token_accuracy": 0.6560145735740661,
|
|
"num_tokens": 624570451.0,
|
|
"step": 6760
|
|
},
|
|
{
|
|
"entropy": 1.6046875,
|
|
"epoch": 0.9252425857591909,
|
|
"grad_norm": 0.17861888460410114,
|
|
"learning_rate": 2.770536846554883e-06,
|
|
"loss": 1.6207,
|
|
"mean_token_accuracy": 0.6616679608821869,
|
|
"num_tokens": 625544172.0,
|
|
"step": 6770
|
|
},
|
|
{
|
|
"entropy": 1.65,
|
|
"epoch": 0.9266092660926609,
|
|
"grad_norm": 0.3366818394341423,
|
|
"learning_rate": 2.767014231365366e-06,
|
|
"loss": 1.6599,
|
|
"mean_token_accuracy": 0.6539095282554627,
|
|
"num_tokens": 626460482.0,
|
|
"step": 6780
|
|
},
|
|
{
|
|
"entropy": 1.665625,
|
|
"epoch": 0.9279759464261309,
|
|
"grad_norm": 0.1822278980778532,
|
|
"learning_rate": 2.763491616175849e-06,
|
|
"loss": 1.6638,
|
|
"mean_token_accuracy": 0.6553999245166778,
|
|
"num_tokens": 627430803.0,
|
|
"step": 6790
|
|
},
|
|
{
|
|
"entropy": 1.628125,
|
|
"epoch": 0.9293426267596009,
|
|
"grad_norm": 0.21791057191715887,
|
|
"learning_rate": 2.759969000986332e-06,
|
|
"loss": 1.6255,
|
|
"mean_token_accuracy": 0.6592534005641937,
|
|
"num_tokens": 628373504.0,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"entropy": 1.58046875,
|
|
"epoch": 0.930709307093071,
|
|
"grad_norm": 0.23302662806743038,
|
|
"learning_rate": 2.756446385796816e-06,
|
|
"loss": 1.594,
|
|
"mean_token_accuracy": 0.6652568876743317,
|
|
"num_tokens": 629282669.0,
|
|
"step": 6810
|
|
},
|
|
{
|
|
"entropy": 1.6140625,
|
|
"epoch": 0.932075987426541,
|
|
"grad_norm": 0.24339461735829124,
|
|
"learning_rate": 2.7529237706072993e-06,
|
|
"loss": 1.6167,
|
|
"mean_token_accuracy": 0.6601527810096741,
|
|
"num_tokens": 630222269.0,
|
|
"step": 6820
|
|
},
|
|
{
|
|
"entropy": 1.5859375,
|
|
"epoch": 0.933442667760011,
|
|
"grad_norm": 0.21038571996983407,
|
|
"learning_rate": 2.7494011554177824e-06,
|
|
"loss": 1.5867,
|
|
"mean_token_accuracy": 0.6682217001914978,
|
|
"num_tokens": 631143459.0,
|
|
"step": 6830
|
|
},
|
|
{
|
|
"entropy": 1.6828125,
|
|
"epoch": 0.9348093480934809,
|
|
"grad_norm": 0.1969611097055871,
|
|
"learning_rate": 2.7458785402282656e-06,
|
|
"loss": 1.6894,
|
|
"mean_token_accuracy": 0.6504644751548767,
|
|
"num_tokens": 632059336.0,
|
|
"step": 6840
|
|
},
|
|
{
|
|
"entropy": 1.68203125,
|
|
"epoch": 0.9361760284269509,
|
|
"grad_norm": 0.19760675322949578,
|
|
"learning_rate": 2.7423559250387487e-06,
|
|
"loss": 1.6824,
|
|
"mean_token_accuracy": 0.6504470944404602,
|
|
"num_tokens": 632985307.0,
|
|
"step": 6850
|
|
},
|
|
{
|
|
"entropy": 1.6078125,
|
|
"epoch": 0.9375427087604209,
|
|
"grad_norm": 0.17079401168836786,
|
|
"learning_rate": 2.7388333098492326e-06,
|
|
"loss": 1.5917,
|
|
"mean_token_accuracy": 0.6666249513626099,
|
|
"num_tokens": 633856753.0,
|
|
"step": 6860
|
|
},
|
|
{
|
|
"entropy": 1.65,
|
|
"epoch": 0.938909389093891,
|
|
"grad_norm": 0.16837061698670353,
|
|
"learning_rate": 2.7353106946597158e-06,
|
|
"loss": 1.6547,
|
|
"mean_token_accuracy": 0.654223483800888,
|
|
"num_tokens": 634824750.0,
|
|
"step": 6870
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 0.940276069427361,
|
|
"grad_norm": 0.15637703060202723,
|
|
"learning_rate": 2.731788079470199e-06,
|
|
"loss": 1.6585,
|
|
"mean_token_accuracy": 0.6526130437850952,
|
|
"num_tokens": 635705386.0,
|
|
"step": 6880
|
|
},
|
|
{
|
|
"entropy": 1.66796875,
|
|
"epoch": 0.941642749760831,
|
|
"grad_norm": 0.1694185679286746,
|
|
"learning_rate": 2.728265464280682e-06,
|
|
"loss": 1.6732,
|
|
"mean_token_accuracy": 0.6528739333152771,
|
|
"num_tokens": 636605146.0,
|
|
"step": 6890
|
|
},
|
|
{
|
|
"entropy": 1.6265625,
|
|
"epoch": 0.9430094300943009,
|
|
"grad_norm": 0.20033904222096532,
|
|
"learning_rate": 2.7247428490911656e-06,
|
|
"loss": 1.6337,
|
|
"mean_token_accuracy": 0.6586840450763702,
|
|
"num_tokens": 637490988.0,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"entropy": 1.634375,
|
|
"epoch": 0.9443761104277709,
|
|
"grad_norm": 0.20874786864646197,
|
|
"learning_rate": 2.7212202339016487e-06,
|
|
"loss": 1.6472,
|
|
"mean_token_accuracy": 0.6565926492214202,
|
|
"num_tokens": 638429663.0,
|
|
"step": 6910
|
|
},
|
|
{
|
|
"entropy": 1.60078125,
|
|
"epoch": 0.9457427907612409,
|
|
"grad_norm": 0.222217379014536,
|
|
"learning_rate": 2.7176976187121322e-06,
|
|
"loss": 1.6249,
|
|
"mean_token_accuracy": 0.660797506570816,
|
|
"num_tokens": 639419683.0,
|
|
"step": 6920
|
|
},
|
|
{
|
|
"entropy": 1.65390625,
|
|
"epoch": 0.947109471094711,
|
|
"grad_norm": 0.29176339456651695,
|
|
"learning_rate": 2.7141750035226154e-06,
|
|
"loss": 1.6505,
|
|
"mean_token_accuracy": 0.65588498711586,
|
|
"num_tokens": 640386636.0,
|
|
"step": 6930
|
|
},
|
|
{
|
|
"entropy": 1.628125,
|
|
"epoch": 0.948476151428181,
|
|
"grad_norm": 0.43354195077166097,
|
|
"learning_rate": 2.710652388333099e-06,
|
|
"loss": 1.6415,
|
|
"mean_token_accuracy": 0.6594763219356536,
|
|
"num_tokens": 641343156.0,
|
|
"step": 6940
|
|
},
|
|
{
|
|
"entropy": 1.68203125,
|
|
"epoch": 0.949842831761651,
|
|
"grad_norm": 0.19961027880696777,
|
|
"learning_rate": 2.707129773143582e-06,
|
|
"loss": 1.6911,
|
|
"mean_token_accuracy": 0.6482066214084625,
|
|
"num_tokens": 642301004.0,
|
|
"step": 6950
|
|
},
|
|
{
|
|
"entropy": 1.61640625,
|
|
"epoch": 0.951209512095121,
|
|
"grad_norm": 0.17957294074546098,
|
|
"learning_rate": 2.703607157954065e-06,
|
|
"loss": 1.6181,
|
|
"mean_token_accuracy": 0.6609061002731323,
|
|
"num_tokens": 643236325.0,
|
|
"step": 6960
|
|
},
|
|
{
|
|
"entropy": 1.66484375,
|
|
"epoch": 0.9525761924285909,
|
|
"grad_norm": 0.1555371626519259,
|
|
"learning_rate": 2.7000845427645483e-06,
|
|
"loss": 1.6686,
|
|
"mean_token_accuracy": 0.6523111402988434,
|
|
"num_tokens": 644125251.0,
|
|
"step": 6970
|
|
},
|
|
{
|
|
"entropy": 1.5640625,
|
|
"epoch": 0.9539428727620609,
|
|
"grad_norm": 0.20243554498819508,
|
|
"learning_rate": 2.6965619275750322e-06,
|
|
"loss": 1.5663,
|
|
"mean_token_accuracy": 0.6678133130073547,
|
|
"num_tokens": 645029955.0,
|
|
"step": 6980
|
|
},
|
|
{
|
|
"entropy": 1.72109375,
|
|
"epoch": 0.955309553095531,
|
|
"grad_norm": 0.24676006350063462,
|
|
"learning_rate": 2.6930393123855154e-06,
|
|
"loss": 1.7411,
|
|
"mean_token_accuracy": 0.6444582521915436,
|
|
"num_tokens": 645970940.0,
|
|
"step": 6990
|
|
},
|
|
{
|
|
"entropy": 1.69765625,
|
|
"epoch": 0.956676233429001,
|
|
"grad_norm": 0.22819983504687893,
|
|
"learning_rate": 2.6895166971959985e-06,
|
|
"loss": 1.719,
|
|
"mean_token_accuracy": 0.6458564221858978,
|
|
"num_tokens": 646871362.0,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"entropy": 1.60390625,
|
|
"epoch": 0.958042913762471,
|
|
"grad_norm": 0.24185904361200128,
|
|
"learning_rate": 2.6859940820064816e-06,
|
|
"loss": 1.5967,
|
|
"mean_token_accuracy": 0.6660885572433471,
|
|
"num_tokens": 647778656.0,
|
|
"step": 7010
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 0.959409594095941,
|
|
"grad_norm": 0.21071942149393277,
|
|
"learning_rate": 2.6824714668169647e-06,
|
|
"loss": 1.6362,
|
|
"mean_token_accuracy": 0.6576824009418487,
|
|
"num_tokens": 648694015.0,
|
|
"step": 7020
|
|
},
|
|
{
|
|
"entropy": 1.6546875,
|
|
"epoch": 0.9607762744294109,
|
|
"grad_norm": 0.2906672854306217,
|
|
"learning_rate": 2.6789488516274487e-06,
|
|
"loss": 1.6704,
|
|
"mean_token_accuracy": 0.6524830102920532,
|
|
"num_tokens": 649689442.0,
|
|
"step": 7030
|
|
},
|
|
{
|
|
"entropy": 1.62265625,
|
|
"epoch": 0.9621429547628809,
|
|
"grad_norm": 0.19342311182419272,
|
|
"learning_rate": 2.675426236437932e-06,
|
|
"loss": 1.6335,
|
|
"mean_token_accuracy": 0.6573789179325104,
|
|
"num_tokens": 650654576.0,
|
|
"step": 7040
|
|
},
|
|
{
|
|
"entropy": 1.6109375,
|
|
"epoch": 0.963509635096351,
|
|
"grad_norm": 0.27374836275432424,
|
|
"learning_rate": 2.671903621248415e-06,
|
|
"loss": 1.6164,
|
|
"mean_token_accuracy": 0.6589148819446564,
|
|
"num_tokens": 651547078.0,
|
|
"step": 7050
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 0.964876315429821,
|
|
"grad_norm": 0.21072301168139884,
|
|
"learning_rate": 2.668381006058898e-06,
|
|
"loss": 1.6471,
|
|
"mean_token_accuracy": 0.6560944020748138,
|
|
"num_tokens": 652444537.0,
|
|
"step": 7060
|
|
},
|
|
{
|
|
"entropy": 1.6734375,
|
|
"epoch": 0.966242995763291,
|
|
"grad_norm": 0.1680406203624872,
|
|
"learning_rate": 2.6648583908693816e-06,
|
|
"loss": 1.6646,
|
|
"mean_token_accuracy": 0.6539726555347443,
|
|
"num_tokens": 653366830.0,
|
|
"step": 7070
|
|
},
|
|
{
|
|
"entropy": 1.6875,
|
|
"epoch": 0.967609676096761,
|
|
"grad_norm": 0.17972033482929367,
|
|
"learning_rate": 2.6613357756798648e-06,
|
|
"loss": 1.7089,
|
|
"mean_token_accuracy": 0.6482871174812317,
|
|
"num_tokens": 654286916.0,
|
|
"step": 7080
|
|
},
|
|
{
|
|
"entropy": 1.61796875,
|
|
"epoch": 0.968976356430231,
|
|
"grad_norm": 0.17944279480092995,
|
|
"learning_rate": 2.6578131604903483e-06,
|
|
"loss": 1.6287,
|
|
"mean_token_accuracy": 0.6592101037502289,
|
|
"num_tokens": 655174099.0,
|
|
"step": 7090
|
|
},
|
|
{
|
|
"entropy": 1.6359375,
|
|
"epoch": 0.9703430367637009,
|
|
"grad_norm": 0.44599009134660306,
|
|
"learning_rate": 2.6542905453008314e-06,
|
|
"loss": 1.634,
|
|
"mean_token_accuracy": 0.6588185787200928,
|
|
"num_tokens": 656096964.0,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"entropy": 1.665625,
|
|
"epoch": 0.971709717097171,
|
|
"grad_norm": 0.24426217223714028,
|
|
"learning_rate": 2.650767930111315e-06,
|
|
"loss": 1.656,
|
|
"mean_token_accuracy": 0.6576848030090332,
|
|
"num_tokens": 657029319.0,
|
|
"step": 7110
|
|
},
|
|
{
|
|
"entropy": 1.6515625,
|
|
"epoch": 0.973076397430641,
|
|
"grad_norm": 0.17446121557606775,
|
|
"learning_rate": 2.647245314921798e-06,
|
|
"loss": 1.6636,
|
|
"mean_token_accuracy": 0.6550226449966431,
|
|
"num_tokens": 657989440.0,
|
|
"step": 7120
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 0.974443077764111,
|
|
"grad_norm": 0.1616979090847364,
|
|
"learning_rate": 2.6437226997322812e-06,
|
|
"loss": 1.6432,
|
|
"mean_token_accuracy": 0.6575157523155213,
|
|
"num_tokens": 658962019.0,
|
|
"step": 7130
|
|
},
|
|
{
|
|
"entropy": 1.6546875,
|
|
"epoch": 0.975809758097581,
|
|
"grad_norm": 0.16392305856538383,
|
|
"learning_rate": 2.640200084542765e-06,
|
|
"loss": 1.6665,
|
|
"mean_token_accuracy": 0.6534431755542756,
|
|
"num_tokens": 659898803.0,
|
|
"step": 7140
|
|
},
|
|
{
|
|
"entropy": 1.62421875,
|
|
"epoch": 0.977176438431051,
|
|
"grad_norm": 0.2039566217662769,
|
|
"learning_rate": 2.6366774693532483e-06,
|
|
"loss": 1.6271,
|
|
"mean_token_accuracy": 0.6597892999649048,
|
|
"num_tokens": 660760206.0,
|
|
"step": 7150
|
|
},
|
|
{
|
|
"entropy": 1.68671875,
|
|
"epoch": 0.9785431187645209,
|
|
"grad_norm": 0.3296413898603227,
|
|
"learning_rate": 2.6331548541637314e-06,
|
|
"loss": 1.702,
|
|
"mean_token_accuracy": 0.6479421973228454,
|
|
"num_tokens": 661678071.0,
|
|
"step": 7160
|
|
},
|
|
{
|
|
"entropy": 1.59296875,
|
|
"epoch": 0.979909799097991,
|
|
"grad_norm": 0.1599441535948199,
|
|
"learning_rate": 2.6296322389742146e-06,
|
|
"loss": 1.5974,
|
|
"mean_token_accuracy": 0.6641391038894653,
|
|
"num_tokens": 662613418.0,
|
|
"step": 7170
|
|
},
|
|
{
|
|
"entropy": 1.58125,
|
|
"epoch": 0.981276479431461,
|
|
"grad_norm": 0.3348191367129681,
|
|
"learning_rate": 2.6261096237846977e-06,
|
|
"loss": 1.5877,
|
|
"mean_token_accuracy": 0.6669567942619323,
|
|
"num_tokens": 663524536.0,
|
|
"step": 7180
|
|
},
|
|
{
|
|
"entropy": 1.59765625,
|
|
"epoch": 0.982643159764931,
|
|
"grad_norm": 0.19127543893335752,
|
|
"learning_rate": 2.622587008595181e-06,
|
|
"loss": 1.6112,
|
|
"mean_token_accuracy": 0.6611364901065826,
|
|
"num_tokens": 664426344.0,
|
|
"step": 7190
|
|
},
|
|
{
|
|
"entropy": 1.66875,
|
|
"epoch": 0.984009840098401,
|
|
"grad_norm": 0.23508823267601825,
|
|
"learning_rate": 2.6190643934056648e-06,
|
|
"loss": 1.6757,
|
|
"mean_token_accuracy": 0.6508909404277802,
|
|
"num_tokens": 665358418.0,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"entropy": 1.67109375,
|
|
"epoch": 0.985376520431871,
|
|
"grad_norm": 0.1939925985099776,
|
|
"learning_rate": 2.615541778216148e-06,
|
|
"loss": 1.6852,
|
|
"mean_token_accuracy": 0.6504729270935059,
|
|
"num_tokens": 666261131.0,
|
|
"step": 7210
|
|
},
|
|
{
|
|
"entropy": 1.63515625,
|
|
"epoch": 0.9867432007653409,
|
|
"grad_norm": 0.2061615419113158,
|
|
"learning_rate": 2.612019163026631e-06,
|
|
"loss": 1.6374,
|
|
"mean_token_accuracy": 0.6596892952919007,
|
|
"num_tokens": 667166482.0,
|
|
"step": 7220
|
|
},
|
|
{
|
|
"entropy": 1.68203125,
|
|
"epoch": 0.988109881098811,
|
|
"grad_norm": 0.22424686568127453,
|
|
"learning_rate": 2.6084965478371146e-06,
|
|
"loss": 1.6982,
|
|
"mean_token_accuracy": 0.648541408777237,
|
|
"num_tokens": 668093983.0,
|
|
"step": 7230
|
|
},
|
|
{
|
|
"entropy": 1.56328125,
|
|
"epoch": 0.989476561432281,
|
|
"grad_norm": 0.16879973880265728,
|
|
"learning_rate": 2.6049739326475977e-06,
|
|
"loss": 1.5868,
|
|
"mean_token_accuracy": 0.6651666283607482,
|
|
"num_tokens": 669024497.0,
|
|
"step": 7240
|
|
},
|
|
{
|
|
"entropy": 1.68125,
|
|
"epoch": 0.990843241765751,
|
|
"grad_norm": 0.8456804739912809,
|
|
"learning_rate": 2.6014513174580812e-06,
|
|
"loss": 1.6812,
|
|
"mean_token_accuracy": 0.6506978929042816,
|
|
"num_tokens": 669920204.0,
|
|
"step": 7250
|
|
},
|
|
{
|
|
"entropy": 1.58671875,
|
|
"epoch": 0.992209922099221,
|
|
"grad_norm": 0.1867544669553327,
|
|
"learning_rate": 2.5979287022685644e-06,
|
|
"loss": 1.6018,
|
|
"mean_token_accuracy": 0.6642862796783447,
|
|
"num_tokens": 670821720.0,
|
|
"step": 7260
|
|
},
|
|
{
|
|
"entropy": 1.575,
|
|
"epoch": 0.993576602432691,
|
|
"grad_norm": 0.20834011495609228,
|
|
"learning_rate": 2.594406087079048e-06,
|
|
"loss": 1.5784,
|
|
"mean_token_accuracy": 0.6659556865692139,
|
|
"num_tokens": 671732909.0,
|
|
"step": 7270
|
|
},
|
|
{
|
|
"entropy": 1.67421875,
|
|
"epoch": 0.994943282766161,
|
|
"grad_norm": 0.19699668454353225,
|
|
"learning_rate": 2.590883471889531e-06,
|
|
"loss": 1.6903,
|
|
"mean_token_accuracy": 0.6498040378093719,
|
|
"num_tokens": 672642730.0,
|
|
"step": 7280
|
|
},
|
|
{
|
|
"entropy": 1.6,
|
|
"epoch": 0.996309963099631,
|
|
"grad_norm": 0.3379398177326183,
|
|
"learning_rate": 2.587360856700014e-06,
|
|
"loss": 1.6004,
|
|
"mean_token_accuracy": 0.6626649081707001,
|
|
"num_tokens": 673545590.0,
|
|
"step": 7290
|
|
},
|
|
{
|
|
"entropy": 1.63125,
|
|
"epoch": 0.997676643433101,
|
|
"grad_norm": 0.20698794533342144,
|
|
"learning_rate": 2.5838382415104973e-06,
|
|
"loss": 1.6352,
|
|
"mean_token_accuracy": 0.6577263116836548,
|
|
"num_tokens": 674462440.0,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"entropy": 1.64921875,
|
|
"epoch": 0.999043323766571,
|
|
"grad_norm": 0.1804067589157295,
|
|
"learning_rate": 2.5803156263209813e-06,
|
|
"loss": 1.6655,
|
|
"mean_token_accuracy": 0.6504822373390198,
|
|
"num_tokens": 675361871.0,
|
|
"step": 7310
|
|
},
|
|
{
|
|
"entropy": 1.690625,
|
|
"epoch": 1.000410004100041,
|
|
"grad_norm": 0.21226339542003167,
|
|
"learning_rate": 2.5767930111314644e-06,
|
|
"loss": 1.6974,
|
|
"mean_token_accuracy": 0.6469481408596038,
|
|
"num_tokens": 676316526.0,
|
|
"step": 7320
|
|
},
|
|
{
|
|
"entropy": 1.6140625,
|
|
"epoch": 1.001776684433511,
|
|
"grad_norm": 0.25749475102600405,
|
|
"learning_rate": 2.5732703959419475e-06,
|
|
"loss": 1.6157,
|
|
"mean_token_accuracy": 0.6601819038391114,
|
|
"num_tokens": 677218271.0,
|
|
"step": 7330
|
|
},
|
|
{
|
|
"entropy": 1.6359375,
|
|
"epoch": 1.003143364766981,
|
|
"grad_norm": 0.20055764562432393,
|
|
"learning_rate": 2.5697477807524306e-06,
|
|
"loss": 1.6542,
|
|
"mean_token_accuracy": 0.6530854940414429,
|
|
"num_tokens": 678197731.0,
|
|
"step": 7340
|
|
},
|
|
{
|
|
"entropy": 1.58984375,
|
|
"epoch": 1.004510045100451,
|
|
"grad_norm": 0.20417742269555342,
|
|
"learning_rate": 2.5662251655629138e-06,
|
|
"loss": 1.5926,
|
|
"mean_token_accuracy": 0.6646928131580353,
|
|
"num_tokens": 679125755.0,
|
|
"step": 7350
|
|
},
|
|
{
|
|
"entropy": 1.69765625,
|
|
"epoch": 1.005876725433921,
|
|
"grad_norm": 0.1828209099181416,
|
|
"learning_rate": 2.5627025503733977e-06,
|
|
"loss": 1.7129,
|
|
"mean_token_accuracy": 0.6465388417243958,
|
|
"num_tokens": 680052412.0,
|
|
"step": 7360
|
|
},
|
|
{
|
|
"entropy": 1.65,
|
|
"epoch": 1.007243405767391,
|
|
"grad_norm": 0.22612942578278514,
|
|
"learning_rate": 2.559179935183881e-06,
|
|
"loss": 1.6558,
|
|
"mean_token_accuracy": 0.6564060270786285,
|
|
"num_tokens": 680960539.0,
|
|
"step": 7370
|
|
},
|
|
{
|
|
"entropy": 1.6875,
|
|
"epoch": 1.0086100861008611,
|
|
"grad_norm": 0.19632594220047833,
|
|
"learning_rate": 2.555657319994364e-06,
|
|
"loss": 1.7066,
|
|
"mean_token_accuracy": 0.6509465396404266,
|
|
"num_tokens": 681880832.0,
|
|
"step": 7380
|
|
},
|
|
{
|
|
"entropy": 1.65078125,
|
|
"epoch": 1.009976766434331,
|
|
"grad_norm": 0.19143949778940997,
|
|
"learning_rate": 2.552134704804847e-06,
|
|
"loss": 1.6619,
|
|
"mean_token_accuracy": 0.6514132142066955,
|
|
"num_tokens": 682835184.0,
|
|
"step": 7390
|
|
},
|
|
{
|
|
"entropy": 1.70859375,
|
|
"epoch": 1.011343446767801,
|
|
"grad_norm": 0.24931795959879316,
|
|
"learning_rate": 2.5486120896153306e-06,
|
|
"loss": 1.7367,
|
|
"mean_token_accuracy": 0.6399857819080352,
|
|
"num_tokens": 683737448.0,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"entropy": 1.6234375,
|
|
"epoch": 1.012710127101271,
|
|
"grad_norm": 0.2255215708079143,
|
|
"learning_rate": 2.5450894744258138e-06,
|
|
"loss": 1.6365,
|
|
"mean_token_accuracy": 0.6588810503482818,
|
|
"num_tokens": 684660851.0,
|
|
"step": 7410
|
|
},
|
|
{
|
|
"entropy": 1.59453125,
|
|
"epoch": 1.014076807434741,
|
|
"grad_norm": 0.20806330416977944,
|
|
"learning_rate": 2.5415668592362973e-06,
|
|
"loss": 1.5977,
|
|
"mean_token_accuracy": 0.6657264530658722,
|
|
"num_tokens": 685544752.0,
|
|
"step": 7420
|
|
},
|
|
{
|
|
"entropy": 1.6015625,
|
|
"epoch": 1.015443487768211,
|
|
"grad_norm": 0.20539792352470693,
|
|
"learning_rate": 2.5380442440467804e-06,
|
|
"loss": 1.6109,
|
|
"mean_token_accuracy": 0.6653925776481628,
|
|
"num_tokens": 686490494.0,
|
|
"step": 7430
|
|
},
|
|
{
|
|
"entropy": 1.64609375,
|
|
"epoch": 1.016810168101681,
|
|
"grad_norm": 0.18647663143007273,
|
|
"learning_rate": 2.534521628857264e-06,
|
|
"loss": 1.6641,
|
|
"mean_token_accuracy": 0.6522125780582428,
|
|
"num_tokens": 687406352.0,
|
|
"step": 7440
|
|
},
|
|
{
|
|
"entropy": 1.62734375,
|
|
"epoch": 1.018176848435151,
|
|
"grad_norm": 0.20508943249842,
|
|
"learning_rate": 2.530999013667747e-06,
|
|
"loss": 1.6223,
|
|
"mean_token_accuracy": 0.6612802803516388,
|
|
"num_tokens": 688305378.0,
|
|
"step": 7450
|
|
},
|
|
{
|
|
"entropy": 1.74921875,
|
|
"epoch": 1.019543528768621,
|
|
"grad_norm": 0.2138121885086559,
|
|
"learning_rate": 2.5274763984782302e-06,
|
|
"loss": 1.7521,
|
|
"mean_token_accuracy": 0.6424909770488739,
|
|
"num_tokens": 689230996.0,
|
|
"step": 7460
|
|
},
|
|
{
|
|
"entropy": 1.5828125,
|
|
"epoch": 1.020910209102091,
|
|
"grad_norm": 0.27209498031105656,
|
|
"learning_rate": 2.5239537832887138e-06,
|
|
"loss": 1.6005,
|
|
"mean_token_accuracy": 0.6650220930576325,
|
|
"num_tokens": 690189450.0,
|
|
"step": 7470
|
|
},
|
|
{
|
|
"entropy": 1.640625,
|
|
"epoch": 1.022276889435561,
|
|
"grad_norm": 0.21791311565232588,
|
|
"learning_rate": 2.5204311680991973e-06,
|
|
"loss": 1.6398,
|
|
"mean_token_accuracy": 0.6552492856979371,
|
|
"num_tokens": 691119626.0,
|
|
"step": 7480
|
|
},
|
|
{
|
|
"entropy": 1.5875,
|
|
"epoch": 1.023643569769031,
|
|
"grad_norm": 0.1419450731730608,
|
|
"learning_rate": 2.5169085529096804e-06,
|
|
"loss": 1.5841,
|
|
"mean_token_accuracy": 0.6666845202445983,
|
|
"num_tokens": 691976882.0,
|
|
"step": 7490
|
|
},
|
|
{
|
|
"entropy": 1.68125,
|
|
"epoch": 1.0250102501025011,
|
|
"grad_norm": 0.21864754469781722,
|
|
"learning_rate": 2.5133859377201636e-06,
|
|
"loss": 1.6946,
|
|
"mean_token_accuracy": 0.6449464380741119,
|
|
"num_tokens": 692887980.0,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"entropy": 1.69453125,
|
|
"epoch": 1.0263769304359711,
|
|
"grad_norm": 0.18462063597370276,
|
|
"learning_rate": 2.5098633225306467e-06,
|
|
"loss": 1.6964,
|
|
"mean_token_accuracy": 0.6481751084327698,
|
|
"num_tokens": 693820697.0,
|
|
"step": 7510
|
|
},
|
|
{
|
|
"entropy": 1.60625,
|
|
"epoch": 1.027743610769441,
|
|
"grad_norm": 0.16712419199185538,
|
|
"learning_rate": 2.50634070734113e-06,
|
|
"loss": 1.6242,
|
|
"mean_token_accuracy": 0.6577604591846467,
|
|
"num_tokens": 694755967.0,
|
|
"step": 7520
|
|
},
|
|
{
|
|
"entropy": 1.63671875,
|
|
"epoch": 1.029110291102911,
|
|
"grad_norm": 0.1884959000233314,
|
|
"learning_rate": 2.502818092151614e-06,
|
|
"loss": 1.6463,
|
|
"mean_token_accuracy": 0.6577674388885498,
|
|
"num_tokens": 695661949.0,
|
|
"step": 7530
|
|
},
|
|
{
|
|
"entropy": 1.63828125,
|
|
"epoch": 1.030476971436381,
|
|
"grad_norm": 0.20260100810554746,
|
|
"learning_rate": 2.499295476962097e-06,
|
|
"loss": 1.6298,
|
|
"mean_token_accuracy": 0.6581154763698578,
|
|
"num_tokens": 696589845.0,
|
|
"step": 7540
|
|
},
|
|
{
|
|
"entropy": 1.6546875,
|
|
"epoch": 1.031843651769851,
|
|
"grad_norm": 0.31511956986438044,
|
|
"learning_rate": 2.49577286177258e-06,
|
|
"loss": 1.6648,
|
|
"mean_token_accuracy": 0.6570824205875396,
|
|
"num_tokens": 697543953.0,
|
|
"step": 7550
|
|
},
|
|
{
|
|
"entropy": 1.65234375,
|
|
"epoch": 1.033210332103321,
|
|
"grad_norm": 0.3056817870835255,
|
|
"learning_rate": 2.492250246583063e-06,
|
|
"loss": 1.6688,
|
|
"mean_token_accuracy": 0.6546256840229034,
|
|
"num_tokens": 698454879.0,
|
|
"step": 7560
|
|
},
|
|
{
|
|
"entropy": 1.62578125,
|
|
"epoch": 1.034577012436791,
|
|
"grad_norm": 0.1864514837188523,
|
|
"learning_rate": 2.4887276313935467e-06,
|
|
"loss": 1.6341,
|
|
"mean_token_accuracy": 0.6596402227878571,
|
|
"num_tokens": 699361659.0,
|
|
"step": 7570
|
|
},
|
|
{
|
|
"entropy": 1.61796875,
|
|
"epoch": 1.035943692770261,
|
|
"grad_norm": 0.21025731128360117,
|
|
"learning_rate": 2.48520501620403e-06,
|
|
"loss": 1.6267,
|
|
"mean_token_accuracy": 0.6576940059661865,
|
|
"num_tokens": 700300121.0,
|
|
"step": 7580
|
|
},
|
|
{
|
|
"entropy": 1.63828125,
|
|
"epoch": 1.037310373103731,
|
|
"grad_norm": 0.2512406465112006,
|
|
"learning_rate": 2.4816824010145134e-06,
|
|
"loss": 1.6591,
|
|
"mean_token_accuracy": 0.6549495041370392,
|
|
"num_tokens": 701203446.0,
|
|
"step": 7590
|
|
},
|
|
{
|
|
"entropy": 1.63125,
|
|
"epoch": 1.038677053437201,
|
|
"grad_norm": 0.19844497287645324,
|
|
"learning_rate": 2.478159785824997e-06,
|
|
"loss": 1.6374,
|
|
"mean_token_accuracy": 0.6567413449287415,
|
|
"num_tokens": 702184689.0,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"entropy": 1.62578125,
|
|
"epoch": 1.040043733770671,
|
|
"grad_norm": 0.23519421771664964,
|
|
"learning_rate": 2.47463717063548e-06,
|
|
"loss": 1.6464,
|
|
"mean_token_accuracy": 0.6572311043739318,
|
|
"num_tokens": 703115365.0,
|
|
"step": 7610
|
|
},
|
|
{
|
|
"entropy": 1.71328125,
|
|
"epoch": 1.0414104141041411,
|
|
"grad_norm": 0.19332480222201426,
|
|
"learning_rate": 2.4711145554459636e-06,
|
|
"loss": 1.7112,
|
|
"mean_token_accuracy": 0.6459639668464661,
|
|
"num_tokens": 704066907.0,
|
|
"step": 7620
|
|
},
|
|
{
|
|
"entropy": 1.68984375,
|
|
"epoch": 1.0427770944376111,
|
|
"grad_norm": 0.16062124167771508,
|
|
"learning_rate": 2.4675919402564467e-06,
|
|
"loss": 1.6894,
|
|
"mean_token_accuracy": 0.6500767946243287,
|
|
"num_tokens": 704986542.0,
|
|
"step": 7630
|
|
},
|
|
{
|
|
"entropy": 1.646875,
|
|
"epoch": 1.0441437747710811,
|
|
"grad_norm": 0.2035714147726386,
|
|
"learning_rate": 2.46406932506693e-06,
|
|
"loss": 1.6568,
|
|
"mean_token_accuracy": 0.6541108667850495,
|
|
"num_tokens": 705943014.0,
|
|
"step": 7640
|
|
},
|
|
{
|
|
"entropy": 1.66953125,
|
|
"epoch": 1.045510455104551,
|
|
"grad_norm": 0.2565065281347031,
|
|
"learning_rate": 2.4605467098774134e-06,
|
|
"loss": 1.6756,
|
|
"mean_token_accuracy": 0.6523030996322632,
|
|
"num_tokens": 706905758.0,
|
|
"step": 7650
|
|
},
|
|
{
|
|
"entropy": 1.6171875,
|
|
"epoch": 1.046877135438021,
|
|
"grad_norm": 0.17848340127758455,
|
|
"learning_rate": 2.4570240946878965e-06,
|
|
"loss": 1.6304,
|
|
"mean_token_accuracy": 0.6589498400688172,
|
|
"num_tokens": 707820456.0,
|
|
"step": 7660
|
|
},
|
|
{
|
|
"entropy": 1.63046875,
|
|
"epoch": 1.048243815771491,
|
|
"grad_norm": 0.18655777966653678,
|
|
"learning_rate": 2.4535014794983796e-06,
|
|
"loss": 1.6592,
|
|
"mean_token_accuracy": 0.6561080276966095,
|
|
"num_tokens": 708741290.0,
|
|
"step": 7670
|
|
},
|
|
{
|
|
"entropy": 1.59453125,
|
|
"epoch": 1.049610496104961,
|
|
"grad_norm": 0.1671264016497512,
|
|
"learning_rate": 2.449978864308863e-06,
|
|
"loss": 1.5934,
|
|
"mean_token_accuracy": 0.6646679699420929,
|
|
"num_tokens": 709623622.0,
|
|
"step": 7680
|
|
},
|
|
{
|
|
"entropy": 1.63515625,
|
|
"epoch": 1.050977176438431,
|
|
"grad_norm": 0.1875566584414334,
|
|
"learning_rate": 2.4464562491193463e-06,
|
|
"loss": 1.6322,
|
|
"mean_token_accuracy": 0.6594183504581451,
|
|
"num_tokens": 710513948.0,
|
|
"step": 7690
|
|
},
|
|
{
|
|
"entropy": 1.66328125,
|
|
"epoch": 1.052343856771901,
|
|
"grad_norm": 0.2291709577030358,
|
|
"learning_rate": 2.4429336339298294e-06,
|
|
"loss": 1.683,
|
|
"mean_token_accuracy": 0.6526593327522278,
|
|
"num_tokens": 711462084.0,
|
|
"step": 7700
|
|
},
|
|
{
|
|
"entropy": 1.55390625,
|
|
"epoch": 1.053710537105371,
|
|
"grad_norm": 0.21505397460059225,
|
|
"learning_rate": 2.439411018740313e-06,
|
|
"loss": 1.5597,
|
|
"mean_token_accuracy": 0.6666822552680969,
|
|
"num_tokens": 712342773.0,
|
|
"step": 7710
|
|
},
|
|
{
|
|
"entropy": 1.61640625,
|
|
"epoch": 1.055077217438841,
|
|
"grad_norm": 0.2968572421965935,
|
|
"learning_rate": 2.435888403550796e-06,
|
|
"loss": 1.6284,
|
|
"mean_token_accuracy": 0.6590883433818817,
|
|
"num_tokens": 713243142.0,
|
|
"step": 7720
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 1.056443897772311,
|
|
"grad_norm": 0.16425373244723002,
|
|
"learning_rate": 2.4323657883612797e-06,
|
|
"loss": 1.6529,
|
|
"mean_token_accuracy": 0.6551034867763519,
|
|
"num_tokens": 714172083.0,
|
|
"step": 7730
|
|
},
|
|
{
|
|
"entropy": 1.63828125,
|
|
"epoch": 1.0578105781057812,
|
|
"grad_norm": 0.20082258497136513,
|
|
"learning_rate": 2.4288431731717628e-06,
|
|
"loss": 1.648,
|
|
"mean_token_accuracy": 0.6557169318199157,
|
|
"num_tokens": 715059044.0,
|
|
"step": 7740
|
|
},
|
|
{
|
|
"entropy": 1.64921875,
|
|
"epoch": 1.0591772584392511,
|
|
"grad_norm": 0.2916017401644145,
|
|
"learning_rate": 2.4253205579822463e-06,
|
|
"loss": 1.6572,
|
|
"mean_token_accuracy": 0.6539817869663238,
|
|
"num_tokens": 715954355.0,
|
|
"step": 7750
|
|
},
|
|
{
|
|
"entropy": 1.578125,
|
|
"epoch": 1.0605439387727211,
|
|
"grad_norm": 0.28620492138484144,
|
|
"learning_rate": 2.4217979427927294e-06,
|
|
"loss": 1.5802,
|
|
"mean_token_accuracy": 0.6663908898830414,
|
|
"num_tokens": 716853894.0,
|
|
"step": 7760
|
|
},
|
|
{
|
|
"entropy": 1.5765625,
|
|
"epoch": 1.0619106191061911,
|
|
"grad_norm": 0.2347492060229291,
|
|
"learning_rate": 2.418275327603213e-06,
|
|
"loss": 1.5855,
|
|
"mean_token_accuracy": 0.6653394937515259,
|
|
"num_tokens": 717756811.0,
|
|
"step": 7770
|
|
},
|
|
{
|
|
"entropy": 1.63125,
|
|
"epoch": 1.063277299439661,
|
|
"grad_norm": 0.2726962695048299,
|
|
"learning_rate": 2.414752712413696e-06,
|
|
"loss": 1.6164,
|
|
"mean_token_accuracy": 0.6617069840431213,
|
|
"num_tokens": 718678358.0,
|
|
"step": 7780
|
|
},
|
|
{
|
|
"entropy": 1.5515625,
|
|
"epoch": 1.064643979773131,
|
|
"grad_norm": 0.2007859073651048,
|
|
"learning_rate": 2.4112300972241797e-06,
|
|
"loss": 1.558,
|
|
"mean_token_accuracy": 0.6706412136554718,
|
|
"num_tokens": 719593264.0,
|
|
"step": 7790
|
|
},
|
|
{
|
|
"entropy": 1.63671875,
|
|
"epoch": 1.066010660106601,
|
|
"grad_norm": 0.25224750216226494,
|
|
"learning_rate": 2.4077074820346628e-06,
|
|
"loss": 1.6549,
|
|
"mean_token_accuracy": 0.6553803980350494,
|
|
"num_tokens": 720518604.0,
|
|
"step": 7800
|
|
},
|
|
{
|
|
"entropy": 1.6765625,
|
|
"epoch": 1.067377340440071,
|
|
"grad_norm": 0.20612607520131912,
|
|
"learning_rate": 2.404184866845146e-06,
|
|
"loss": 1.6818,
|
|
"mean_token_accuracy": 0.6499745666980743,
|
|
"num_tokens": 721466133.0,
|
|
"step": 7810
|
|
},
|
|
{
|
|
"entropy": 1.6484375,
|
|
"epoch": 1.068744020773541,
|
|
"grad_norm": 0.27618121789437233,
|
|
"learning_rate": 2.4006622516556295e-06,
|
|
"loss": 1.6454,
|
|
"mean_token_accuracy": 0.6543761730194092,
|
|
"num_tokens": 722391338.0,
|
|
"step": 7820
|
|
},
|
|
{
|
|
"entropy": 1.68046875,
|
|
"epoch": 1.070110701107011,
|
|
"grad_norm": 0.19310730260081907,
|
|
"learning_rate": 2.3971396364661126e-06,
|
|
"loss": 1.689,
|
|
"mean_token_accuracy": 0.6518762230873107,
|
|
"num_tokens": 723301003.0,
|
|
"step": 7830
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 1.071477381440481,
|
|
"grad_norm": 0.2669053830902099,
|
|
"learning_rate": 2.393617021276596e-06,
|
|
"loss": 1.6586,
|
|
"mean_token_accuracy": 0.6547171294689178,
|
|
"num_tokens": 724210007.0,
|
|
"step": 7840
|
|
},
|
|
{
|
|
"entropy": 1.5984375,
|
|
"epoch": 1.072844061773951,
|
|
"grad_norm": 0.18018127842832427,
|
|
"learning_rate": 2.3900944060870793e-06,
|
|
"loss": 1.6105,
|
|
"mean_token_accuracy": 0.6613023281097412,
|
|
"num_tokens": 725101691.0,
|
|
"step": 7850
|
|
},
|
|
{
|
|
"entropy": 1.63828125,
|
|
"epoch": 1.0742107421074212,
|
|
"grad_norm": 0.17535842630193627,
|
|
"learning_rate": 2.3865717908975624e-06,
|
|
"loss": 1.6515,
|
|
"mean_token_accuracy": 0.6533520102500916,
|
|
"num_tokens": 726033590.0,
|
|
"step": 7860
|
|
},
|
|
{
|
|
"entropy": 1.684375,
|
|
"epoch": 1.0755774224408912,
|
|
"grad_norm": 0.2291463321982412,
|
|
"learning_rate": 2.383049175708046e-06,
|
|
"loss": 1.6826,
|
|
"mean_token_accuracy": 0.6516656816005707,
|
|
"num_tokens": 726954429.0,
|
|
"step": 7870
|
|
},
|
|
{
|
|
"entropy": 1.63203125,
|
|
"epoch": 1.0769441027743611,
|
|
"grad_norm": 0.18030884207186895,
|
|
"learning_rate": 2.379526560518529e-06,
|
|
"loss": 1.6345,
|
|
"mean_token_accuracy": 0.6569553315639496,
|
|
"num_tokens": 727863664.0,
|
|
"step": 7880
|
|
},
|
|
{
|
|
"entropy": 1.6796875,
|
|
"epoch": 1.0783107831078311,
|
|
"grad_norm": 0.1907152189832732,
|
|
"learning_rate": 2.376003945329012e-06,
|
|
"loss": 1.6745,
|
|
"mean_token_accuracy": 0.6495916187763214,
|
|
"num_tokens": 728804706.0,
|
|
"step": 7890
|
|
},
|
|
{
|
|
"entropy": 1.51953125,
|
|
"epoch": 1.079677463441301,
|
|
"grad_norm": 0.1703186879742653,
|
|
"learning_rate": 2.3724813301394957e-06,
|
|
"loss": 1.532,
|
|
"mean_token_accuracy": 0.6745715022087098,
|
|
"num_tokens": 729699765.0,
|
|
"step": 7900
|
|
},
|
|
{
|
|
"entropy": 1.5671875,
|
|
"epoch": 1.081044143774771,
|
|
"grad_norm": 0.1429426717987482,
|
|
"learning_rate": 2.368958714949979e-06,
|
|
"loss": 1.5763,
|
|
"mean_token_accuracy": 0.66774622797966,
|
|
"num_tokens": 730589818.0,
|
|
"step": 7910
|
|
},
|
|
{
|
|
"entropy": 1.603125,
|
|
"epoch": 1.082410824108241,
|
|
"grad_norm": 0.25951852900816097,
|
|
"learning_rate": 2.3654360997604624e-06,
|
|
"loss": 1.5951,
|
|
"mean_token_accuracy": 0.6631380915641785,
|
|
"num_tokens": 731498359.0,
|
|
"step": 7920
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 1.083777504441711,
|
|
"grad_norm": 0.193809373937375,
|
|
"learning_rate": 2.3619134845709455e-06,
|
|
"loss": 1.642,
|
|
"mean_token_accuracy": 0.6584615588188172,
|
|
"num_tokens": 732449727.0,
|
|
"step": 7930
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 1.085144184775181,
|
|
"grad_norm": 0.18530129604197385,
|
|
"learning_rate": 2.358390869381429e-06,
|
|
"loss": 1.6429,
|
|
"mean_token_accuracy": 0.658336740732193,
|
|
"num_tokens": 733364961.0,
|
|
"step": 7940
|
|
},
|
|
{
|
|
"entropy": 1.628125,
|
|
"epoch": 1.086510865108651,
|
|
"grad_norm": 0.16678509691882717,
|
|
"learning_rate": 2.354868254191912e-06,
|
|
"loss": 1.6584,
|
|
"mean_token_accuracy": 0.654327118396759,
|
|
"num_tokens": 734314232.0,
|
|
"step": 7950
|
|
},
|
|
{
|
|
"entropy": 1.625,
|
|
"epoch": 1.087877545442121,
|
|
"grad_norm": 0.20655470634629683,
|
|
"learning_rate": 2.3513456390023957e-06,
|
|
"loss": 1.6268,
|
|
"mean_token_accuracy": 0.6595368683338165,
|
|
"num_tokens": 735251046.0,
|
|
"step": 7960
|
|
},
|
|
{
|
|
"entropy": 1.64921875,
|
|
"epoch": 1.089244225775591,
|
|
"grad_norm": 0.21318450681146028,
|
|
"learning_rate": 2.347823023812879e-06,
|
|
"loss": 1.6609,
|
|
"mean_token_accuracy": 0.6525816738605499,
|
|
"num_tokens": 736144845.0,
|
|
"step": 7970
|
|
},
|
|
{
|
|
"entropy": 1.653125,
|
|
"epoch": 1.090610906109061,
|
|
"grad_norm": 0.19380200669061567,
|
|
"learning_rate": 2.3443004086233624e-06,
|
|
"loss": 1.6477,
|
|
"mean_token_accuracy": 0.6550802648067474,
|
|
"num_tokens": 737055730.0,
|
|
"step": 7980
|
|
},
|
|
{
|
|
"entropy": 1.59296875,
|
|
"epoch": 1.0919775864425312,
|
|
"grad_norm": 0.2865842600440681,
|
|
"learning_rate": 2.3407777934338455e-06,
|
|
"loss": 1.6148,
|
|
"mean_token_accuracy": 0.6653242588043213,
|
|
"num_tokens": 737995900.0,
|
|
"step": 7990
|
|
},
|
|
{
|
|
"entropy": 1.60078125,
|
|
"epoch": 1.0933442667760012,
|
|
"grad_norm": 0.21500952038878274,
|
|
"learning_rate": 2.3372551782443286e-06,
|
|
"loss": 1.6018,
|
|
"mean_token_accuracy": 0.6635199129581452,
|
|
"num_tokens": 738909574.0,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"entropy": 1.66640625,
|
|
"epoch": 1.0947109471094711,
|
|
"grad_norm": 0.24408284662027988,
|
|
"learning_rate": 2.333732563054812e-06,
|
|
"loss": 1.6798,
|
|
"mean_token_accuracy": 0.6524914264678955,
|
|
"num_tokens": 739855510.0,
|
|
"step": 8010
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 1.0960776274429411,
|
|
"grad_norm": 0.1951257230458408,
|
|
"learning_rate": 2.3302099478652953e-06,
|
|
"loss": 1.6559,
|
|
"mean_token_accuracy": 0.656836473941803,
|
|
"num_tokens": 740788317.0,
|
|
"step": 8020
|
|
},
|
|
{
|
|
"entropy": 1.63125,
|
|
"epoch": 1.097444307776411,
|
|
"grad_norm": 0.19935909874283012,
|
|
"learning_rate": 2.3266873326757784e-06,
|
|
"loss": 1.6497,
|
|
"mean_token_accuracy": 0.6591231048107147,
|
|
"num_tokens": 741680344.0,
|
|
"step": 8030
|
|
},
|
|
{
|
|
"entropy": 1.74453125,
|
|
"epoch": 1.098810988109881,
|
|
"grad_norm": 0.17837246468904686,
|
|
"learning_rate": 2.323164717486262e-06,
|
|
"loss": 1.7545,
|
|
"mean_token_accuracy": 0.6371114075183868,
|
|
"num_tokens": 742618075.0,
|
|
"step": 8040
|
|
},
|
|
{
|
|
"entropy": 1.5609375,
|
|
"epoch": 1.100177668443351,
|
|
"grad_norm": 0.18387820921897086,
|
|
"learning_rate": 2.319642102296745e-06,
|
|
"loss": 1.58,
|
|
"mean_token_accuracy": 0.6685060262680054,
|
|
"num_tokens": 743543827.0,
|
|
"step": 8050
|
|
},
|
|
{
|
|
"entropy": 1.646875,
|
|
"epoch": 1.101544348776821,
|
|
"grad_norm": 0.25491009177633683,
|
|
"learning_rate": 2.3161194871072287e-06,
|
|
"loss": 1.6455,
|
|
"mean_token_accuracy": 0.6556438326835632,
|
|
"num_tokens": 744457000.0,
|
|
"step": 8060
|
|
},
|
|
{
|
|
"entropy": 1.58046875,
|
|
"epoch": 1.102911029110291,
|
|
"grad_norm": 0.21002317382682806,
|
|
"learning_rate": 2.3125968719177118e-06,
|
|
"loss": 1.573,
|
|
"mean_token_accuracy": 0.6686094224452972,
|
|
"num_tokens": 745361756.0,
|
|
"step": 8070
|
|
},
|
|
{
|
|
"entropy": 1.7171875,
|
|
"epoch": 1.104277709443761,
|
|
"grad_norm": 0.2530276460701327,
|
|
"learning_rate": 2.3090742567281953e-06,
|
|
"loss": 1.7196,
|
|
"mean_token_accuracy": 0.6439615845680237,
|
|
"num_tokens": 746310392.0,
|
|
"step": 8080
|
|
},
|
|
{
|
|
"entropy": 1.61015625,
|
|
"epoch": 1.105644389777231,
|
|
"grad_norm": 0.15361612892898774,
|
|
"learning_rate": 2.3055516415386785e-06,
|
|
"loss": 1.6115,
|
|
"mean_token_accuracy": 0.6629143178462982,
|
|
"num_tokens": 747221415.0,
|
|
"step": 8090
|
|
},
|
|
{
|
|
"entropy": 1.58828125,
|
|
"epoch": 1.1070110701107012,
|
|
"grad_norm": 0.2331292560730481,
|
|
"learning_rate": 2.302029026349162e-06,
|
|
"loss": 1.6232,
|
|
"mean_token_accuracy": 0.661793726682663,
|
|
"num_tokens": 748130713.0,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"entropy": 1.64453125,
|
|
"epoch": 1.1083777504441712,
|
|
"grad_norm": 0.31649420094759156,
|
|
"learning_rate": 2.298506411159645e-06,
|
|
"loss": 1.6594,
|
|
"mean_token_accuracy": 0.6544447243213654,
|
|
"num_tokens": 749073453.0,
|
|
"step": 8110
|
|
},
|
|
{
|
|
"entropy": 1.64765625,
|
|
"epoch": 1.1097444307776412,
|
|
"grad_norm": 0.19281557831952625,
|
|
"learning_rate": 2.2949837959701287e-06,
|
|
"loss": 1.6476,
|
|
"mean_token_accuracy": 0.6540527820587159,
|
|
"num_tokens": 750060066.0,
|
|
"step": 8120
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 1.1111111111111112,
|
|
"grad_norm": 0.24337950292917065,
|
|
"learning_rate": 2.291461180780612e-06,
|
|
"loss": 1.6494,
|
|
"mean_token_accuracy": 0.656864869594574,
|
|
"num_tokens": 751013703.0,
|
|
"step": 8130
|
|
},
|
|
{
|
|
"entropy": 1.60078125,
|
|
"epoch": 1.1124777914445811,
|
|
"grad_norm": 0.23393610288047129,
|
|
"learning_rate": 2.287938565591095e-06,
|
|
"loss": 1.6127,
|
|
"mean_token_accuracy": 0.6615233361721039,
|
|
"num_tokens": 751945585.0,
|
|
"step": 8140
|
|
},
|
|
{
|
|
"entropy": 1.58671875,
|
|
"epoch": 1.1138444717780511,
|
|
"grad_norm": 0.14680252447202227,
|
|
"learning_rate": 2.2844159504015785e-06,
|
|
"loss": 1.5941,
|
|
"mean_token_accuracy": 0.6673865795135498,
|
|
"num_tokens": 752899418.0,
|
|
"step": 8150
|
|
},
|
|
{
|
|
"entropy": 1.63515625,
|
|
"epoch": 1.115211152111521,
|
|
"grad_norm": 0.17054054260018378,
|
|
"learning_rate": 2.2808933352120616e-06,
|
|
"loss": 1.6438,
|
|
"mean_token_accuracy": 0.6595860242843627,
|
|
"num_tokens": 753770178.0,
|
|
"step": 8160
|
|
},
|
|
{
|
|
"entropy": 1.6515625,
|
|
"epoch": 1.116577832444991,
|
|
"grad_norm": 0.1498015103573574,
|
|
"learning_rate": 2.277370720022545e-06,
|
|
"loss": 1.6655,
|
|
"mean_token_accuracy": 0.6529469132423401,
|
|
"num_tokens": 754683701.0,
|
|
"step": 8170
|
|
},
|
|
{
|
|
"entropy": 1.63046875,
|
|
"epoch": 1.117944512778461,
|
|
"grad_norm": 0.25509968862218774,
|
|
"learning_rate": 2.2738481048330283e-06,
|
|
"loss": 1.6297,
|
|
"mean_token_accuracy": 0.657200288772583,
|
|
"num_tokens": 755630431.0,
|
|
"step": 8180
|
|
},
|
|
{
|
|
"entropy": 1.596875,
|
|
"epoch": 1.119311193111931,
|
|
"grad_norm": 0.20159506692335974,
|
|
"learning_rate": 2.2703254896435114e-06,
|
|
"loss": 1.6135,
|
|
"mean_token_accuracy": 0.6606966495513916,
|
|
"num_tokens": 756584640.0,
|
|
"step": 8190
|
|
},
|
|
{
|
|
"entropy": 1.67578125,
|
|
"epoch": 1.120677873445401,
|
|
"grad_norm": 0.17657717136438936,
|
|
"learning_rate": 2.266802874453995e-06,
|
|
"loss": 1.6867,
|
|
"mean_token_accuracy": 0.6492113471031189,
|
|
"num_tokens": 757525940.0,
|
|
"step": 8200
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 1.122044553778871,
|
|
"grad_norm": 0.18086507502487492,
|
|
"learning_rate": 2.263280259264478e-06,
|
|
"loss": 1.6362,
|
|
"mean_token_accuracy": 0.658352154493332,
|
|
"num_tokens": 758433501.0,
|
|
"step": 8210
|
|
},
|
|
{
|
|
"entropy": 1.5859375,
|
|
"epoch": 1.123411234112341,
|
|
"grad_norm": 0.19633479042389793,
|
|
"learning_rate": 2.259757644074961e-06,
|
|
"loss": 1.5732,
|
|
"mean_token_accuracy": 0.6674098610877991,
|
|
"num_tokens": 759342392.0,
|
|
"step": 8220
|
|
},
|
|
{
|
|
"entropy": 1.6328125,
|
|
"epoch": 1.1247779144458112,
|
|
"grad_norm": 0.3049412765064081,
|
|
"learning_rate": 2.2562350288854447e-06,
|
|
"loss": 1.6334,
|
|
"mean_token_accuracy": 0.6603392064571381,
|
|
"num_tokens": 760272512.0,
|
|
"step": 8230
|
|
},
|
|
{
|
|
"entropy": 1.5890625,
|
|
"epoch": 1.1261445947792812,
|
|
"grad_norm": 0.216921324205191,
|
|
"learning_rate": 2.252712413695928e-06,
|
|
"loss": 1.5944,
|
|
"mean_token_accuracy": 0.6632249832153321,
|
|
"num_tokens": 761157909.0,
|
|
"step": 8240
|
|
},
|
|
{
|
|
"entropy": 1.6453125,
|
|
"epoch": 1.1275112751127512,
|
|
"grad_norm": 0.1848702914227559,
|
|
"learning_rate": 2.2491897985064114e-06,
|
|
"loss": 1.6622,
|
|
"mean_token_accuracy": 0.6529818832874298,
|
|
"num_tokens": 762099596.0,
|
|
"step": 8250
|
|
},
|
|
{
|
|
"entropy": 1.59296875,
|
|
"epoch": 1.1288779554462212,
|
|
"grad_norm": 0.17516701090550288,
|
|
"learning_rate": 2.2456671833168945e-06,
|
|
"loss": 1.6071,
|
|
"mean_token_accuracy": 0.6639756560325623,
|
|
"num_tokens": 762988474.0,
|
|
"step": 8260
|
|
},
|
|
{
|
|
"entropy": 1.6203125,
|
|
"epoch": 1.1302446357796911,
|
|
"grad_norm": 0.21397903421207323,
|
|
"learning_rate": 2.242144568127378e-06,
|
|
"loss": 1.6257,
|
|
"mean_token_accuracy": 0.6603670120239258,
|
|
"num_tokens": 763880345.0,
|
|
"step": 8270
|
|
},
|
|
{
|
|
"entropy": 1.67265625,
|
|
"epoch": 1.1316113161131611,
|
|
"grad_norm": 0.17048244572736274,
|
|
"learning_rate": 2.238621952937861e-06,
|
|
"loss": 1.6864,
|
|
"mean_token_accuracy": 0.6513740241527557,
|
|
"num_tokens": 764834355.0,
|
|
"step": 8280
|
|
},
|
|
{
|
|
"entropy": 1.703125,
|
|
"epoch": 1.132977996446631,
|
|
"grad_norm": 0.20089374336424734,
|
|
"learning_rate": 2.2350993377483447e-06,
|
|
"loss": 1.6915,
|
|
"mean_token_accuracy": 0.650792908668518,
|
|
"num_tokens": 765749108.0,
|
|
"step": 8290
|
|
},
|
|
{
|
|
"entropy": 1.62109375,
|
|
"epoch": 1.134344676780101,
|
|
"grad_norm": 0.28137935633231004,
|
|
"learning_rate": 2.231576722558828e-06,
|
|
"loss": 1.6281,
|
|
"mean_token_accuracy": 0.6578247785568238,
|
|
"num_tokens": 766651854.0,
|
|
"step": 8300
|
|
},
|
|
{
|
|
"entropy": 1.6140625,
|
|
"epoch": 1.135711357113571,
|
|
"grad_norm": 0.1713603357380757,
|
|
"learning_rate": 2.2280541073693114e-06,
|
|
"loss": 1.6228,
|
|
"mean_token_accuracy": 0.6607555389404297,
|
|
"num_tokens": 767584061.0,
|
|
"step": 8310
|
|
},
|
|
{
|
|
"entropy": 1.65625,
|
|
"epoch": 1.137078037447041,
|
|
"grad_norm": 0.21890583506602737,
|
|
"learning_rate": 2.2245314921797945e-06,
|
|
"loss": 1.6773,
|
|
"mean_token_accuracy": 0.6505992114543915,
|
|
"num_tokens": 768517850.0,
|
|
"step": 8320
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 1.1384447177805113,
|
|
"grad_norm": 0.1719288829795588,
|
|
"learning_rate": 2.2210088769902777e-06,
|
|
"loss": 1.6468,
|
|
"mean_token_accuracy": 0.6561433732509613,
|
|
"num_tokens": 769465982.0,
|
|
"step": 8330
|
|
},
|
|
{
|
|
"entropy": 1.4921875,
|
|
"epoch": 1.1398113981139812,
|
|
"grad_norm": 0.250940778529367,
|
|
"learning_rate": 2.217486261800761e-06,
|
|
"loss": 1.4877,
|
|
"mean_token_accuracy": 0.6821200013160705,
|
|
"num_tokens": 770337830.0,
|
|
"step": 8340
|
|
},
|
|
{
|
|
"entropy": 1.63125,
|
|
"epoch": 1.1411780784474512,
|
|
"grad_norm": 0.17791603158207092,
|
|
"learning_rate": 2.2139636466112443e-06,
|
|
"loss": 1.643,
|
|
"mean_token_accuracy": 0.6560865640640259,
|
|
"num_tokens": 771296751.0,
|
|
"step": 8350
|
|
},
|
|
{
|
|
"entropy": 1.5921875,
|
|
"epoch": 1.1425447587809212,
|
|
"grad_norm": 0.20644804829898722,
|
|
"learning_rate": 2.2104410314217275e-06,
|
|
"loss": 1.5868,
|
|
"mean_token_accuracy": 0.6663864135742188,
|
|
"num_tokens": 772208695.0,
|
|
"step": 8360
|
|
},
|
|
{
|
|
"entropy": 1.7,
|
|
"epoch": 1.1439114391143912,
|
|
"grad_norm": 0.24173342287984498,
|
|
"learning_rate": 2.206918416232211e-06,
|
|
"loss": 1.71,
|
|
"mean_token_accuracy": 0.6462280213832855,
|
|
"num_tokens": 773154122.0,
|
|
"step": 8370
|
|
},
|
|
{
|
|
"entropy": 1.6875,
|
|
"epoch": 1.1452781194478612,
|
|
"grad_norm": 0.22155037037063258,
|
|
"learning_rate": 2.203395801042694e-06,
|
|
"loss": 1.6817,
|
|
"mean_token_accuracy": 0.6491577923297882,
|
|
"num_tokens": 774097011.0,
|
|
"step": 8380
|
|
},
|
|
{
|
|
"entropy": 1.60625,
|
|
"epoch": 1.1466447997813312,
|
|
"grad_norm": 0.369171249093992,
|
|
"learning_rate": 2.1998731858531777e-06,
|
|
"loss": 1.6235,
|
|
"mean_token_accuracy": 0.6596496284008027,
|
|
"num_tokens": 775075914.0,
|
|
"step": 8390
|
|
},
|
|
{
|
|
"entropy": 1.6671875,
|
|
"epoch": 1.1480114801148011,
|
|
"grad_norm": 0.2513152755041124,
|
|
"learning_rate": 2.196350570663661e-06,
|
|
"loss": 1.6834,
|
|
"mean_token_accuracy": 0.6523225009441376,
|
|
"num_tokens": 776010797.0,
|
|
"step": 8400
|
|
},
|
|
{
|
|
"entropy": 1.63515625,
|
|
"epoch": 1.1493781604482711,
|
|
"grad_norm": 0.2744650126919259,
|
|
"learning_rate": 2.192827955474144e-06,
|
|
"loss": 1.6243,
|
|
"mean_token_accuracy": 0.6621739983558654,
|
|
"num_tokens": 776912202.0,
|
|
"step": 8410
|
|
},
|
|
{
|
|
"entropy": 1.553125,
|
|
"epoch": 1.150744840781741,
|
|
"grad_norm": 0.1783833572746322,
|
|
"learning_rate": 2.1893053402846275e-06,
|
|
"loss": 1.5629,
|
|
"mean_token_accuracy": 0.6723062694072723,
|
|
"num_tokens": 777842284.0,
|
|
"step": 8420
|
|
},
|
|
{
|
|
"entropy": 1.67421875,
|
|
"epoch": 1.152111521115211,
|
|
"grad_norm": 0.21304258115317717,
|
|
"learning_rate": 2.1857827250951106e-06,
|
|
"loss": 1.6917,
|
|
"mean_token_accuracy": 0.6516682088375092,
|
|
"num_tokens": 778797950.0,
|
|
"step": 8430
|
|
},
|
|
{
|
|
"entropy": 1.675,
|
|
"epoch": 1.153478201448681,
|
|
"grad_norm": 0.17129004826706304,
|
|
"learning_rate": 2.182260109905594e-06,
|
|
"loss": 1.6963,
|
|
"mean_token_accuracy": 0.6477392315864563,
|
|
"num_tokens": 779720744.0,
|
|
"step": 8440
|
|
},
|
|
{
|
|
"entropy": 1.6328125,
|
|
"epoch": 1.154844881782151,
|
|
"grad_norm": 0.3773614723031313,
|
|
"learning_rate": 2.1787374947160777e-06,
|
|
"loss": 1.6246,
|
|
"mean_token_accuracy": 0.663664048910141,
|
|
"num_tokens": 780627604.0,
|
|
"step": 8450
|
|
},
|
|
{
|
|
"entropy": 1.584375,
|
|
"epoch": 1.156211562115621,
|
|
"grad_norm": 0.17635577323209087,
|
|
"learning_rate": 2.175214879526561e-06,
|
|
"loss": 1.5988,
|
|
"mean_token_accuracy": 0.6625190734863281,
|
|
"num_tokens": 781583768.0,
|
|
"step": 8460
},
{
"entropy": 1.65078125,
"epoch": 1.1575782424490912,
"grad_norm": 0.21424746603469588,
"learning_rate": 2.171692264337044e-06,
"loss": 1.6361,
"mean_token_accuracy": 0.6586150884628296,
"num_tokens": 782490527.0,
"step": 8470
},
{
"entropy": 1.59296875,
"epoch": 1.1589449227825612,
"grad_norm": 0.17317388496741337,
"learning_rate": 2.1681696491475275e-06,
"loss": 1.5897,
"mean_token_accuracy": 0.6653286755084992,
"num_tokens": 783390316.0,
"step": 8480
},
{
"entropy": 1.6203125,
"epoch": 1.1603116031160312,
"grad_norm": 0.18230673041924506,
"learning_rate": 2.1646470339580106e-06,
"loss": 1.6145,
"mean_token_accuracy": 0.6620845913887023,
"num_tokens": 784371920.0,
"step": 8490
},
{
"entropy": 1.6046875,
"epoch": 1.1616782834495012,
"grad_norm": 0.27479815578914873,
"learning_rate": 2.1611244187684937e-06,
"loss": 1.6136,
"mean_token_accuracy": 0.6633778631687164,
"num_tokens": 785282535.0,
"step": 8500
},
{
"entropy": 1.61875,
"epoch": 1.1630449637829712,
"grad_norm": 0.27197853442688436,
"learning_rate": 2.1576018035789773e-06,
"loss": 1.6224,
"mean_token_accuracy": 0.6602053463459014,
"num_tokens": 786240800.0,
"step": 8510
},
{
"entropy": 1.565625,
"epoch": 1.1644116441164412,
"grad_norm": 0.2732953446585046,
"learning_rate": 2.1540791883894604e-06,
"loss": 1.5574,
"mean_token_accuracy": 0.6682095050811767,
"num_tokens": 787109120.0,
"step": 8520
},
{
"entropy": 1.65625,
"epoch": 1.1657783244499111,
"grad_norm": 0.22314025656272019,
"learning_rate": 2.150556573199944e-06,
"loss": 1.6711,
"mean_token_accuracy": 0.6559431970119476,
"num_tokens": 788017028.0,
"step": 8530
},
{
|
|
"entropy": 1.59609375,
|
|
"epoch": 1.1671450047833811,
|
|
"grad_norm": 0.190450689587912,
|
|
"learning_rate": 2.147033958010427e-06,
|
|
"loss": 1.5963,
|
|
"mean_token_accuracy": 0.66424840092659,
|
|
"num_tokens": 788933277.0,
|
|
"step": 8540
|
|
},
|
|
{
|
|
"entropy": 1.646875,
|
|
"epoch": 1.168511685116851,
|
|
"grad_norm": 0.25835786576703546,
|
|
"learning_rate": 2.14351134282091e-06,
|
|
"loss": 1.6339,
|
|
"mean_token_accuracy": 0.6577525317668915,
|
|
"num_tokens": 789878859.0,
|
|
"step": 8550
|
|
},
|
|
{
|
|
"entropy": 1.5734375,
|
|
"epoch": 1.169878365450321,
|
|
"grad_norm": 0.16289485864430506,
|
|
"learning_rate": 2.1399887276313937e-06,
|
|
"loss": 1.5677,
|
|
"mean_token_accuracy": 0.6700462579727173,
|
|
"num_tokens": 790831728.0,
|
|
"step": 8560
|
|
},
|
|
{
|
|
"entropy": 1.65546875,
|
|
"epoch": 1.1712450457837913,
|
|
"grad_norm": 0.20103455581601723,
|
|
"learning_rate": 2.136466112441877e-06,
|
|
"loss": 1.6649,
|
|
"mean_token_accuracy": 0.6556553244590759,
|
|
"num_tokens": 791777254.0,
|
|
"step": 8570
|
|
},
|
|
{
|
|
"entropy": 1.603125,
|
|
"epoch": 1.1726117261172613,
|
|
"grad_norm": 0.21495815967715673,
|
|
"learning_rate": 2.1329434972523604e-06,
|
|
"loss": 1.621,
|
|
"mean_token_accuracy": 0.6603146433830261,
|
|
"num_tokens": 792700497.0,
|
|
"step": 8580
|
|
},
|
|
{
|
|
"entropy": 1.61640625,
|
|
"epoch": 1.1739784064507313,
|
|
"grad_norm": 0.21918398346103507,
|
|
"learning_rate": 2.1294208820628435e-06,
|
|
"loss": 1.6054,
|
|
"mean_token_accuracy": 0.6634316384792328,
|
|
"num_tokens": 793599703.0,
|
|
"step": 8590
|
|
},
|
|
{
|
|
"entropy": 1.675,
|
|
"epoch": 1.1753450867842012,
|
|
"grad_norm": 0.20873054297262225,
|
|
"learning_rate": 2.125898266873327e-06,
|
|
"loss": 1.6852,
|
|
"mean_token_accuracy": 0.6481540262699127,
|
|
"num_tokens": 794573490.0,
|
|
"step": 8600
|
|
},
|
|
{
|
|
"entropy": 1.6671875,
|
|
"epoch": 1.1767117671176712,
|
|
"grad_norm": 0.23032929581648626,
|
|
"learning_rate": 2.12237565168381e-06,
|
|
"loss": 1.6779,
|
|
"mean_token_accuracy": 0.6518541753292084,
|
|
"num_tokens": 795509661.0,
|
|
"step": 8610
|
|
},
|
|
{
|
|
"entropy": 1.63828125,
|
|
"epoch": 1.1780784474511412,
|
|
"grad_norm": 0.25561907714163506,
|
|
"learning_rate": 2.1188530364942938e-06,
|
|
"loss": 1.6425,
|
|
"mean_token_accuracy": 0.656450879573822,
|
|
"num_tokens": 796433548.0,
|
|
"step": 8620
|
|
},
|
|
{
|
|
"entropy": 1.53828125,
|
|
"epoch": 1.1794451277846112,
|
|
"grad_norm": 0.18313584907198335,
|
|
"learning_rate": 2.115330421304777e-06,
|
|
"loss": 1.5571,
|
|
"mean_token_accuracy": 0.6684423804283142,
|
|
"num_tokens": 797350324.0,
|
|
"step": 8630
|
|
},
|
|
{
|
|
"entropy": 1.61953125,
|
|
"epoch": 1.1808118081180812,
|
|
"grad_norm": 0.17351443418831938,
|
|
"learning_rate": 2.1118078061152604e-06,
|
|
"loss": 1.6261,
|
|
"mean_token_accuracy": 0.6636813998222351,
|
|
"num_tokens": 798320084.0,
|
|
"step": 8640
|
|
},
|
|
{
|
|
"entropy": 1.5921875,
|
|
"epoch": 1.1821784884515512,
|
|
"grad_norm": 0.20768719603236122,
|
|
"learning_rate": 2.1082851909257435e-06,
|
|
"loss": 1.624,
|
|
"mean_token_accuracy": 0.660314291715622,
|
|
"num_tokens": 799256877.0,
|
|
"step": 8650
|
|
},
|
|
{
|
|
"entropy": 1.59375,
|
|
"epoch": 1.1835451687850211,
|
|
"grad_norm": 0.2213452420932683,
|
|
"learning_rate": 2.1047625757362267e-06,
|
|
"loss": 1.6041,
|
|
"mean_token_accuracy": 0.66301149725914,
|
|
"num_tokens": 800205953.0,
|
|
"step": 8660
|
|
},
|
|
{
|
|
"entropy": 1.62109375,
|
|
"epoch": 1.1849118491184911,
|
|
"grad_norm": 0.2302215292325758,
|
|
"learning_rate": 2.1012399605467102e-06,
|
|
"loss": 1.6498,
|
|
"mean_token_accuracy": 0.6577127754688263,
|
|
"num_tokens": 801097835.0,
|
|
"step": 8670
|
|
},
|
|
{
|
|
"entropy": 1.65703125,
|
|
"epoch": 1.186278529451961,
|
|
"grad_norm": 0.1821582010234677,
|
|
"learning_rate": 2.0977173453571933e-06,
|
|
"loss": 1.6718,
|
|
"mean_token_accuracy": 0.6562166333198547,
|
|
"num_tokens": 801983354.0,
|
|
"step": 8680
|
|
},
|
|
{
|
|
"entropy": 1.65859375,
|
|
"epoch": 1.187645209785431,
|
|
"grad_norm": 0.17780473516521159,
|
|
"learning_rate": 2.0941947301676765e-06,
|
|
"loss": 1.6732,
|
|
"mean_token_accuracy": 0.6533462882041932,
|
|
"num_tokens": 802863350.0,
|
|
"step": 8690
|
|
},
|
|
{
|
|
"entropy": 1.5828125,
|
|
"epoch": 1.189011890118901,
|
|
"grad_norm": 0.2353857648208466,
|
|
"learning_rate": 2.09067211497816e-06,
|
|
"loss": 1.5778,
|
|
"mean_token_accuracy": 0.664344996213913,
|
|
"num_tokens": 803795245.0,
|
|
"step": 8700
|
|
},
|
|
{
|
|
"entropy": 1.6140625,
|
|
"epoch": 1.1903785704523713,
|
|
"grad_norm": 0.2765558901223689,
|
|
"learning_rate": 2.087149499788643e-06,
|
|
"loss": 1.615,
|
|
"mean_token_accuracy": 0.6586447954177856,
|
|
"num_tokens": 804709619.0,
|
|
"step": 8710
|
|
},
|
|
{
|
|
"entropy": 1.65859375,
|
|
"epoch": 1.1917452507858413,
|
|
"grad_norm": 0.20953813395085913,
|
|
"learning_rate": 2.0836268845991263e-06,
|
|
"loss": 1.6682,
|
|
"mean_token_accuracy": 0.6506433308124542,
|
|
"num_tokens": 805627263.0,
|
|
"step": 8720
|
|
},
|
|
{
|
|
"entropy": 1.640625,
|
|
"epoch": 1.1931119311193112,
|
|
"grad_norm": 0.19639459272622498,
|
|
"learning_rate": 2.08010426940961e-06,
|
|
"loss": 1.662,
|
|
"mean_token_accuracy": 0.6567970454692841,
|
|
"num_tokens": 806581795.0,
|
|
"step": 8730
|
|
},
|
|
{
|
|
"entropy": 1.62578125,
|
|
"epoch": 1.1944786114527812,
|
|
"grad_norm": 0.2879644167629232,
|
|
"learning_rate": 2.076581654220093e-06,
|
|
"loss": 1.6474,
|
|
"mean_token_accuracy": 0.6576579630374908,
|
|
"num_tokens": 807513303.0,
|
|
"step": 8740
|
|
},
|
|
{
|
|
"entropy": 1.58828125,
|
|
"epoch": 1.1958452917862512,
|
|
"grad_norm": 0.3146679970367737,
|
|
"learning_rate": 2.0730590390305765e-06,
|
|
"loss": 1.591,
|
|
"mean_token_accuracy": 0.6660410642623902,
|
|
"num_tokens": 808427679.0,
|
|
"step": 8750
|
|
},
|
|
{
|
|
"entropy": 1.64296875,
|
|
"epoch": 1.1972119721197212,
|
|
"grad_norm": 0.20275393797587254,
|
|
"learning_rate": 2.0695364238410596e-06,
|
|
"loss": 1.6485,
|
|
"mean_token_accuracy": 0.6568281829357148,
|
|
"num_tokens": 809402753.0,
|
|
"step": 8760
|
|
},
|
|
{
|
|
"entropy": 1.66015625,
|
|
"epoch": 1.1985786524531912,
|
|
"grad_norm": 0.21603031830721212,
|
|
"learning_rate": 2.066013808651543e-06,
|
|
"loss": 1.6803,
|
|
"mean_token_accuracy": 0.6502882599830627,
|
|
"num_tokens": 810321187.0,
|
|
"step": 8770
|
|
},
|
|
{
|
|
"entropy": 1.6578125,
|
|
"epoch": 1.1999453327866612,
|
|
"grad_norm": 0.19150381476750727,
|
|
"learning_rate": 2.0624911934620263e-06,
|
|
"loss": 1.6524,
|
|
"mean_token_accuracy": 0.6555010437965393,
|
|
"num_tokens": 811205950.0,
|
|
"step": 8780
|
|
},
|
|
{
|
|
"entropy": 1.68828125,
|
|
"epoch": 1.2013120131201311,
|
|
"grad_norm": 0.17720895834108683,
|
|
"learning_rate": 2.05896857827251e-06,
|
|
"loss": 1.6916,
|
|
"mean_token_accuracy": 0.6471143066883087,
|
|
"num_tokens": 812168379.0,
|
|
"step": 8790
|
|
},
|
|
{
|
|
"entropy": 1.59375,
|
|
"epoch": 1.2026786934536011,
|
|
"grad_norm": 0.20321687854989065,
|
|
"learning_rate": 2.055445963082993e-06,
|
|
"loss": 1.6001,
|
|
"mean_token_accuracy": 0.6660366177558898,
|
|
"num_tokens": 813080253.0,
|
|
"step": 8800
|
|
},
|
|
{
|
|
"entropy": 1.59765625,
|
|
"epoch": 1.2040453737870713,
|
|
"grad_norm": 0.19439277587242884,
|
|
"learning_rate": 2.0519233478934765e-06,
|
|
"loss": 1.6088,
|
|
"mean_token_accuracy": 0.6622526407241821,
|
|
"num_tokens": 814025778.0,
|
|
"step": 8810
|
|
},
|
|
{
|
|
"entropy": 1.64609375,
|
|
"epoch": 1.2054120541205413,
|
|
"grad_norm": 0.1637755004510148,
|
|
"learning_rate": 2.0484007327039596e-06,
|
|
"loss": 1.6583,
|
|
"mean_token_accuracy": 0.653345400094986,
|
|
"num_tokens": 814932209.0,
|
|
"step": 8820
|
|
},
|
|
{
|
|
"entropy": 1.62421875,
|
|
"epoch": 1.2067787344540113,
|
|
"grad_norm": 0.22927049961194002,
|
|
"learning_rate": 2.0448781175144427e-06,
|
|
"loss": 1.6208,
|
|
"mean_token_accuracy": 0.6614237844944,
|
|
"num_tokens": 815852930.0,
|
|
"step": 8830
|
|
},
|
|
{
|
|
"entropy": 1.60078125,
|
|
"epoch": 1.2081454147874813,
|
|
"grad_norm": 0.20471417631621994,
|
|
"learning_rate": 2.0413555023249263e-06,
|
|
"loss": 1.6123,
|
|
"mean_token_accuracy": 0.6624108135700226,
|
|
"num_tokens": 816785199.0,
|
|
"step": 8840
|
|
},
|
|
{
|
|
"entropy": 1.6125,
|
|
"epoch": 1.2095120951209513,
|
|
"grad_norm": 0.22565318433849865,
|
|
"learning_rate": 2.0378328871354094e-06,
|
|
"loss": 1.6174,
|
|
"mean_token_accuracy": 0.6596987783908844,
|
|
"num_tokens": 817737427.0,
|
|
"step": 8850
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 1.2108787754544212,
|
|
"grad_norm": 0.24278826036004195,
|
|
"learning_rate": 2.034310271945893e-06,
|
|
"loss": 1.6447,
|
|
"mean_token_accuracy": 0.6587843537330628,
|
|
"num_tokens": 818706469.0,
|
|
"step": 8860
|
|
},
|
|
{
|
|
"entropy": 1.58828125,
|
|
"epoch": 1.2122454557878912,
|
|
"grad_norm": 0.18842810019268028,
|
|
"learning_rate": 2.030787656756376e-06,
|
|
"loss": 1.6002,
|
|
"mean_token_accuracy": 0.6608192265033722,
|
|
"num_tokens": 819615193.0,
|
|
"step": 8870
|
|
},
|
|
{
|
|
"entropy": 1.5640625,
|
|
"epoch": 1.2136121361213612,
|
|
"grad_norm": 0.1574685102697622,
|
|
"learning_rate": 2.027265041566859e-06,
|
|
"loss": 1.592,
|
|
"mean_token_accuracy": 0.666092050075531,
|
|
"num_tokens": 820538266.0,
|
|
"step": 8880
|
|
},
|
|
{
|
|
"entropy": 1.59609375,
|
|
"epoch": 1.2149788164548312,
|
|
"grad_norm": 0.2027249129848444,
|
|
"learning_rate": 2.0237424263773427e-06,
|
|
"loss": 1.5952,
|
|
"mean_token_accuracy": 0.6665946066379547,
|
|
"num_tokens": 821425948.0,
|
|
"step": 8890
|
|
},
|
|
{
|
|
"entropy": 1.5375,
|
|
"epoch": 1.2163454967883012,
|
|
"grad_norm": 0.1583749632398129,
|
|
"learning_rate": 2.020219811187826e-06,
|
|
"loss": 1.5487,
|
|
"mean_token_accuracy": 0.6713702499866485,
|
|
"num_tokens": 822375316.0,
|
|
"step": 8900
|
|
},
|
|
{
|
|
"entropy": 1.6515625,
|
|
"epoch": 1.2177121771217712,
|
|
"grad_norm": 0.20488783687768714,
|
|
"learning_rate": 2.0166971959983094e-06,
|
|
"loss": 1.6736,
|
|
"mean_token_accuracy": 0.6502883672714234,
|
|
"num_tokens": 823310733.0,
|
|
"step": 8910
|
|
},
|
|
{
|
|
"entropy": 1.59296875,
|
|
"epoch": 1.2190788574552411,
|
|
"grad_norm": 0.24209340879370717,
|
|
"learning_rate": 2.0131745808087925e-06,
|
|
"loss": 1.6019,
|
|
"mean_token_accuracy": 0.661716777086258,
|
|
"num_tokens": 824241398.0,
|
|
"step": 8920
|
|
},
|
|
{
|
|
"entropy": 1.62421875,
|
|
"epoch": 1.2204455377887111,
|
|
"grad_norm": 0.2301107695415495,
|
|
"learning_rate": 2.009651965619276e-06,
|
|
"loss": 1.6328,
|
|
"mean_token_accuracy": 0.6568999052047729,
|
|
"num_tokens": 825163330.0,
|
|
"step": 8930
|
|
},
|
|
{
|
|
"entropy": 1.68984375,
|
|
"epoch": 1.221812218122181,
|
|
"grad_norm": 0.2774396832043037,
|
|
"learning_rate": 2.0061293504297592e-06,
|
|
"loss": 1.7009,
|
|
"mean_token_accuracy": 0.6479405879974365,
|
|
"num_tokens": 826106651.0,
|
|
"step": 8940
|
|
},
|
|
{
|
|
"entropy": 1.61875,
|
|
"epoch": 1.2231788984556513,
|
|
"grad_norm": 0.2184256098121147,
|
|
"learning_rate": 2.0026067352402428e-06,
|
|
"loss": 1.6236,
|
|
"mean_token_accuracy": 0.659999543428421,
|
|
"num_tokens": 827032881.0,
|
|
"step": 8950
|
|
},
|
|
{
|
|
"entropy": 1.6484375,
|
|
"epoch": 1.2245455787891213,
|
|
"grad_norm": 0.21160156969951463,
|
|
"learning_rate": 1.999084120050726e-06,
|
|
"loss": 1.6677,
|
|
"mean_token_accuracy": 0.6536165952682496,
|
|
"num_tokens": 827961657.0,
|
|
"step": 8960
|
|
},
|
|
{
|
|
"entropy": 1.58046875,
|
|
"epoch": 1.2259122591225913,
|
|
"grad_norm": 0.1709412736610459,
|
|
"learning_rate": 1.9955615048612094e-06,
|
|
"loss": 1.5687,
|
|
"mean_token_accuracy": 0.6690698683261871,
|
|
"num_tokens": 828882025.0,
|
|
"step": 8970
|
|
},
|
|
{
|
|
"entropy": 1.61953125,
|
|
"epoch": 1.2272789394560613,
|
|
"grad_norm": 0.2652186560864596,
|
|
"learning_rate": 1.9920388896716926e-06,
|
|
"loss": 1.6264,
|
|
"mean_token_accuracy": 0.6602205038070679,
|
|
"num_tokens": 829818207.0,
|
|
"step": 8980
|
|
},
|
|
{
|
|
"entropy": 1.621875,
|
|
"epoch": 1.2286456197895312,
|
|
"grad_norm": 0.29243834229580173,
|
|
"learning_rate": 1.9885162744821757e-06,
|
|
"loss": 1.6334,
|
|
"mean_token_accuracy": 0.6616554439067841,
|
|
"num_tokens": 830745534.0,
|
|
"step": 8990
|
|
},
|
|
{
|
|
"entropy": 1.615625,
|
|
"epoch": 1.2300123001230012,
|
|
"grad_norm": 0.22412497970962084,
|
|
"learning_rate": 1.9849936592926592e-06,
|
|
"loss": 1.6215,
|
|
"mean_token_accuracy": 0.6604977607727051,
|
|
"num_tokens": 831606742.0,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"entropy": 1.625,
|
|
"epoch": 1.2313789804564712,
|
|
"grad_norm": 0.17371222238474207,
|
|
"learning_rate": 1.9814710441031424e-06,
|
|
"loss": 1.6141,
|
|
"mean_token_accuracy": 0.6604167997837067,
|
|
"num_tokens": 832516112.0,
|
|
"step": 9010
|
|
},
|
|
{
|
|
"entropy": 1.59375,
|
|
"epoch": 1.2327456607899412,
|
|
"grad_norm": 0.19859587417358318,
|
|
"learning_rate": 1.9779484289136255e-06,
|
|
"loss": 1.584,
|
|
"mean_token_accuracy": 0.6687736809253693,
|
|
"num_tokens": 833415752.0,
|
|
"step": 9020
|
|
},
|
|
{
|
|
"entropy": 1.621875,
|
|
"epoch": 1.2341123411234112,
|
|
"grad_norm": 0.23765011984260132,
|
|
"learning_rate": 1.974425813724109e-06,
|
|
"loss": 1.6372,
|
|
"mean_token_accuracy": 0.6581868946552276,
|
|
"num_tokens": 834352245.0,
|
|
"step": 9030
|
|
},
|
|
{
|
|
"entropy": 1.60078125,
|
|
"epoch": 1.2354790214568812,
|
|
"grad_norm": 0.1677057777580782,
|
|
"learning_rate": 1.970903198534592e-06,
|
|
"loss": 1.6203,
|
|
"mean_token_accuracy": 0.6601116538047791,
|
|
"num_tokens": 835294849.0,
|
|
"step": 9040
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 1.2368457017903514,
|
|
"grad_norm": 0.17880019901471705,
|
|
"learning_rate": 1.9673805833450753e-06,
|
|
"loss": 1.6362,
|
|
"mean_token_accuracy": 0.6557646632194519,
|
|
"num_tokens": 836233134.0,
|
|
"step": 9050
|
|
},
|
|
{
|
|
"entropy": 1.63828125,
|
|
"epoch": 1.2382123821238213,
|
|
"grad_norm": 0.1651044216576436,
|
|
"learning_rate": 1.963857968155559e-06,
|
|
"loss": 1.6421,
|
|
"mean_token_accuracy": 0.6568689227104187,
|
|
"num_tokens": 837179118.0,
|
|
"step": 9060
|
|
},
|
|
{
|
|
"entropy": 1.61484375,
|
|
"epoch": 1.2395790624572913,
|
|
"grad_norm": 0.1892434739269766,
|
|
"learning_rate": 1.960335352966042e-06,
|
|
"loss": 1.6068,
|
|
"mean_token_accuracy": 0.6666494071483612,
|
|
"num_tokens": 838104151.0,
|
|
"step": 9070
|
|
},
|
|
{
|
|
"entropy": 1.61953125,
|
|
"epoch": 1.2409457427907613,
|
|
"grad_norm": 0.25298035681582975,
|
|
"learning_rate": 1.9568127377765255e-06,
|
|
"loss": 1.6265,
|
|
"mean_token_accuracy": 0.6570579469203949,
|
|
"num_tokens": 839054587.0,
|
|
"step": 9080
|
|
},
|
|
{
|
|
"entropy": 1.634375,
|
|
"epoch": 1.2423124231242313,
|
|
"grad_norm": 0.20417017106899749,
|
|
"learning_rate": 1.9532901225870086e-06,
|
|
"loss": 1.6371,
|
|
"mean_token_accuracy": 0.6587815880775452,
|
|
"num_tokens": 840025689.0,
|
|
"step": 9090
|
|
},
|
|
{
|
|
"entropy": 1.58046875,
|
|
"epoch": 1.2436791034577013,
|
|
"grad_norm": 0.20029046022576671,
|
|
"learning_rate": 1.949767507397492e-06,
|
|
"loss": 1.5844,
|
|
"mean_token_accuracy": 0.6637799084186554,
|
|
"num_tokens": 840925150.0,
|
|
"step": 9100
|
|
},
|
|
{
|
|
"entropy": 1.63828125,
|
|
"epoch": 1.2450457837911713,
|
|
"grad_norm": 0.16368749723694231,
|
|
"learning_rate": 1.9462448922079753e-06,
|
|
"loss": 1.6517,
|
|
"mean_token_accuracy": 0.6554785966873169,
|
|
"num_tokens": 841841474.0,
|
|
"step": 9110
|
|
},
|
|
{
|
|
"entropy": 1.59921875,
|
|
"epoch": 1.2464124641246412,
|
|
"grad_norm": 0.160218597699072,
|
|
"learning_rate": 1.942722277018459e-06,
|
|
"loss": 1.6081,
|
|
"mean_token_accuracy": 0.6653038322925567,
|
|
"num_tokens": 842738532.0,
|
|
"step": 9120
|
|
},
|
|
{
|
|
"entropy": 1.659375,
|
|
"epoch": 1.2477791444581112,
|
|
"grad_norm": 0.15484461743028702,
|
|
"learning_rate": 1.939199661828942e-06,
|
|
"loss": 1.674,
|
|
"mean_token_accuracy": 0.6507823765277863,
|
|
"num_tokens": 843649532.0,
|
|
"step": 9130
|
|
},
|
|
{
|
|
"entropy": 1.51953125,
|
|
"epoch": 1.2491458247915812,
|
|
"grad_norm": 0.17238318371455416,
|
|
"learning_rate": 1.9356770466394255e-06,
|
|
"loss": 1.5277,
|
|
"mean_token_accuracy": 0.6750833988189697,
|
|
"num_tokens": 844505400.0,
|
|
"step": 9140
|
|
},
|
|
{
|
|
"entropy": 1.565625,
|
|
"epoch": 1.2505125051250512,
|
|
"grad_norm": 0.20338914999980548,
|
|
"learning_rate": 1.9321544314499086e-06,
|
|
"loss": 1.5694,
|
|
"mean_token_accuracy": 0.6680317103862763,
|
|
"num_tokens": 845420061.0,
|
|
"step": 9150
|
|
},
|
|
{
|
|
"entropy": 1.609375,
|
|
"epoch": 1.2518791854585212,
|
|
"grad_norm": 0.2632005741484675,
|
|
"learning_rate": 1.9286318162603917e-06,
|
|
"loss": 1.6333,
|
|
"mean_token_accuracy": 0.6569846093654632,
|
|
"num_tokens": 846361607.0,
|
|
"step": 9160
|
|
},
|
|
{
|
|
"entropy": 1.55234375,
|
|
"epoch": 1.2532458657919912,
|
|
"grad_norm": 0.20609040559379802,
|
|
"learning_rate": 1.9251092010708753e-06,
|
|
"loss": 1.563,
|
|
"mean_token_accuracy": 0.6715534329414368,
|
|
"num_tokens": 847283351.0,
|
|
"step": 9170
|
|
},
|
|
{
|
|
"entropy": 1.63203125,
|
|
"epoch": 1.2546125461254611,
|
|
"grad_norm": 0.20572889609637235,
|
|
"learning_rate": 1.9215865858813584e-06,
|
|
"loss": 1.6379,
|
|
"mean_token_accuracy": 0.6560554146766663,
|
|
"num_tokens": 848201779.0,
|
|
"step": 9180
|
|
},
|
|
{
|
|
"entropy": 1.61953125,
|
|
"epoch": 1.2559792264589311,
|
|
"grad_norm": 0.20742216328660026,
|
|
"learning_rate": 1.918063970691842e-06,
|
|
"loss": 1.6404,
|
|
"mean_token_accuracy": 0.6557006955146789,
|
|
"num_tokens": 849137352.0,
|
|
"step": 9190
|
|
},
|
|
{
|
|
"entropy": 1.6734375,
|
|
"epoch": 1.2573459067924013,
|
|
"grad_norm": 0.23093267939893064,
|
|
"learning_rate": 1.914541355502325e-06,
|
|
"loss": 1.6497,
|
|
"mean_token_accuracy": 0.6548084676265716,
|
|
"num_tokens": 850016230.0,
|
|
"step": 9200
|
|
},
|
|
{
|
|
"entropy": 1.59765625,
|
|
"epoch": 1.2587125871258713,
|
|
"grad_norm": 0.2662282794892419,
|
|
"learning_rate": 1.9110187403128082e-06,
|
|
"loss": 1.596,
|
|
"mean_token_accuracy": 0.6626170933246612,
|
|
"num_tokens": 850929679.0,
|
|
"step": 9210
|
|
},
|
|
{
|
|
"entropy": 1.5796875,
|
|
"epoch": 1.2600792674593413,
|
|
"grad_norm": 0.19655015540443818,
|
|
"learning_rate": 1.9074961251232918e-06,
|
|
"loss": 1.5976,
|
|
"mean_token_accuracy": 0.6658562600612641,
|
|
"num_tokens": 851901987.0,
|
|
"step": 9220
|
|
},
|
|
{
|
|
"entropy": 1.6015625,
|
|
"epoch": 1.2614459477928113,
|
|
"grad_norm": 0.19734809760492972,
|
|
"learning_rate": 1.903973509933775e-06,
|
|
"loss": 1.5943,
|
|
"mean_token_accuracy": 0.6616030931472778,
|
|
"num_tokens": 852807729.0,
|
|
"step": 9230
|
|
},
|
|
{
|
|
"entropy": 1.66484375,
|
|
"epoch": 1.2628126281262813,
|
|
"grad_norm": 0.2015648584931164,
|
|
"learning_rate": 1.9004508947442582e-06,
|
|
"loss": 1.6704,
|
|
"mean_token_accuracy": 0.6523842453956604,
|
|
"num_tokens": 853704734.0,
|
|
"step": 9240
|
|
},
|
|
{
|
|
"entropy": 1.571875,
|
|
"epoch": 1.2641793084597512,
|
|
"grad_norm": 0.19514371421674612,
|
|
"learning_rate": 1.8969282795547418e-06,
|
|
"loss": 1.5941,
|
|
"mean_token_accuracy": 0.6639213621616363,
|
|
"num_tokens": 854605384.0,
|
|
"step": 9250
|
|
},
|
|
{
|
|
"entropy": 1.62578125,
|
|
"epoch": 1.2655459887932212,
|
|
"grad_norm": 0.1813248679483005,
|
|
"learning_rate": 1.8934056643652249e-06,
|
|
"loss": 1.6454,
|
|
"mean_token_accuracy": 0.6560372114181519,
|
|
"num_tokens": 855518283.0,
|
|
"step": 9260
|
|
},
|
|
{
|
|
"entropy": 1.60859375,
|
|
"epoch": 1.2669126691266912,
|
|
"grad_norm": 0.2316079467519023,
|
|
"learning_rate": 1.889883049175708e-06,
|
|
"loss": 1.6103,
|
|
"mean_token_accuracy": 0.6644162654876709,
|
|
"num_tokens": 856427351.0,
|
|
"step": 9270
|
|
},
|
|
{
|
|
"entropy": 1.57109375,
|
|
"epoch": 1.2682793494601612,
|
|
"grad_norm": 0.20014208506565143,
|
|
"learning_rate": 1.8863604339861916e-06,
|
|
"loss": 1.5723,
|
|
"mean_token_accuracy": 0.6660362184047699,
|
|
"num_tokens": 857348420.0,
|
|
"step": 9280
|
|
},
|
|
{
|
|
"entropy": 1.60859375,
|
|
"epoch": 1.2696460297936314,
|
|
"grad_norm": 0.19105854571032851,
|
|
"learning_rate": 1.8828378187966747e-06,
|
|
"loss": 1.6245,
|
|
"mean_token_accuracy": 0.6611084461212158,
|
|
"num_tokens": 858289770.0,
|
|
"step": 9290
|
|
},
|
|
{
|
|
"entropy": 1.628125,
|
|
"epoch": 1.2710127101271014,
|
|
"grad_norm": 0.20370110548244957,
|
|
"learning_rate": 1.8793152036071582e-06,
|
|
"loss": 1.6394,
|
|
"mean_token_accuracy": 0.6592767357826232,
|
|
"num_tokens": 859204743.0,
|
|
"step": 9300
|
|
},
|
|
{
|
|
"entropy": 1.63046875,
|
|
"epoch": 1.2723793904605714,
|
|
"grad_norm": 0.17395567804906178,
|
|
"learning_rate": 1.8757925884176414e-06,
|
|
"loss": 1.6417,
|
|
"mean_token_accuracy": 0.6541674554347991,
|
|
"num_tokens": 860137113.0,
|
|
"step": 9310
|
|
},
|
|
{
|
|
"entropy": 1.6984375,
|
|
"epoch": 1.2737460707940413,
|
|
"grad_norm": 0.24460226115535882,
|
|
"learning_rate": 1.8722699732281247e-06,
|
|
"loss": 1.717,
|
|
"mean_token_accuracy": 0.642821341753006,
|
|
"num_tokens": 861116485.0,
|
|
"step": 9320
|
|
},
|
|
{
|
|
"entropy": 1.66015625,
|
|
"epoch": 1.2751127511275113,
|
|
"grad_norm": 0.15597830886889097,
|
|
"learning_rate": 1.868747358038608e-06,
|
|
"loss": 1.6619,
|
|
"mean_token_accuracy": 0.6540095269680023,
|
|
"num_tokens": 862025749.0,
|
|
"step": 9330
|
|
},
|
|
{
|
|
"entropy": 1.57109375,
|
|
"epoch": 1.2764794314609813,
|
|
"grad_norm": 0.20498149463716128,
|
|
"learning_rate": 1.8652247428490914e-06,
|
|
"loss": 1.5766,
|
|
"mean_token_accuracy": 0.6698024213314057,
|
|
"num_tokens": 862947743.0,
|
|
"step": 9340
|
|
},
|
|
{
|
|
"entropy": 1.61875,
|
|
"epoch": 1.2778461117944513,
|
|
"grad_norm": 0.19943962567010154,
|
|
"learning_rate": 1.8617021276595745e-06,
|
|
"loss": 1.639,
|
|
"mean_token_accuracy": 0.6588376522064209,
|
|
"num_tokens": 863856579.0,
|
|
"step": 9350
|
|
},
|
|
{
|
|
"entropy": 1.58515625,
|
|
"epoch": 1.2792127921279213,
|
|
"grad_norm": 0.23695035418112073,
|
|
"learning_rate": 1.858179512470058e-06,
|
|
"loss": 1.603,
|
|
"mean_token_accuracy": 0.6660663068294526,
|
|
"num_tokens": 864776989.0,
|
|
"step": 9360
|
|
},
|
|
{
|
|
"entropy": 1.56015625,
|
|
"epoch": 1.2805794724613913,
|
|
"grad_norm": 0.1699793952161012,
|
|
"learning_rate": 1.8546568972805412e-06,
|
|
"loss": 1.5695,
|
|
"mean_token_accuracy": 0.6680994749069213,
|
|
"num_tokens": 865755999.0,
|
|
"step": 9370
|
|
},
|
|
{
|
|
"entropy": 1.57578125,
|
|
"epoch": 1.2819461527948612,
|
|
"grad_norm": 0.21864445372649502,
|
|
"learning_rate": 1.8511342820910245e-06,
|
|
"loss": 1.5822,
|
|
"mean_token_accuracy": 0.6645114123821259,
|
|
"num_tokens": 866672674.0,
|
|
"step": 9380
|
|
},
|
|
{
|
|
"entropy": 1.57890625,
|
|
"epoch": 1.2833128331283312,
|
|
"grad_norm": 0.1939498949665438,
|
|
"learning_rate": 1.8476116669015078e-06,
|
|
"loss": 1.5874,
|
|
"mean_token_accuracy": 0.667881327867508,
|
|
"num_tokens": 867610343.0,
|
|
"step": 9390
|
|
},
|
|
{
|
|
"entropy": 1.54609375,
|
|
"epoch": 1.2846795134618012,
|
|
"grad_norm": 0.18585330910538633,
|
|
"learning_rate": 1.8440890517119912e-06,
|
|
"loss": 1.5464,
|
|
"mean_token_accuracy": 0.6718300104141235,
|
|
"num_tokens": 868542024.0,
|
|
"step": 9400
|
|
},
|
|
{
|
|
"entropy": 1.58046875,
|
|
"epoch": 1.2860461937952712,
|
|
"grad_norm": 0.2857786362264477,
|
|
"learning_rate": 1.8405664365224743e-06,
|
|
"loss": 1.5857,
|
|
"mean_token_accuracy": 0.665126496553421,
|
|
"num_tokens": 869428307.0,
|
|
"step": 9410
|
|
},
|
|
{
|
|
"entropy": 1.6625,
|
|
"epoch": 1.2874128741287412,
|
|
"grad_norm": 0.26704438607045844,
|
|
"learning_rate": 1.8370438213329578e-06,
|
|
"loss": 1.6927,
|
|
"mean_token_accuracy": 0.6495187878608704,
|
|
"num_tokens": 870403927.0,
|
|
"step": 9420
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 1.2887795544622112,
|
|
"grad_norm": 0.21106115794797997,
|
|
"learning_rate": 1.833521206143441e-06,
|
|
"loss": 1.6362,
|
|
"mean_token_accuracy": 0.6589071989059448,
|
|
"num_tokens": 871331579.0,
|
|
"step": 9430
|
|
},
|
|
{
|
|
"entropy": 1.65703125,
|
|
"epoch": 1.2901462347956814,
|
|
"grad_norm": 0.1551969682785923,
|
|
"learning_rate": 1.8299985909539245e-06,
|
|
"loss": 1.6562,
|
|
"mean_token_accuracy": 0.6535547494888305,
|
|
"num_tokens": 872219931.0,
|
|
"step": 9440
|
|
},
|
|
{
|
|
"entropy": 1.625,
|
|
"epoch": 1.2915129151291513,
|
|
"grad_norm": 0.29566247685292124,
|
|
"learning_rate": 1.8264759757644076e-06,
|
|
"loss": 1.6236,
|
|
"mean_token_accuracy": 0.6591123044490814,
|
|
"num_tokens": 873135634.0,
|
|
"step": 9450
|
|
},
|
|
{
|
|
"entropy": 1.56484375,
|
|
"epoch": 1.2928795954626213,
|
|
"grad_norm": 0.23527997793753228,
|
|
"learning_rate": 1.822953360574891e-06,
|
|
"loss": 1.5686,
|
|
"mean_token_accuracy": 0.6678933501243591,
|
|
"num_tokens": 874017491.0,
|
|
"step": 9460
|
|
},
|
|
{
|
|
"entropy": 1.56796875,
|
|
"epoch": 1.2942462757960913,
|
|
"grad_norm": 0.1933609868618196,
|
|
"learning_rate": 1.8194307453853743e-06,
|
|
"loss": 1.5663,
|
|
"mean_token_accuracy": 0.6694317996501923,
|
|
"num_tokens": 874929770.0,
|
|
"step": 9470
|
|
},
|
|
{
|
|
"entropy": 1.65859375,
|
|
"epoch": 1.2956129561295613,
|
|
"grad_norm": 0.22525213477928008,
|
|
"learning_rate": 1.8159081301958576e-06,
|
|
"loss": 1.6666,
|
|
"mean_token_accuracy": 0.6554764986038208,
|
|
"num_tokens": 875872522.0,
|
|
"step": 9480
|
|
},
|
|
{
|
|
"entropy": 1.596875,
|
|
"epoch": 1.2969796364630313,
|
|
"grad_norm": 0.15980595706034278,
|
|
"learning_rate": 1.8123855150063408e-06,
|
|
"loss": 1.6058,
|
|
"mean_token_accuracy": 0.6622600555419922,
|
|
"num_tokens": 876811590.0,
|
|
"step": 9490
|
|
},
|
|
{
|
|
"entropy": 1.5953125,
|
|
"epoch": 1.2983463167965013,
|
|
"grad_norm": 0.20084445144906724,
|
|
"learning_rate": 1.8088628998168243e-06,
|
|
"loss": 1.6206,
|
|
"mean_token_accuracy": 0.6614287257194519,
|
|
"num_tokens": 877715892.0,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"entropy": 1.54765625,
|
|
"epoch": 1.2997129971299712,
|
|
"grad_norm": 0.3792623241464484,
|
|
"learning_rate": 1.8053402846273074e-06,
|
|
"loss": 1.5477,
|
|
"mean_token_accuracy": 0.6698496401309967,
|
|
"num_tokens": 878628621.0,
|
|
"step": 9510
|
|
},
|
|
{
|
|
"entropy": 1.534375,
|
|
"epoch": 1.3010796774634412,
|
|
"grad_norm": 0.2043442864049843,
|
|
"learning_rate": 1.8018176694377906e-06,
|
|
"loss": 1.5449,
|
|
"mean_token_accuracy": 0.6713538408279419,
|
|
"num_tokens": 879531956.0,
|
|
"step": 9520
|
|
},
|
|
{
|
|
"entropy": 1.6359375,
|
|
"epoch": 1.3024463577969114,
|
|
"grad_norm": 0.17252486996490007,
|
|
"learning_rate": 1.798295054248274e-06,
|
|
"loss": 1.6591,
|
|
"mean_token_accuracy": 0.655505484342575,
|
|
"num_tokens": 880422016.0,
|
|
"step": 9530
|
|
},
|
|
{
|
|
"entropy": 1.57578125,
|
|
"epoch": 1.3038130381303814,
|
|
"grad_norm": 0.21960690013353423,
|
|
"learning_rate": 1.7947724390587572e-06,
|
|
"loss": 1.5852,
|
|
"mean_token_accuracy": 0.6630842030048371,
|
|
"num_tokens": 881316976.0,
|
|
"step": 9540
|
|
},
|
|
{
|
|
"entropy": 1.65546875,
|
|
"epoch": 1.3051797184638514,
|
|
"grad_norm": 0.1937356583343251,
|
|
"learning_rate": 1.7912498238692408e-06,
|
|
"loss": 1.6681,
|
|
"mean_token_accuracy": 0.6542088210582733,
|
|
"num_tokens": 882260690.0,
|
|
"step": 9550
|
|
},
|
|
{
|
|
"entropy": 1.559375,
|
|
"epoch": 1.3065463987973214,
|
|
"grad_norm": 0.18050567027146283,
|
|
"learning_rate": 1.7877272086797239e-06,
|
|
"loss": 1.5805,
|
|
"mean_token_accuracy": 0.6680208027362824,
|
|
"num_tokens": 883187237.0,
|
|
"step": 9560
|
|
},
|
|
{
|
|
"entropy": 1.628125,
|
|
"epoch": 1.3079130791307914,
|
|
"grad_norm": 0.2021793627079275,
|
|
"learning_rate": 1.7842045934902072e-06,
|
|
"loss": 1.6423,
|
|
"mean_token_accuracy": 0.6548952400684357,
|
|
"num_tokens": 884127250.0,
|
|
"step": 9570
|
|
},
|
|
{
|
|
"entropy": 1.6015625,
|
|
"epoch": 1.3092797594642613,
|
|
"grad_norm": 0.1864435354166255,
|
|
"learning_rate": 1.7806819783006908e-06,
|
|
"loss": 1.6042,
|
|
"mean_token_accuracy": 0.663700807094574,
|
|
"num_tokens": 885045874.0,
|
|
"step": 9580
|
|
},
|
|
{
|
|
"entropy": 1.5921875,
|
|
"epoch": 1.3106464397977313,
|
|
"grad_norm": 0.18791924840397603,
|
|
"learning_rate": 1.777159363111174e-06,
|
|
"loss": 1.6114,
|
|
"mean_token_accuracy": 0.6613615691661835,
|
|
"num_tokens": 885973980.0,
|
|
"step": 9590
|
|
},
|
|
{
|
|
"entropy": 1.60078125,
|
|
"epoch": 1.3120131201312013,
|
|
"grad_norm": 0.18788512762934273,
|
|
"learning_rate": 1.773636747921657e-06,
|
|
"loss": 1.5767,
|
|
"mean_token_accuracy": 0.6657974481582641,
|
|
"num_tokens": 886851721.0,
|
|
"step": 9600
|
|
},
|
|
{
|
|
"entropy": 1.6390625,
|
|
"epoch": 1.3133798004646713,
|
|
"grad_norm": 0.19384973514916146,
|
|
"learning_rate": 1.7701141327321406e-06,
|
|
"loss": 1.6292,
|
|
"mean_token_accuracy": 0.6599762320518494,
|
|
"num_tokens": 887812531.0,
|
|
"step": 9610
|
|
},
|
|
{
|
|
"entropy": 1.6484375,
|
|
"epoch": 1.3147464807981413,
|
|
"grad_norm": 0.24195345137439506,
|
|
"learning_rate": 1.7665915175426237e-06,
|
|
"loss": 1.6464,
|
|
"mean_token_accuracy": 0.6548418343067169,
|
|
"num_tokens": 888788117.0,
|
|
"step": 9620
|
|
},
|
|
{
|
|
"entropy": 1.56328125,
|
|
"epoch": 1.3161131611316113,
|
|
"grad_norm": 0.18075177037959714,
|
|
"learning_rate": 1.763068902353107e-06,
|
|
"loss": 1.5654,
|
|
"mean_token_accuracy": 0.6668335199356079,
|
|
"num_tokens": 889686268.0,
|
|
"step": 9630
|
|
},
|
|
{
|
|
"entropy": 1.61328125,
|
|
"epoch": 1.3174798414650812,
|
|
"grad_norm": 0.21506444300844293,
|
|
"learning_rate": 1.7595462871635904e-06,
|
|
"loss": 1.634,
|
|
"mean_token_accuracy": 0.6542869091033936,
|
|
"num_tokens": 890574718.0,
|
|
"step": 9640
|
|
},
|
|
{
|
|
"entropy": 1.5734375,
|
|
"epoch": 1.3188465217985512,
|
|
"grad_norm": 0.20259015303737724,
|
|
"learning_rate": 1.7560236719740737e-06,
|
|
"loss": 1.5759,
|
|
"mean_token_accuracy": 0.6660227000713348,
|
|
"num_tokens": 891510427.0,
|
|
"step": 9650
|
|
},
|
|
{
|
|
"entropy": 1.59296875,
|
|
"epoch": 1.3202132021320212,
|
|
"grad_norm": 0.21242101806492836,
|
|
"learning_rate": 1.752501056784557e-06,
|
|
"loss": 1.5911,
|
|
"mean_token_accuracy": 0.6694853067398071,
|
|
"num_tokens": 892466239.0,
|
|
"step": 9660
|
|
},
|
|
{
|
|
"entropy": 1.57734375,
|
|
"epoch": 1.3215798824654912,
|
|
"grad_norm": 0.34374060950421564,
|
|
"learning_rate": 1.7489784415950404e-06,
|
|
"loss": 1.5907,
|
|
"mean_token_accuracy": 0.6631456732749939,
|
|
"num_tokens": 893354742.0,
|
|
"step": 9670
|
|
},
|
|
{
|
|
"entropy": 1.5828125,
|
|
"epoch": 1.3229465627989614,
|
|
"grad_norm": 0.19370929921619007,
|
|
"learning_rate": 1.7454558264055235e-06,
|
|
"loss": 1.5714,
|
|
"mean_token_accuracy": 0.6675727069377899,
|
|
"num_tokens": 894254607.0,
|
|
"step": 9680
|
|
},
|
|
{
|
|
"entropy": 1.575,
|
|
"epoch": 1.3243132431324314,
|
|
"grad_norm": 0.16717936424166208,
|
|
"learning_rate": 1.741933211216007e-06,
|
|
"loss": 1.5984,
|
|
"mean_token_accuracy": 0.6672245144844056,
|
|
"num_tokens": 895151778.0,
|
|
"step": 9690
|
|
},
|
|
{
|
|
"entropy": 1.5890625,
|
|
"epoch": 1.3256799234659014,
|
|
"grad_norm": 0.19400353859374558,
|
|
"learning_rate": 1.7384105960264902e-06,
|
|
"loss": 1.5922,
|
|
"mean_token_accuracy": 0.6666933298110962,
|
|
"num_tokens": 896088320.0,
|
|
"step": 9700
|
|
},
|
|
{
|
|
"entropy": 1.73828125,
|
|
"epoch": 1.3270466037993713,
|
|
"grad_norm": 0.20250802433675363,
|
|
"learning_rate": 1.7348879808369735e-06,
|
|
"loss": 1.7506,
|
|
"mean_token_accuracy": 0.6372614681720734,
|
|
"num_tokens": 897033198.0,
|
|
"step": 9710
|
|
},
|
|
{
|
|
"entropy": 1.590625,
|
|
"epoch": 1.3284132841328413,
|
|
"grad_norm": 0.22037112571739093,
|
|
"learning_rate": 1.7313653656474568e-06,
|
|
"loss": 1.6085,
|
|
"mean_token_accuracy": 0.6617143154144287,
|
|
"num_tokens": 897980676.0,
|
|
"step": 9720
|
|
},
|
|
{
|
|
"entropy": 1.64140625,
|
|
"epoch": 1.3297799644663113,
|
|
"grad_norm": 0.1727533686707606,
|
|
"learning_rate": 1.7278427504579402e-06,
|
|
"loss": 1.6548,
|
|
"mean_token_accuracy": 0.6561595797538757,
|
|
"num_tokens": 898942485.0,
|
|
"step": 9730
|
|
},
|
|
{
|
|
"entropy": 1.65,
|
|
"epoch": 1.3311466447997813,
|
|
"grad_norm": 0.16169107889903023,
|
|
"learning_rate": 1.7243201352684233e-06,
|
|
"loss": 1.6845,
|
|
"mean_token_accuracy": 0.6520072221755981,
|
|
"num_tokens": 899913150.0,
|
|
"step": 9740
|
|
},
|
|
{
|
|
"entropy": 1.66640625,
|
|
"epoch": 1.3325133251332513,
|
|
"grad_norm": 0.1914649272537934,
|
|
"learning_rate": 1.7207975200789068e-06,
|
|
"loss": 1.6754,
|
|
"mean_token_accuracy": 0.6503610670566559,
|
|
"num_tokens": 900834324.0,
|
|
"step": 9750
|
|
},
|
|
{
|
|
"entropy": 1.6171875,
|
|
"epoch": 1.3338800054667213,
|
|
"grad_norm": 0.1802410673764987,
|
|
"learning_rate": 1.71727490488939e-06,
|
|
"loss": 1.6352,
|
|
"mean_token_accuracy": 0.6572534263134002,
|
|
"num_tokens": 901704049.0,
|
|
"step": 9760
|
|
},
|
|
{
|
|
"entropy": 1.59140625,
|
|
"epoch": 1.3352466858001915,
|
|
"grad_norm": 0.2565262486294049,
|
|
"learning_rate": 1.7137522896998735e-06,
|
|
"loss": 1.6283,
|
|
"mean_token_accuracy": 0.6576483428478241,
|
|
"num_tokens": 902634843.0,
|
|
"step": 9770
|
|
},
|
|
{
|
|
"entropy": 1.540625,
|
|
"epoch": 1.3366133661336614,
|
|
"grad_norm": 0.1857199405564427,
|
|
"learning_rate": 1.7102296745103566e-06,
|
|
"loss": 1.5576,
|
|
"mean_token_accuracy": 0.6727153718471527,
|
|
"num_tokens": 903526741.0,
|
|
"step": 9780
|
|
},
|
|
{
|
|
"entropy": 1.615625,
|
|
"epoch": 1.3379800464671314,
|
|
"grad_norm": 0.19240988341333606,
|
|
"learning_rate": 1.7067070593208398e-06,
|
|
"loss": 1.6187,
|
|
"mean_token_accuracy": 0.6613983929157257,
|
|
"num_tokens": 904472577.0,
|
|
"step": 9790
|
|
},
|
|
{
|
|
"entropy": 1.59453125,
|
|
"epoch": 1.3393467268006014,
|
|
"grad_norm": 0.2391437973458412,
|
|
"learning_rate": 1.7031844441313233e-06,
|
|
"loss": 1.6128,
|
|
"mean_token_accuracy": 0.6622181534767151,
|
|
"num_tokens": 905409729.0,
|
|
"step": 9800
|
|
},
|
|
{
|
|
"entropy": 1.58203125,
|
|
"epoch": 1.3407134071340714,
|
|
"grad_norm": 0.1644402806619733,
|
|
"learning_rate": 1.6996618289418064e-06,
|
|
"loss": 1.6039,
|
|
"mean_token_accuracy": 0.6645031034946441,
|
|
"num_tokens": 906347665.0,
|
|
"step": 9810
|
|
},
|
|
{
|
|
"entropy": 1.621875,
|
|
"epoch": 1.3420800874675414,
|
|
"grad_norm": 0.20157949573684858,
|
|
"learning_rate": 1.6961392137522898e-06,
|
|
"loss": 1.6276,
|
|
"mean_token_accuracy": 0.658072555065155,
|
|
"num_tokens": 907218441.0,
|
|
"step": 9820
|
|
},
|
|
{
|
|
"entropy": 1.55,
|
|
"epoch": 1.3434467678010114,
|
|
"grad_norm": 0.5365278742791441,
|
|
"learning_rate": 1.6926165985627733e-06,
|
|
"loss": 1.5498,
|
|
"mean_token_accuracy": 0.6709898471832275,
|
|
"num_tokens": 908096902.0,
|
|
"step": 9830
|
|
},
|
|
{
|
|
"entropy": 1.6421875,
|
|
"epoch": 1.3448134481344813,
|
|
"grad_norm": 0.19484144122298794,
|
|
"learning_rate": 1.6890939833732564e-06,
|
|
"loss": 1.6488,
|
|
"mean_token_accuracy": 0.6530332148075104,
|
|
"num_tokens": 909044921.0,
|
|
"step": 9840
|
|
},
|
|
{
|
|
"entropy": 1.65,
|
|
"epoch": 1.3461801284679513,
|
|
"grad_norm": 0.16022382327425297,
|
|
"learning_rate": 1.6855713681837396e-06,
|
|
"loss": 1.6617,
|
|
"mean_token_accuracy": 0.6544517695903778,
|
|
"num_tokens": 909982394.0,
|
|
"step": 9850
|
|
},
|
|
{
|
|
"entropy": 1.6015625,
|
|
"epoch": 1.3475468088014213,
|
|
"grad_norm": 0.1718358114903201,
|
|
"learning_rate": 1.6820487529942231e-06,
|
|
"loss": 1.5937,
|
|
"mean_token_accuracy": 0.6643797039985657,
|
|
"num_tokens": 910893155.0,
|
|
"step": 9860
|
|
},
|
|
{
|
|
"entropy": 1.57890625,
|
|
"epoch": 1.3489134891348913,
|
|
"grad_norm": 0.26195652784368195,
|
|
"learning_rate": 1.6785261378047062e-06,
|
|
"loss": 1.5776,
|
|
"mean_token_accuracy": 0.6660836696624756,
|
|
"num_tokens": 911811820.0,
|
|
"step": 9870
|
|
},
|
|
{
|
|
"entropy": 1.59453125,
|
|
"epoch": 1.3502801694683613,
|
|
"grad_norm": 0.22026116614426408,
|
|
"learning_rate": 1.6750035226151898e-06,
|
|
"loss": 1.5938,
|
|
"mean_token_accuracy": 0.665427953004837,
|
|
"num_tokens": 912692019.0,
|
|
"step": 9880
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 1.3516468498018313,
|
|
"grad_norm": 0.19069105268709652,
|
|
"learning_rate": 1.671480907425673e-06,
|
|
"loss": 1.6304,
|
|
"mean_token_accuracy": 0.6591661393642425,
|
|
"num_tokens": 913632537.0,
|
|
"step": 9890
|
|
},
|
|
{
|
|
"entropy": 1.596875,
|
|
"epoch": 1.3530135301353012,
|
|
"grad_norm": 0.22265218832794,
|
|
"learning_rate": 1.6679582922361562e-06,
|
|
"loss": 1.6044,
|
|
"mean_token_accuracy": 0.6634859502315521,
|
|
"num_tokens": 914538236.0,
|
|
"step": 9900
|
|
},
|
|
{
|
|
"entropy": 1.61953125,
|
|
"epoch": 1.3543802104687712,
|
|
"grad_norm": 0.26454953113569807,
|
|
"learning_rate": 1.6644356770466396e-06,
|
|
"loss": 1.6196,
|
|
"mean_token_accuracy": 0.6581050634384156,
|
|
"num_tokens": 915473872.0,
|
|
"step": 9910
|
|
},
|
|
{
|
|
"entropy": 1.61875,
|
|
"epoch": 1.3557468908022414,
|
|
"grad_norm": 0.2182413201220366,
|
|
"learning_rate": 1.660913061857123e-06,
|
|
"loss": 1.6383,
|
|
"mean_token_accuracy": 0.6573639333248138,
|
|
"num_tokens": 916408023.0,
|
|
"step": 9920
|
|
},
|
|
{
|
|
"entropy": 1.55390625,
|
|
"epoch": 1.3571135711357114,
|
|
"grad_norm": 0.23188083841288523,
|
|
"learning_rate": 1.657390446667606e-06,
|
|
"loss": 1.5391,
|
|
"mean_token_accuracy": 0.6755678057670593,
|
|
"num_tokens": 917332863.0,
|
|
"step": 9930
|
|
},
|
|
{
|
|
"entropy": 1.59296875,
|
|
"epoch": 1.3584802514691814,
|
|
"grad_norm": 0.21390126831749504,
|
|
"learning_rate": 1.6538678314780896e-06,
|
|
"loss": 1.5998,
|
|
"mean_token_accuracy": 0.6645826995372772,
|
|
"num_tokens": 918240543.0,
|
|
"step": 9940
|
|
},
|
|
{
|
|
"entropy": 1.68828125,
|
|
"epoch": 1.3598469318026514,
|
|
"grad_norm": 0.1714757666886751,
|
|
"learning_rate": 1.6503452162885727e-06,
|
|
"loss": 1.7067,
|
|
"mean_token_accuracy": 0.6445335209369659,
|
|
"num_tokens": 919135588.0,
|
|
"step": 9950
|
|
},
|
|
{
|
|
"entropy": 1.603125,
|
|
"epoch": 1.3612136121361214,
|
|
"grad_norm": 0.16511120021315484,
|
|
"learning_rate": 1.646822601099056e-06,
|
|
"loss": 1.5977,
|
|
"mean_token_accuracy": 0.6635120570659637,
|
|
"num_tokens": 920057476.0,
|
|
"step": 9960
|
|
},
|
|
{
|
|
"entropy": 1.58046875,
|
|
"epoch": 1.3625802924695913,
|
|
"grad_norm": 0.2010009387481495,
|
|
"learning_rate": 1.6432999859095394e-06,
|
|
"loss": 1.6051,
|
|
"mean_token_accuracy": 0.6634848296642304,
|
|
"num_tokens": 920984492.0,
|
|
"step": 9970
|
|
},
|
|
{
|
|
"entropy": 1.6859375,
|
|
"epoch": 1.3639469728030613,
|
|
"grad_norm": 0.18794710303281453,
|
|
"learning_rate": 1.6397773707200227e-06,
|
|
"loss": 1.687,
|
|
"mean_token_accuracy": 0.648221218585968,
|
|
"num_tokens": 921912195.0,
|
|
"step": 9980
|
|
},
|
|
{
|
|
"entropy": 1.6109375,
|
|
"epoch": 1.3653136531365313,
|
|
"grad_norm": 0.22158542913850443,
|
|
"learning_rate": 1.636254755530506e-06,
|
|
"loss": 1.6136,
|
|
"mean_token_accuracy": 0.6602677166461944,
|
|
"num_tokens": 922795469.0,
|
|
"step": 9990
|
|
},
|
|
{
|
|
"entropy": 1.50625,
|
|
"epoch": 1.3666803334700013,
|
|
"grad_norm": 0.2437850268850487,
|
|
"learning_rate": 1.6327321403409894e-06,
|
|
"loss": 1.5098,
|
|
"mean_token_accuracy": 0.6773176968097687,
|
|
"num_tokens": 923706385.0,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"entropy": 1.62578125,
|
|
"epoch": 1.3680470138034715,
|
|
"grad_norm": 0.24398615370184462,
|
|
"learning_rate": 1.6292095251514725e-06,
|
|
"loss": 1.6352,
|
|
"mean_token_accuracy": 0.6590065956115723,
|
|
"num_tokens": 924619252.0,
|
|
"step": 10010
|
|
},
|
|
{
|
|
"entropy": 1.653125,
|
|
"epoch": 1.3694136941369415,
|
|
"grad_norm": 0.2706639479990343,
|
|
"learning_rate": 1.625686909961956e-06,
|
|
"loss": 1.677,
|
|
"mean_token_accuracy": 0.6488702476024628,
|
|
"num_tokens": 925550766.0,
|
|
"step": 10020
|
|
},
|
|
{
|
|
"entropy": 1.61171875,
|
|
"epoch": 1.3707803744704115,
|
|
"grad_norm": 0.1818673725765093,
|
|
"learning_rate": 1.6221642947724392e-06,
|
|
"loss": 1.6162,
|
|
"mean_token_accuracy": 0.6610843539237976,
|
|
"num_tokens": 926431149.0,
|
|
"step": 10030
|
|
},
|
|
{
|
|
"entropy": 1.6828125,
|
|
"epoch": 1.3721470548038814,
|
|
"grad_norm": 0.27573238340985584,
|
|
"learning_rate": 1.6186416795829223e-06,
|
|
"loss": 1.6759,
|
|
"mean_token_accuracy": 0.6497121810913086,
|
|
"num_tokens": 927369269.0,
|
|
"step": 10040
|
|
},
|
|
{
|
|
"entropy": 1.69375,
|
|
"epoch": 1.3735137351373514,
|
|
"grad_norm": 0.21110532931951942,
|
|
"learning_rate": 1.6151190643934058e-06,
|
|
"loss": 1.706,
|
|
"mean_token_accuracy": 0.6472294926643372,
|
|
"num_tokens": 928305796.0,
|
|
"step": 10050
|
|
},
|
|
{
|
|
"entropy": 1.5984375,
|
|
"epoch": 1.3748804154708214,
|
|
"grad_norm": 0.23132391082934498,
|
|
"learning_rate": 1.6115964492038892e-06,
|
|
"loss": 1.5888,
|
|
"mean_token_accuracy": 0.6674111843109131,
|
|
"num_tokens": 929220830.0,
|
|
"step": 10060
|
|
},
|
|
{
|
|
"entropy": 1.59453125,
|
|
"epoch": 1.3762470958042914,
|
|
"grad_norm": 0.1726864294051105,
|
|
"learning_rate": 1.6080738340143723e-06,
|
|
"loss": 1.608,
|
|
"mean_token_accuracy": 0.6603127717971802,
|
|
"num_tokens": 930162536.0,
|
|
"step": 10070
|
|
},
|
|
{
|
|
"entropy": 1.53828125,
|
|
"epoch": 1.3776137761377614,
|
|
"grad_norm": 0.17512131408682174,
|
|
"learning_rate": 1.6045512188248559e-06,
|
|
"loss": 1.5447,
|
|
"mean_token_accuracy": 0.6719793498516082,
|
|
"num_tokens": 931038570.0,
|
|
"step": 10080
|
|
},
|
|
{
|
|
"entropy": 1.590625,
|
|
"epoch": 1.3789804564712314,
|
|
"grad_norm": 0.18323676057111032,
|
|
"learning_rate": 1.601028603635339e-06,
|
|
"loss": 1.5968,
|
|
"mean_token_accuracy": 0.6634364128112793,
|
|
"num_tokens": 931936079.0,
|
|
"step": 10090
|
|
},
|
|
{
|
|
"entropy": 1.64609375,
|
|
"epoch": 1.3803471368047013,
|
|
"grad_norm": 0.19407635167214896,
|
|
"learning_rate": 1.5975059884458225e-06,
|
|
"loss": 1.6722,
|
|
"mean_token_accuracy": 0.651249623298645,
|
|
"num_tokens": 932901304.0,
|
|
"step": 10100
|
|
},
|
|
{
|
|
"entropy": 1.5953125,
|
|
"epoch": 1.3817138171381713,
|
|
"grad_norm": 0.26453879102830047,
|
|
"learning_rate": 1.5939833732563056e-06,
|
|
"loss": 1.6124,
|
|
"mean_token_accuracy": 0.6618454575538635,
|
|
"num_tokens": 933829688.0,
|
|
"step": 10110
|
|
},
|
|
{
|
|
"entropy": 1.62734375,
|
|
"epoch": 1.3830804974716413,
|
|
"grad_norm": 0.25617717269354,
|
|
"learning_rate": 1.5904607580667888e-06,
|
|
"loss": 1.6274,
|
|
"mean_token_accuracy": 0.6585960149765014,
|
|
"num_tokens": 934778529.0,
|
|
"step": 10120
|
|
},
|
|
{
|
|
"entropy": 1.6078125,
|
|
"epoch": 1.3844471778051113,
|
|
"grad_norm": 0.2142837779298258,
|
|
"learning_rate": 1.5869381428772723e-06,
|
|
"loss": 1.63,
|
|
"mean_token_accuracy": 0.6582683742046356,
|
|
"num_tokens": 935692792.0,
|
|
"step": 10130
|
|
},
|
|
{
|
|
"entropy": 1.64453125,
|
|
"epoch": 1.3858138581385813,
|
|
"grad_norm": 0.1713041596023399,
|
|
"learning_rate": 1.5834155276877554e-06,
|
|
"loss": 1.639,
|
|
"mean_token_accuracy": 0.6573312163352967,
|
|
"num_tokens": 936595972.0,
|
|
"step": 10140
|
|
},
|
|
{
|
|
"entropy": 1.640625,
|
|
"epoch": 1.3871805384720512,
|
|
"grad_norm": 0.2447119533077377,
|
|
"learning_rate": 1.5798929124982388e-06,
|
|
"loss": 1.6276,
|
|
"mean_token_accuracy": 0.6565417766571044,
|
|
"num_tokens": 937480232.0,
|
|
"step": 10150
|
|
},
|
|
{
|
|
"entropy": 1.62578125,
|
|
"epoch": 1.3885472188055215,
|
|
"grad_norm": 0.24193082726083476,
|
|
"learning_rate": 1.5763702973087221e-06,
|
|
"loss": 1.6445,
|
|
"mean_token_accuracy": 0.655534315109253,
|
|
"num_tokens": 938375388.0,
|
|
"step": 10160
|
|
},
|
|
{
|
|
"entropy": 1.56484375,
|
|
"epoch": 1.3899138991389914,
|
|
"grad_norm": 0.42007854896229085,
|
|
"learning_rate": 1.5728476821192054e-06,
|
|
"loss": 1.5695,
|
|
"mean_token_accuracy": 0.6703463733196259,
|
|
"num_tokens": 939288290.0,
|
|
"step": 10170
|
|
},
|
|
{
|
|
"entropy": 1.6828125,
|
|
"epoch": 1.3912805794724614,
|
|
"grad_norm": 0.17025178577969535,
|
|
"learning_rate": 1.5693250669296886e-06,
|
|
"loss": 1.6953,
|
|
"mean_token_accuracy": 0.6500791370868683,
|
|
"num_tokens": 940240224.0,
|
|
"step": 10180
|
|
},
|
|
{
|
|
"entropy": 1.56015625,
|
|
"epoch": 1.3926472598059314,
|
|
"grad_norm": 0.2103350524356204,
|
|
"learning_rate": 1.5658024517401721e-06,
|
|
"loss": 1.5579,
|
|
"mean_token_accuracy": 0.6694684088230133,
|
|
"num_tokens": 941160848.0,
|
|
"step": 10190
|
|
},
|
|
{
|
|
"entropy": 1.56796875,
|
|
"epoch": 1.3940139401394014,
|
|
"grad_norm": 0.1854712330972291,
|
|
"learning_rate": 1.5622798365506552e-06,
|
|
"loss": 1.5791,
|
|
"mean_token_accuracy": 0.6672878265380859,
|
|
"num_tokens": 942103258.0,
|
|
"step": 10200
|
|
},
|
|
{
|
|
"entropy": 1.60703125,
|
|
"epoch": 1.3953806204728714,
|
|
"grad_norm": 0.23693742616813976,
|
|
"learning_rate": 1.5587572213611388e-06,
|
|
"loss": 1.5998,
|
|
"mean_token_accuracy": 0.665112966299057,
|
|
"num_tokens": 943025001.0,
|
|
"step": 10210
|
|
},
|
|
{
|
|
"entropy": 1.6265625,
|
|
"epoch": 1.3967473008063414,
|
|
"grad_norm": 0.17378027221581738,
|
|
"learning_rate": 1.555234606171622e-06,
|
|
"loss": 1.623,
|
|
"mean_token_accuracy": 0.6608273148536682,
|
|
"num_tokens": 943981206.0,
|
|
"step": 10220
|
|
},
|
|
{
|
|
"entropy": 1.665625,
|
|
"epoch": 1.3981139811398113,
|
|
"grad_norm": 0.17239139945176177,
|
|
"learning_rate": 1.5517119909821052e-06,
|
|
"loss": 1.6762,
|
|
"mean_token_accuracy": 0.6529516041278839,
|
|
"num_tokens": 944931856.0,
|
|
"step": 10230
|
|
},
|
|
{
|
|
"entropy": 1.59765625,
|
|
"epoch": 1.3994806614732813,
|
|
"grad_norm": 0.2116884584198221,
|
|
"learning_rate": 1.5481893757925886e-06,
|
|
"loss": 1.5999,
|
|
"mean_token_accuracy": 0.6625640094280243,
|
|
"num_tokens": 945880931.0,
|
|
"step": 10240
|
|
},
|
|
{
|
|
"entropy": 1.6421875,
|
|
"epoch": 1.4008473418067515,
|
|
"grad_norm": 0.18529927979631466,
|
|
"learning_rate": 1.544666760603072e-06,
|
|
"loss": 1.6517,
|
|
"mean_token_accuracy": 0.6528003334999084,
|
|
"num_tokens": 946760955.0,
|
|
"step": 10250
|
|
},
|
|
{
|
|
"entropy": 1.640625,
|
|
"epoch": 1.4022140221402215,
|
|
"grad_norm": 0.17416680357295497,
|
|
"learning_rate": 1.541144145413555e-06,
|
|
"loss": 1.6695,
|
|
"mean_token_accuracy": 0.650529146194458,
|
|
"num_tokens": 947708812.0,
|
|
"step": 10260
|
|
},
|
|
{
|
|
"entropy": 1.66015625,
|
|
"epoch": 1.4035807024736915,
|
|
"grad_norm": 0.2178412147235267,
|
|
"learning_rate": 1.5376215302240386e-06,
|
|
"loss": 1.6686,
|
|
"mean_token_accuracy": 0.6538994073867798,
|
|
"num_tokens": 948701638.0,
|
|
"step": 10270
|
|
},
|
|
{
|
|
"entropy": 1.61015625,
|
|
"epoch": 1.4049473828071615,
|
|
"grad_norm": 0.21542694845354773,
|
|
"learning_rate": 1.5340989150345217e-06,
|
|
"loss": 1.638,
|
|
"mean_token_accuracy": 0.6571428000926971,
|
|
"num_tokens": 949631306.0,
|
|
"step": 10280
|
|
},
|
|
{
|
|
"entropy": 1.51484375,
|
|
"epoch": 1.4063140631406315,
|
|
"grad_norm": 0.1960101488483535,
|
|
"learning_rate": 1.5305762998450048e-06,
|
|
"loss": 1.5346,
|
|
"mean_token_accuracy": 0.6752156317234039,
|
|
"num_tokens": 950542676.0,
|
|
"step": 10290
|
|
},
|
|
{
|
|
"entropy": 1.71640625,
|
|
"epoch": 1.4076807434741014,
|
|
"grad_norm": 0.19363604824650846,
|
|
"learning_rate": 1.5270536846554884e-06,
|
|
"loss": 1.7276,
|
|
"mean_token_accuracy": 0.6433395206928253,
|
|
"num_tokens": 951462136.0,
|
|
"step": 10300
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 1.4090474238075714,
|
|
"grad_norm": 0.22190879058421842,
|
|
"learning_rate": 1.5235310694659717e-06,
|
|
"loss": 1.635,
|
|
"mean_token_accuracy": 0.6558987379074097,
|
|
"num_tokens": 952402897.0,
|
|
"step": 10310
|
|
},
|
|
{
|
|
"entropy": 1.5625,
|
|
"epoch": 1.4104141041410414,
|
|
"grad_norm": 0.28057192651947377,
|
|
"learning_rate": 1.5200084542764548e-06,
|
|
"loss": 1.5531,
|
|
"mean_token_accuracy": 0.6701358377933502,
|
|
"num_tokens": 953343741.0,
|
|
"step": 10320
|
|
},
|
|
{
|
|
"entropy": 1.60390625,
|
|
"epoch": 1.4117807844745114,
|
|
"grad_norm": 0.22475411820028462,
|
|
"learning_rate": 1.5164858390869384e-06,
|
|
"loss": 1.6156,
|
|
"mean_token_accuracy": 0.658899313211441,
|
|
"num_tokens": 954265246.0,
|
|
"step": 10330
|
|
},
|
|
{
|
|
"entropy": 1.68515625,
|
|
"epoch": 1.4131474648079814,
|
|
"grad_norm": 0.24838743534692106,
|
|
"learning_rate": 1.5129632238974215e-06,
|
|
"loss": 1.6961,
|
|
"mean_token_accuracy": 0.6494625866413116,
|
|
"num_tokens": 955158695.0,
|
|
"step": 10340
|
|
},
|
|
{
|
|
"entropy": 1.61328125,
|
|
"epoch": 1.4145141451414514,
|
|
"grad_norm": 0.2155655067762768,
|
|
"learning_rate": 1.509440608707905e-06,
|
|
"loss": 1.6129,
|
|
"mean_token_accuracy": 0.6607892096042634,
|
|
"num_tokens": 956111767.0,
|
|
"step": 10350
|
|
},
|
|
{
|
|
"entropy": 1.6484375,
|
|
"epoch": 1.4158808254749213,
|
|
"grad_norm": 0.21031612811973188,
|
|
"learning_rate": 1.5059179935183882e-06,
|
|
"loss": 1.654,
|
|
"mean_token_accuracy": 0.6528097987174988,
|
|
"num_tokens": 957026357.0,
|
|
"step": 10360
|
|
},
|
|
{
|
|
"entropy": 1.57109375,
|
|
"epoch": 1.4172475058083913,
|
|
"grad_norm": 0.20155296431656425,
|
|
"learning_rate": 1.5023953783288713e-06,
|
|
"loss": 1.5845,
|
|
"mean_token_accuracy": 0.6668376803398133,
|
|
"num_tokens": 957964171.0,
|
|
"step": 10370
|
|
},
|
|
{
|
|
"entropy": 1.61484375,
|
|
"epoch": 1.4186141861418613,
|
|
"grad_norm": 0.28039607459494825,
|
|
"learning_rate": 1.4988727631393549e-06,
|
|
"loss": 1.6179,
|
|
"mean_token_accuracy": 0.6587039589881897,
|
|
"num_tokens": 958875748.0,
|
|
"step": 10380
|
|
},
|
|
{
|
|
"entropy": 1.6671875,
|
|
"epoch": 1.4199808664753313,
|
|
"grad_norm": 0.16830985772644558,
|
|
"learning_rate": 1.495350147949838e-06,
|
|
"loss": 1.682,
|
|
"mean_token_accuracy": 0.6527386367321014,
|
|
"num_tokens": 959836527.0,
|
|
"step": 10390
|
|
},
|
|
{
|
|
"entropy": 1.565625,
|
|
"epoch": 1.4213475468088015,
|
|
"grad_norm": 0.19787888667905534,
|
|
"learning_rate": 1.4918275327603213e-06,
|
|
"loss": 1.5783,
|
|
"mean_token_accuracy": 0.669136780500412,
|
|
"num_tokens": 960724673.0,
|
|
"step": 10400
|
|
},
|
|
{
|
|
"entropy": 1.5734375,
|
|
"epoch": 1.4227142271422715,
|
|
"grad_norm": 0.19360332305531686,
|
|
"learning_rate": 1.4883049175708047e-06,
|
|
"loss": 1.5955,
|
|
"mean_token_accuracy": 0.665308701992035,
|
|
"num_tokens": 961597273.0,
|
|
"step": 10410
|
|
},
|
|
{
|
|
"entropy": 1.63203125,
|
|
"epoch": 1.4240809074757415,
|
|
"grad_norm": 0.41260968315814944,
|
|
"learning_rate": 1.484782302381288e-06,
|
|
"loss": 1.6143,
|
|
"mean_token_accuracy": 0.659363204240799,
|
|
"num_tokens": 962519485.0,
|
|
"step": 10420
|
|
},
|
|
{
|
|
"entropy": 1.628125,
|
|
"epoch": 1.4254475878092114,
|
|
"grad_norm": 0.24709079429762412,
|
|
"learning_rate": 1.4812596871917711e-06,
|
|
"loss": 1.6367,
|
|
"mean_token_accuracy": 0.6592309892177581,
|
|
"num_tokens": 963423438.0,
|
|
"step": 10430
|
|
},
|
|
{
|
|
"entropy": 1.57421875,
|
|
"epoch": 1.4268142681426814,
|
|
"grad_norm": 0.17664010672671862,
|
|
"learning_rate": 1.4777370720022547e-06,
|
|
"loss": 1.58,
|
|
"mean_token_accuracy": 0.6659643113613128,
|
|
"num_tokens": 964341941.0,
|
|
"step": 10440
|
|
},
|
|
{
|
|
"entropy": 1.6015625,
|
|
"epoch": 1.4281809484761514,
|
|
"grad_norm": 0.21658566823263792,
|
|
"learning_rate": 1.4742144568127378e-06,
|
|
"loss": 1.6075,
|
|
"mean_token_accuracy": 0.6604258000850678,
|
|
"num_tokens": 965292129.0,
|
|
"step": 10450
|
|
},
|
|
{
|
|
"entropy": 1.54453125,
|
|
"epoch": 1.4295476288096214,
|
|
"grad_norm": 0.22252179039811448,
|
|
"learning_rate": 1.4706918416232213e-06,
|
|
"loss": 1.5517,
|
|
"mean_token_accuracy": 0.6726219773292541,
|
|
"num_tokens": 966213489.0,
|
|
"step": 10460
|
|
},
|
|
{
|
|
"entropy": 1.66640625,
|
|
"epoch": 1.4309143091430914,
|
|
"grad_norm": 0.21407103445868608,
|
|
"learning_rate": 1.4671692264337045e-06,
|
|
"loss": 1.6693,
|
|
"mean_token_accuracy": 0.6544330239295959,
|
|
"num_tokens": 967164213.0,
|
|
"step": 10470
|
|
},
|
|
{
|
|
"entropy": 1.6265625,
|
|
"epoch": 1.4322809894765614,
|
|
"grad_norm": 0.15952591723504933,
|
|
"learning_rate": 1.4636466112441878e-06,
|
|
"loss": 1.6387,
|
|
"mean_token_accuracy": 0.6548370659351349,
|
|
"num_tokens": 968113245.0,
|
|
"step": 10480
|
|
},
|
|
{
|
|
"entropy": 1.60078125,
|
|
"epoch": 1.4336476698100316,
|
|
"grad_norm": 0.21089062440103587,
|
|
"learning_rate": 1.4601239960546711e-06,
|
|
"loss": 1.6029,
|
|
"mean_token_accuracy": 0.6624026000499725,
|
|
"num_tokens": 969035531.0,
|
|
"step": 10490
|
|
},
|
|
{
|
|
"entropy": 1.6859375,
|
|
"epoch": 1.4350143501435015,
|
|
"grad_norm": 0.19555149972164163,
|
|
"learning_rate": 1.4566013808651545e-06,
|
|
"loss": 1.6802,
|
|
"mean_token_accuracy": 0.6508715212345123,
|
|
"num_tokens": 970003824.0,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"entropy": 1.56640625,
|
|
"epoch": 1.4363810304769715,
|
|
"grad_norm": 0.19662920413722326,
|
|
"learning_rate": 1.4530787656756376e-06,
|
|
"loss": 1.58,
|
|
"mean_token_accuracy": 0.6672145783901214,
|
|
"num_tokens": 970964060.0,
|
|
"step": 10510
|
|
},
|
|
{
|
|
"entropy": 1.65703125,
|
|
"epoch": 1.4377477108104415,
|
|
"grad_norm": 0.2058642385954336,
|
|
"learning_rate": 1.4495561504861211e-06,
|
|
"loss": 1.653,
|
|
"mean_token_accuracy": 0.654794180393219,
|
|
"num_tokens": 971937779.0,
|
|
"step": 10520
|
|
},
|
|
{
|
|
"entropy": 1.58671875,
|
|
"epoch": 1.4391143911439115,
|
|
"grad_norm": 0.20270105030302335,
|
|
"learning_rate": 1.4460335352966043e-06,
|
|
"loss": 1.5893,
|
|
"mean_token_accuracy": 0.6660375654697418,
|
|
"num_tokens": 972855578.0,
|
|
"step": 10530
|
|
},
|
|
{
|
|
"entropy": 1.70234375,
|
|
"epoch": 1.4404810714773815,
|
|
"grad_norm": 0.1947356110183493,
|
|
"learning_rate": 1.4425109201070876e-06,
|
|
"loss": 1.7034,
|
|
"mean_token_accuracy": 0.6455981016159058,
|
|
"num_tokens": 973862324.0,
|
|
"step": 10540
|
|
},
|
|
{
|
|
"entropy": 1.59765625,
|
|
"epoch": 1.4418477518108515,
|
|
"grad_norm": 0.17572195664064455,
|
|
"learning_rate": 1.438988304917571e-06,
|
|
"loss": 1.6081,
|
|
"mean_token_accuracy": 0.6601494073867797,
|
|
"num_tokens": 974806875.0,
|
|
"step": 10550
|
|
},
|
|
{
|
|
"entropy": 1.6125,
|
|
"epoch": 1.4432144321443214,
|
|
"grad_norm": 0.23172294160020798,
|
|
"learning_rate": 1.4354656897280543e-06,
|
|
"loss": 1.62,
|
|
"mean_token_accuracy": 0.659467351436615,
|
|
"num_tokens": 975738884.0,
|
|
"step": 10560
|
|
},
|
|
{
|
|
"entropy": 1.61171875,
|
|
"epoch": 1.4445811124777914,
|
|
"grad_norm": 0.20898674917270807,
|
|
"learning_rate": 1.4319430745385376e-06,
|
|
"loss": 1.6122,
|
|
"mean_token_accuracy": 0.6604095995426178,
|
|
"num_tokens": 976654580.0,
|
|
"step": 10570
|
|
},
|
|
{
|
|
"entropy": 1.60234375,
|
|
"epoch": 1.4459477928112614,
|
|
"grad_norm": 0.2106082757652883,
|
|
"learning_rate": 1.428420459349021e-06,
|
|
"loss": 1.6127,
|
|
"mean_token_accuracy": 0.6625046908855439,
|
|
"num_tokens": 977575232.0,
|
|
"step": 10580
|
|
},
|
|
{
|
|
"entropy": 1.52578125,
|
|
"epoch": 1.4473144731447314,
|
|
"grad_norm": 0.20603412515860653,
|
|
"learning_rate": 1.424897844159504e-06,
|
|
"loss": 1.5461,
|
|
"mean_token_accuracy": 0.6724742710590362,
|
|
"num_tokens": 978500364.0,
|
|
"step": 10590
|
|
},
|
|
{
|
|
"entropy": 1.578125,
|
|
"epoch": 1.4486811534782014,
|
|
"grad_norm": 0.17231138354298553,
|
|
"learning_rate": 1.4213752289699876e-06,
|
|
"loss": 1.5781,
|
|
"mean_token_accuracy": 0.6661147534847259,
|
|
"num_tokens": 979405875.0,
|
|
"step": 10600
|
|
},
|
|
{
|
|
"entropy": 1.521875,
|
|
"epoch": 1.4500478338116713,
|
|
"grad_norm": 0.1819334836961154,
|
|
"learning_rate": 1.4178526137804707e-06,
|
|
"loss": 1.5261,
|
|
"mean_token_accuracy": 0.6754115521907806,
|
|
"num_tokens": 980315706.0,
|
|
"step": 10610
|
|
},
|
|
{
|
|
"entropy": 1.5578125,
|
|
"epoch": 1.4514145141451413,
|
|
"grad_norm": 0.17544050761577268,
|
|
"learning_rate": 1.4143299985909538e-06,
|
|
"loss": 1.548,
|
|
"mean_token_accuracy": 0.6724161267280578,
|
|
"num_tokens": 981218026.0,
|
|
"step": 10620
|
|
},
|
|
{
|
|
"entropy": 1.5625,
|
|
"epoch": 1.4527811944786113,
|
|
"grad_norm": 0.17212088352950922,
|
|
"learning_rate": 1.4108073834014374e-06,
|
|
"loss": 1.5508,
|
|
"mean_token_accuracy": 0.6709322571754456,
|
|
"num_tokens": 982174404.0,
|
|
"step": 10630
|
|
},
|
|
{
|
|
"entropy": 1.5484375,
|
|
"epoch": 1.4541478748120815,
|
|
"grad_norm": 0.1720673767998382,
|
|
"learning_rate": 1.4072847682119205e-06,
|
|
"loss": 1.5581,
|
|
"mean_token_accuracy": 0.669014823436737,
|
|
"num_tokens": 983122266.0,
|
|
"step": 10640
|
|
},
|
|
{
|
|
"entropy": 1.6421875,
|
|
"epoch": 1.4555145551455515,
|
|
"grad_norm": 0.167506545796236,
|
|
"learning_rate": 1.4037621530224039e-06,
|
|
"loss": 1.6471,
|
|
"mean_token_accuracy": 0.657724130153656,
|
|
"num_tokens": 984102866.0,
|
|
"step": 10650
|
|
},
|
|
{
|
|
"entropy": 1.5890625,
|
|
"epoch": 1.4568812354790215,
|
|
"grad_norm": 0.338774154958823,
|
|
"learning_rate": 1.4002395378328872e-06,
|
|
"loss": 1.5978,
|
|
"mean_token_accuracy": 0.6620022177696228,
|
|
"num_tokens": 985061527.0,
|
|
"step": 10660
|
|
},
|
|
{
|
|
"entropy": 1.55390625,
|
|
"epoch": 1.4582479158124915,
|
|
"grad_norm": 0.21415264291311553,
|
|
"learning_rate": 1.3967169226433705e-06,
|
|
"loss": 1.5504,
|
|
"mean_token_accuracy": 0.672389167547226,
|
|
"num_tokens": 985985727.0,
|
|
"step": 10670
|
|
},
|
|
{
|
|
"entropy": 1.62421875,
|
|
"epoch": 1.4596145961459615,
|
|
"grad_norm": 0.2099649416280029,
|
|
"learning_rate": 1.393194307453854e-06,
|
|
"loss": 1.6388,
|
|
"mean_token_accuracy": 0.6572908759117126,
|
|
"num_tokens": 986926809.0,
|
|
"step": 10680
|
|
},
|
|
{
|
|
"entropy": 1.64140625,
|
|
"epoch": 1.4609812764794314,
|
|
"grad_norm": 0.1557295107145437,
|
|
"learning_rate": 1.3896716922643372e-06,
|
|
"loss": 1.6366,
|
|
"mean_token_accuracy": 0.6561380803585053,
|
|
"num_tokens": 987829586.0,
|
|
"step": 10690
|
|
},
|
|
{
|
|
"entropy": 1.58046875,
|
|
"epoch": 1.4623479568129014,
|
|
"grad_norm": 0.26361348050693456,
|
|
"learning_rate": 1.3861490770748203e-06,
|
|
"loss": 1.581,
|
|
"mean_token_accuracy": 0.6670800387859345,
|
|
"num_tokens": 988775214.0,
|
|
"step": 10700
|
|
},
|
|
{
|
|
"entropy": 1.6078125,
|
|
"epoch": 1.4637146371463714,
|
|
"grad_norm": 0.20923099817387691,
|
|
"learning_rate": 1.3826264618853039e-06,
|
|
"loss": 1.6151,
|
|
"mean_token_accuracy": 0.6579020738601684,
|
|
"num_tokens": 989650395.0,
|
|
"step": 10710
|
|
},
|
|
{
|
|
"entropy": 1.65,
|
|
"epoch": 1.4650813174798414,
|
|
"grad_norm": 0.16153939750613416,
|
|
"learning_rate": 1.379103846695787e-06,
|
|
"loss": 1.6631,
|
|
"mean_token_accuracy": 0.6538627862930297,
|
|
"num_tokens": 990641888.0,
|
|
"step": 10720
|
|
},
|
|
{
|
|
"entropy": 1.61015625,
|
|
"epoch": 1.4664479978133116,
|
|
"grad_norm": 0.21351101835879713,
|
|
"learning_rate": 1.3755812315062703e-06,
|
|
"loss": 1.6309,
|
|
"mean_token_accuracy": 0.6593694448471069,
|
|
"num_tokens": 991559417.0,
|
|
"step": 10730
|
|
},
|
|
{
|
|
"entropy": 1.575,
|
|
"epoch": 1.4678146781467816,
|
|
"grad_norm": 0.2224415475287536,
|
|
"learning_rate": 1.3720586163167537e-06,
|
|
"loss": 1.6017,
|
|
"mean_token_accuracy": 0.6614203035831452,
|
|
"num_tokens": 992481840.0,
|
|
"step": 10740
|
|
},
|
|
{
|
|
"entropy": 1.58671875,
|
|
"epoch": 1.4691813584802516,
|
|
"grad_norm": 0.21829040258645005,
|
|
"learning_rate": 1.368536001127237e-06,
|
|
"loss": 1.5991,
|
|
"mean_token_accuracy": 0.6634079575538635,
|
|
"num_tokens": 993380320.0,
|
|
"step": 10750
|
|
},
|
|
{
|
|
"entropy": 1.59375,
|
|
"epoch": 1.4705480388137215,
|
|
"grad_norm": 0.20507995206548468,
|
|
"learning_rate": 1.3650133859377201e-06,
|
|
"loss": 1.5894,
|
|
"mean_token_accuracy": 0.6621441960334777,
|
|
"num_tokens": 994290871.0,
|
|
"step": 10760
|
|
},
|
|
{
|
|
"entropy": 1.61875,
|
|
"epoch": 1.4719147191471915,
|
|
"grad_norm": 0.1828189559817601,
|
|
"learning_rate": 1.3614907707482037e-06,
|
|
"loss": 1.6123,
|
|
"mean_token_accuracy": 0.6618849515914917,
|
|
"num_tokens": 995192959.0,
|
|
"step": 10770
|
|
},
|
|
{
|
|
"entropy": 1.7125,
|
|
"epoch": 1.4732813994806615,
|
|
"grad_norm": 0.17192731390309907,
|
|
"learning_rate": 1.3579681555586868e-06,
|
|
"loss": 1.7059,
|
|
"mean_token_accuracy": 0.6480711162090301,
|
|
"num_tokens": 996146287.0,
|
|
"step": 10780
|
|
},
|
|
{
|
|
"entropy": 1.62265625,
|
|
"epoch": 1.4746480798141315,
|
|
"grad_norm": 0.2677614615206252,
|
|
"learning_rate": 1.3544455403691703e-06,
|
|
"loss": 1.631,
|
|
"mean_token_accuracy": 0.6581042826175689,
|
|
"num_tokens": 997097345.0,
|
|
"step": 10790
|
|
},
|
|
{
|
|
"entropy": 1.5734375,
|
|
"epoch": 1.4760147601476015,
|
|
"grad_norm": 0.2046570374991536,
|
|
"learning_rate": 1.3509229251796535e-06,
|
|
"loss": 1.5566,
|
|
"mean_token_accuracy": 0.6696434378623962,
|
|
"num_tokens": 997982347.0,
|
|
"step": 10800
|
|
},
|
|
{
|
|
"entropy": 1.64140625,
|
|
"epoch": 1.4773814404810715,
|
|
"grad_norm": 0.19246848025573907,
|
|
"learning_rate": 1.3474003099901368e-06,
|
|
"loss": 1.6454,
|
|
"mean_token_accuracy": 0.657231330871582,
|
|
"num_tokens": 998877907.0,
|
|
"step": 10810
|
|
},
|
|
{
|
|
"entropy": 1.6,
|
|
"epoch": 1.4787481208145414,
|
|
"grad_norm": 0.23031395218022524,
|
|
"learning_rate": 1.3438776948006201e-06,
|
|
"loss": 1.594,
|
|
"mean_token_accuracy": 0.6660185515880584,
|
|
"num_tokens": 999783942.0,
|
|
"step": 10820
|
|
},
|
|
{
|
|
"entropy": 1.56953125,
|
|
"epoch": 1.4801148011480114,
|
|
"grad_norm": 0.17085461571217486,
|
|
"learning_rate": 1.3403550796111035e-06,
|
|
"loss": 1.5858,
|
|
"mean_token_accuracy": 0.6670781672000885,
|
|
"num_tokens": 1000711560.0,
|
|
"step": 10830
|
|
},
|
|
{
|
|
"entropy": 1.628125,
|
|
"epoch": 1.4814814814814814,
|
|
"grad_norm": 0.2511226505819412,
|
|
"learning_rate": 1.3368324644215866e-06,
|
|
"loss": 1.6202,
|
|
"mean_token_accuracy": 0.6605205774307251,
|
|
"num_tokens": 1001634503.0,
|
|
"step": 10840
|
|
},
|
|
{
|
|
"entropy": 1.54296875,
|
|
"epoch": 1.4828481618149514,
|
|
"grad_norm": 0.1909977397888176,
|
|
"learning_rate": 1.3333098492320701e-06,
|
|
"loss": 1.5474,
|
|
"mean_token_accuracy": 0.6696652114391327,
|
|
"num_tokens": 1002569885.0,
|
|
"step": 10850
|
|
},
|
|
{
|
|
"entropy": 1.65859375,
|
|
"epoch": 1.4842148421484214,
|
|
"grad_norm": 0.2692440299018327,
|
|
"learning_rate": 1.3297872340425533e-06,
|
|
"loss": 1.6719,
|
|
"mean_token_accuracy": 0.6512989640235901,
|
|
"num_tokens": 1003462438.0,
|
|
"step": 10860
|
|
},
|
|
{
|
|
"entropy": 1.66484375,
|
|
"epoch": 1.4855815224818913,
|
|
"grad_norm": 0.23430114836860955,
|
|
"learning_rate": 1.3262646188530364e-06,
|
|
"loss": 1.6992,
|
|
"mean_token_accuracy": 0.6495297729969025,
|
|
"num_tokens": 1004430542.0,
|
|
"step": 10870
|
|
},
|
|
{
|
|
"entropy": 1.5578125,
|
|
"epoch": 1.4869482028153616,
|
|
"grad_norm": 0.17781519323420586,
|
|
"learning_rate": 1.32274200366352e-06,
|
|
"loss": 1.5739,
|
|
"mean_token_accuracy": 0.668750673532486,
|
|
"num_tokens": 1005358953.0,
|
|
"step": 10880
|
|
},
|
|
{
|
|
"entropy": 1.5546875,
|
|
"epoch": 1.4883148831488315,
|
|
"grad_norm": 0.2043764150028308,
|
|
"learning_rate": 1.319219388474003e-06,
|
|
"loss": 1.5576,
|
|
"mean_token_accuracy": 0.6710604429244995,
|
|
"num_tokens": 1006238755.0,
|
|
"step": 10890
|
|
},
|
|
{
|
|
"entropy": 1.63515625,
|
|
"epoch": 1.4896815634823015,
|
|
"grad_norm": 0.1944288239769646,
|
|
"learning_rate": 1.3156967732844866e-06,
|
|
"loss": 1.6453,
|
|
"mean_token_accuracy": 0.6549010396003723,
|
|
"num_tokens": 1007172626.0,
|
|
"step": 10900
|
|
},
|
|
{
|
|
"entropy": 1.55,
|
|
"epoch": 1.4910482438157715,
|
|
"grad_norm": 0.20180570380618318,
|
|
"learning_rate": 1.31217415809497e-06,
|
|
"loss": 1.5475,
|
|
"mean_token_accuracy": 0.6711002588272095,
|
|
"num_tokens": 1008120727.0,
|
|
"step": 10910
|
|
},
|
|
{
|
|
"entropy": 1.68203125,
|
|
"epoch": 1.4924149241492415,
|
|
"grad_norm": 0.18143548051209604,
|
|
"learning_rate": 1.308651542905453e-06,
|
|
"loss": 1.7133,
|
|
"mean_token_accuracy": 0.6435099005699157,
|
|
"num_tokens": 1009095094.0,
|
|
"step": 10920
|
|
},
|
|
{
|
|
"entropy": 1.5703125,
|
|
"epoch": 1.4937816044827115,
|
|
"grad_norm": 0.16052999861019185,
|
|
"learning_rate": 1.3051289277159366e-06,
|
|
"loss": 1.5772,
|
|
"mean_token_accuracy": 0.6655795037746429,
|
|
"num_tokens": 1009998978.0,
|
|
"step": 10930
|
|
},
|
|
{
|
|
"entropy": 1.5390625,
|
|
"epoch": 1.4951482848161814,
|
|
"grad_norm": 0.2557339059804848,
|
|
"learning_rate": 1.3016063125264197e-06,
|
|
"loss": 1.5508,
|
|
"mean_token_accuracy": 0.6719403684139251,
|
|
"num_tokens": 1010903975.0,
|
|
"step": 10940
|
|
},
|
|
{
|
|
"entropy": 1.66484375,
|
|
"epoch": 1.4965149651496514,
|
|
"grad_norm": 0.20698639442597258,
|
|
"learning_rate": 1.2980836973369029e-06,
|
|
"loss": 1.6578,
|
|
"mean_token_accuracy": 0.6515170991420746,
|
|
"num_tokens": 1011794207.0,
|
|
"step": 10950
|
|
},
|
|
{
|
|
"entropy": 1.55390625,
|
|
"epoch": 1.4978816454831214,
|
|
"grad_norm": 0.162138126479479,
|
|
"learning_rate": 1.2945610821473864e-06,
|
|
"loss": 1.5691,
|
|
"mean_token_accuracy": 0.6692234575748444,
|
|
"num_tokens": 1012684083.0,
|
|
"step": 10960
|
|
},
|
|
{
|
|
"entropy": 1.5578125,
|
|
"epoch": 1.4992483258165916,
|
|
"grad_norm": 0.22994248575669524,
|
|
"learning_rate": 1.2910384669578695e-06,
|
|
"loss": 1.5703,
|
|
"mean_token_accuracy": 0.670361328125,
|
|
"num_tokens": 1013615383.0,
|
|
"step": 10970
|
|
},
|
|
{
|
|
"entropy": 1.57109375,
|
|
"epoch": 1.5006150061500616,
|
|
"grad_norm": 0.2124301438436308,
|
|
"learning_rate": 1.2875158517683529e-06,
|
|
"loss": 1.5822,
|
|
"mean_token_accuracy": 0.6667513608932495,
|
|
"num_tokens": 1014513146.0,
|
|
"step": 10980
|
|
},
|
|
{
|
|
"entropy": 1.64609375,
|
|
"epoch": 1.5019816864835316,
|
|
"grad_norm": 0.24281414479777488,
|
|
"learning_rate": 1.2839932365788362e-06,
|
|
"loss": 1.6569,
|
|
"mean_token_accuracy": 0.6537173748016357,
|
|
"num_tokens": 1015433634.0,
|
|
"step": 10990
|
|
},
|
|
{
|
|
"entropy": 1.59140625,
|
|
"epoch": 1.5033483668170016,
|
|
"grad_norm": 0.2070994634844455,
|
|
"learning_rate": 1.2804706213893195e-06,
|
|
"loss": 1.596,
|
|
"mean_token_accuracy": 0.6634191393852233,
|
|
"num_tokens": 1016371833.0,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"entropy": 1.6421875,
|
|
"epoch": 1.5047150471504716,
|
|
"grad_norm": 0.17122398990264323,
|
|
"learning_rate": 1.2769480061998029e-06,
|
|
"loss": 1.6524,
|
|
"mean_token_accuracy": 0.6564097106456757,
|
|
"num_tokens": 1017305119.0,
|
|
"step": 11010
|
|
},
|
|
{
|
|
"entropy": 1.57890625,
|
|
"epoch": 1.5060817274839415,
|
|
"grad_norm": 0.1653508065274219,
|
|
"learning_rate": 1.2734253910102862e-06,
|
|
"loss": 1.5917,
|
|
"mean_token_accuracy": 0.6657720625400543,
|
|
"num_tokens": 1018269495.0,
|
|
"step": 11020
|
|
},
|
|
{
|
|
"entropy": 1.596875,
|
|
"epoch": 1.5074484078174115,
|
|
"grad_norm": 0.15756946140230974,
|
|
"learning_rate": 1.2699027758207693e-06,
|
|
"loss": 1.6122,
|
|
"mean_token_accuracy": 0.6609314560890198,
|
|
"num_tokens": 1019189041.0,
|
|
"step": 11030
|
|
},
|
|
{
|
|
"entropy": 1.69609375,
|
|
"epoch": 1.5088150881508815,
|
|
"grad_norm": 0.2120400824811079,
|
|
"learning_rate": 1.2663801606312529e-06,
|
|
"loss": 1.7119,
|
|
"mean_token_accuracy": 0.6414566218852997,
|
|
"num_tokens": 1020170170.0,
|
|
"step": 11040
|
|
},
|
|
{
|
|
"entropy": 1.53828125,
|
|
"epoch": 1.5101817684843515,
|
|
"grad_norm": 0.2226681237219333,
|
|
"learning_rate": 1.262857545441736e-06,
|
|
"loss": 1.537,
|
|
"mean_token_accuracy": 0.6739375412464141,
|
|
"num_tokens": 1021058158.0,
|
|
"step": 11050
|
|
},
|
|
{
|
|
"entropy": 1.61484375,
|
|
"epoch": 1.5115484488178215,
|
|
"grad_norm": 0.26446906196364695,
|
|
"learning_rate": 1.2593349302522193e-06,
|
|
"loss": 1.6206,
|
|
"mean_token_accuracy": 0.6587363660335541,
|
|
"num_tokens": 1021971024.0,
|
|
"step": 11060
|
|
},
|
|
{
|
|
"entropy": 1.646875,
|
|
"epoch": 1.5129151291512914,
|
|
"grad_norm": 0.20604578523136854,
|
|
"learning_rate": 1.2558123150627027e-06,
|
|
"loss": 1.6748,
|
|
"mean_token_accuracy": 0.6512810945510864,
|
|
"num_tokens": 1022905339.0,
|
|
"step": 11070
|
|
},
|
|
{
|
|
"entropy": 1.65703125,
|
|
"epoch": 1.5142818094847614,
|
|
"grad_norm": 0.19077554591495263,
|
|
"learning_rate": 1.252289699873186e-06,
|
|
"loss": 1.6815,
|
|
"mean_token_accuracy": 0.6477606356143951,
|
|
"num_tokens": 1023833136.0,
|
|
"step": 11080
|
|
},
|
|
{
|
|
"entropy": 1.63046875,
|
|
"epoch": 1.5156484898182314,
|
|
"grad_norm": 0.1914902642656663,
|
|
"learning_rate": 1.2487670846836693e-06,
|
|
"loss": 1.6362,
|
|
"mean_token_accuracy": 0.6564597845077514,
|
|
"num_tokens": 1024771408.0,
|
|
"step": 11090
|
|
},
|
|
{
|
|
"entropy": 1.73125,
|
|
"epoch": 1.5170151701517014,
|
|
"grad_norm": 0.2651985744260206,
|
|
"learning_rate": 1.2452444694941527e-06,
|
|
"loss": 1.7413,
|
|
"mean_token_accuracy": 0.6406595408916473,
|
|
"num_tokens": 1025658511.0,
|
|
"step": 11100
|
|
},
|
|
{
|
|
"entropy": 1.57890625,
|
|
"epoch": 1.5183818504851714,
|
|
"grad_norm": 0.17937052908457204,
|
|
"learning_rate": 1.2417218543046358e-06,
|
|
"loss": 1.5937,
|
|
"mean_token_accuracy": 0.6639249503612519,
|
|
"num_tokens": 1026589499.0,
|
|
"step": 11110
|
|
},
|
|
{
|
|
"entropy": 1.5671875,
|
|
"epoch": 1.5197485308186414,
|
|
"grad_norm": 0.18111140642168705,
|
|
"learning_rate": 1.2381992391151191e-06,
|
|
"loss": 1.5754,
|
|
"mean_token_accuracy": 0.6685408532619477,
|
|
"num_tokens": 1027502226.0,
|
|
"step": 11120
|
|
},
|
|
{
|
|
"entropy": 1.60390625,
|
|
"epoch": 1.5211152111521116,
|
|
"grad_norm": 0.19779505934428876,
|
|
"learning_rate": 1.2346766239256025e-06,
|
|
"loss": 1.6092,
|
|
"mean_token_accuracy": 0.6589212894439698,
|
|
"num_tokens": 1028450554.0,
|
|
"step": 11130
|
|
},
|
|
{
|
|
"entropy": 1.58984375,
|
|
"epoch": 1.5224818914855816,
|
|
"grad_norm": 0.1917751594320491,
|
|
"learning_rate": 1.2311540087360858e-06,
|
|
"loss": 1.6028,
|
|
"mean_token_accuracy": 0.6623551905155182,
|
|
"num_tokens": 1029413410.0,
|
|
"step": 11140
|
|
},
|
|
{
|
|
"entropy": 1.55703125,
|
|
"epoch": 1.5238485718190515,
|
|
"grad_norm": 0.1957893418334549,
|
|
"learning_rate": 1.2276313935465691e-06,
|
|
"loss": 1.5473,
|
|
"mean_token_accuracy": 0.6726354420185089,
|
|
"num_tokens": 1030335100.0,
|
|
"step": 11150
|
|
},
|
|
{
|
|
"entropy": 1.56328125,
|
|
"epoch": 1.5252152521525215,
|
|
"grad_norm": 0.2940615082778477,
|
|
"learning_rate": 1.2241087783570525e-06,
|
|
"loss": 1.586,
|
|
"mean_token_accuracy": 0.6644219815731048,
|
|
"num_tokens": 1031262979.0,
|
|
"step": 11160
|
|
},
|
|
{
|
|
"entropy": 1.5984375,
|
|
"epoch": 1.5265819324859915,
|
|
"grad_norm": 0.19204996617810535,
|
|
"learning_rate": 1.2205861631675358e-06,
|
|
"loss": 1.605,
|
|
"mean_token_accuracy": 0.6600903332233429,
|
|
"num_tokens": 1032195573.0,
|
|
"step": 11170
|
|
},
|
|
{
|
|
"entropy": 1.66796875,
|
|
"epoch": 1.5279486128194615,
|
|
"grad_norm": 0.2734360527909416,
|
|
"learning_rate": 1.217063547978019e-06,
|
|
"loss": 1.6726,
|
|
"mean_token_accuracy": 0.6497792601585388,
|
|
"num_tokens": 1033105796.0,
|
|
"step": 11180
|
|
},
|
|
{
|
|
"entropy": 1.5953125,
|
|
"epoch": 1.5293152931529317,
|
|
"grad_norm": 0.2259673985264969,
|
|
"learning_rate": 1.2135409327885023e-06,
|
|
"loss": 1.6003,
|
|
"mean_token_accuracy": 0.6639312624931335,
|
|
"num_tokens": 1034023745.0,
|
|
"step": 11190
|
|
},
|
|
{
|
|
"entropy": 1.5765625,
|
|
"epoch": 1.5306819734864017,
|
|
"grad_norm": 0.17798161338694554,
|
|
"learning_rate": 1.2100183175989856e-06,
|
|
"loss": 1.5784,
|
|
"mean_token_accuracy": 0.666279649734497,
|
|
"num_tokens": 1034923970.0,
|
|
"step": 11200
|
|
},
|
|
{
|
|
"entropy": 1.52265625,
|
|
"epoch": 1.5320486538198717,
|
|
"grad_norm": 0.22935751016420336,
|
|
"learning_rate": 1.206495702409469e-06,
|
|
"loss": 1.5361,
|
|
"mean_token_accuracy": 0.6744216680526733,
|
|
"num_tokens": 1035829973.0,
|
|
"step": 11210
|
|
},
|
|
{
|
|
"entropy": 1.52734375,
|
|
"epoch": 1.5334153341533416,
|
|
"grad_norm": 0.1930720211998891,
|
|
"learning_rate": 1.202973087219952e-06,
|
|
"loss": 1.5339,
|
|
"mean_token_accuracy": 0.670494019985199,
|
|
"num_tokens": 1036751945.0,
|
|
"step": 11220
|
|
},
|
|
{
|
|
"entropy": 1.5890625,
|
|
"epoch": 1.5347820144868116,
|
|
"grad_norm": 0.312189843548537,
|
|
"learning_rate": 1.1994504720304354e-06,
|
|
"loss": 1.6121,
|
|
"mean_token_accuracy": 0.6603140413761139,
|
|
"num_tokens": 1037699862.0,
|
|
"step": 11230
|
|
},
|
|
{
|
|
"entropy": 1.60703125,
|
|
"epoch": 1.5361486948202816,
|
|
"grad_norm": 0.29660251409818417,
|
|
"learning_rate": 1.1959278568409187e-06,
|
|
"loss": 1.6055,
|
|
"mean_token_accuracy": 0.6605474710464477,
|
|
"num_tokens": 1038642354.0,
|
|
"step": 11240
|
|
},
|
|
{
|
|
"entropy": 1.5515625,
|
|
"epoch": 1.5375153751537516,
|
|
"grad_norm": 0.20455454535540915,
|
|
"learning_rate": 1.192405241651402e-06,
|
|
"loss": 1.5482,
|
|
"mean_token_accuracy": 0.668239289522171,
|
|
"num_tokens": 1039564382.0,
|
|
"step": 11250
|
|
},
|
|
{
|
|
"entropy": 1.57734375,
|
|
"epoch": 1.5388820554872216,
|
|
"grad_norm": 0.22106988251409398,
|
|
"learning_rate": 1.1888826264618854e-06,
|
|
"loss": 1.5806,
|
|
"mean_token_accuracy": 0.6662236094474793,
|
|
"num_tokens": 1040481220.0,
|
|
"step": 11260
|
|
},
|
|
{
|
|
"entropy": 1.6046875,
|
|
"epoch": 1.5402487358206916,
|
|
"grad_norm": 0.28056963038512933,
|
|
"learning_rate": 1.1853600112723687e-06,
|
|
"loss": 1.6121,
|
|
"mean_token_accuracy": 0.6614476382732392,
|
|
"num_tokens": 1041421547.0,
|
|
"step": 11270
|
|
},
|
|
{
|
|
"entropy": 1.56640625,
|
|
"epoch": 1.5416154161541615,
|
|
"grad_norm": 0.17019649306226114,
|
|
"learning_rate": 1.181837396082852e-06,
|
|
"loss": 1.5784,
|
|
"mean_token_accuracy": 0.6681182444095611,
|
|
"num_tokens": 1042361706.0,
|
|
"step": 11280
|
|
},
|
|
{
|
|
"entropy": 1.57109375,
|
|
"epoch": 1.5429820964876315,
|
|
"grad_norm": 0.21722375065931462,
|
|
"learning_rate": 1.1783147808933352e-06,
|
|
"loss": 1.5756,
|
|
"mean_token_accuracy": 0.6688835144042968,
|
|
"num_tokens": 1043298491.0,
|
|
"step": 11290
|
|
},
|
|
{
|
|
"entropy": 1.70625,
|
|
"epoch": 1.5443487768211015,
|
|
"grad_norm": 0.20436059426318987,
|
|
"learning_rate": 1.1747921657038185e-06,
|
|
"loss": 1.7219,
|
|
"mean_token_accuracy": 0.6425449132919312,
|
|
"num_tokens": 1044220007.0,
|
|
"step": 11300
|
|
},
|
|
{
|
|
"entropy": 1.540625,
|
|
"epoch": 1.5457154571545715,
|
|
"grad_norm": 0.2579743934310812,
|
|
"learning_rate": 1.1712695505143019e-06,
|
|
"loss": 1.5363,
|
|
"mean_token_accuracy": 0.6713921666145325,
|
|
"num_tokens": 1045114421.0,
|
|
"step": 11310
|
|
},
|
|
{
|
|
"entropy": 1.6265625,
|
|
"epoch": 1.5470821374880415,
|
|
"grad_norm": 0.17252863882036412,
|
|
"learning_rate": 1.1677469353247852e-06,
|
|
"loss": 1.6176,
|
|
"mean_token_accuracy": 0.6594865798950196,
|
|
"num_tokens": 1045981063.0,
|
|
"step": 11320
|
|
},
|
|
{
|
|
"entropy": 1.62578125,
|
|
"epoch": 1.5484488178215114,
|
|
"grad_norm": 0.23106783040858672,
|
|
"learning_rate": 1.1642243201352685e-06,
|
|
"loss": 1.631,
|
|
"mean_token_accuracy": 0.6563633739948272,
|
|
"num_tokens": 1046936684.0,
|
|
"step": 11330
|
|
},
|
|
{
|
|
"entropy": 1.5828125,
|
|
"epoch": 1.5498154981549814,
|
|
"grad_norm": 0.2246162774708911,
|
|
"learning_rate": 1.1607017049457519e-06,
|
|
"loss": 1.5853,
|
|
"mean_token_accuracy": 0.6634556114673614,
|
|
"num_tokens": 1047875696.0,
|
|
"step": 11340
|
|
},
|
|
{
|
|
"entropy": 1.596875,
|
|
"epoch": 1.5511821784884514,
|
|
"grad_norm": 0.23805736197897445,
|
|
"learning_rate": 1.1571790897562352e-06,
|
|
"loss": 1.6053,
|
|
"mean_token_accuracy": 0.6615194141864776,
|
|
"num_tokens": 1048776371.0,
|
|
"step": 11350
|
|
},
|
|
{
|
|
"entropy": 1.55546875,
|
|
"epoch": 1.5525488588219214,
|
|
"grad_norm": 0.2180536655643796,
|
|
"learning_rate": 1.1536564745667183e-06,
|
|
"loss": 1.5524,
|
|
"mean_token_accuracy": 0.6713928818702698,
|
|
"num_tokens": 1049687450.0,
|
|
"step": 11360
|
|
},
|
|
{
|
|
"entropy": 1.58515625,
|
|
"epoch": 1.5539155391553916,
|
|
"grad_norm": 0.2079309059303467,
|
|
"learning_rate": 1.1501338593772017e-06,
|
|
"loss": 1.5873,
|
|
"mean_token_accuracy": 0.6637887299060822,
|
|
"num_tokens": 1050593800.0,
|
|
"step": 11370
|
|
},
|
|
{
|
|
"entropy": 1.64453125,
|
|
"epoch": 1.5552822194888616,
|
|
"grad_norm": 0.22053573366900298,
|
|
"learning_rate": 1.146611244187685e-06,
|
|
"loss": 1.6513,
|
|
"mean_token_accuracy": 0.6585900008678436,
|
|
"num_tokens": 1051545843.0,
|
|
"step": 11380
|
|
},
|
|
{
|
|
"entropy": 1.63515625,
|
|
"epoch": 1.5566488998223316,
|
|
"grad_norm": 0.15393327532807355,
|
|
"learning_rate": 1.1430886289981683e-06,
|
|
"loss": 1.639,
|
|
"mean_token_accuracy": 0.6540607869625091,
|
|
"num_tokens": 1052463695.0,
|
|
"step": 11390
|
|
},
|
|
{
|
|
"entropy": 1.5234375,
|
|
"epoch": 1.5580155801558015,
|
|
"grad_norm": 0.19931128657398878,
|
|
"learning_rate": 1.1395660138086517e-06,
|
|
"loss": 1.505,
|
|
"mean_token_accuracy": 0.6774047791957856,
|
|
"num_tokens": 1053333610.0,
|
|
"step": 11400
|
|
},
|
|
{
|
|
"entropy": 1.559375,
|
|
"epoch": 1.5593822604892715,
|
|
"grad_norm": 0.18971216842997296,
|
|
"learning_rate": 1.136043398619135e-06,
|
|
"loss": 1.5682,
|
|
"mean_token_accuracy": 0.670491099357605,
|
|
"num_tokens": 1054251939.0,
|
|
"step": 11410
|
|
},
|
|
{
|
|
"entropy": 1.6765625,
|
|
"epoch": 1.5607489408227415,
|
|
"grad_norm": 0.1592090938493591,
|
|
"learning_rate": 1.1325207834296184e-06,
|
|
"loss": 1.6734,
|
|
"mean_token_accuracy": 0.65104039311409,
|
|
"num_tokens": 1055187558.0,
|
|
"step": 11420
|
|
},
|
|
{
|
|
"entropy": 1.6234375,
|
|
"epoch": 1.5621156211562117,
|
|
"grad_norm": 0.18342081976098903,
|
|
"learning_rate": 1.1289981682401017e-06,
|
|
"loss": 1.6284,
|
|
"mean_token_accuracy": 0.659857577085495,
|
|
"num_tokens": 1056074097.0,
|
|
"step": 11430
|
|
},
|
|
{
|
|
"entropy": 1.571875,
|
|
"epoch": 1.5634823014896817,
|
|
"grad_norm": 0.19690322311581243,
|
|
"learning_rate": 1.1254755530505848e-06,
|
|
"loss": 1.5718,
|
|
"mean_token_accuracy": 0.6673674643039703,
|
|
"num_tokens": 1056998605.0,
|
|
"step": 11440
|
|
},
|
|
{
|
|
"entropy": 1.6515625,
|
|
"epoch": 1.5648489818231517,
|
|
"grad_norm": 0.18517021498037883,
|
|
"learning_rate": 1.1219529378610681e-06,
|
|
"loss": 1.6503,
|
|
"mean_token_accuracy": 0.6537310123443604,
|
|
"num_tokens": 1057917855.0,
|
|
"step": 11450
|
|
},
|
|
{
|
|
"entropy": 1.61015625,
|
|
"epoch": 1.5662156621566217,
|
|
"grad_norm": 0.20450792117279062,
|
|
"learning_rate": 1.1184303226715515e-06,
|
|
"loss": 1.6115,
|
|
"mean_token_accuracy": 0.6618548095226288,
|
|
"num_tokens": 1058859012.0,
|
|
"step": 11460
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 1.5675823424900917,
|
|
"grad_norm": 0.21372159311104366,
|
|
"learning_rate": 1.1149077074820346e-06,
|
|
"loss": 1.6368,
|
|
"mean_token_accuracy": 0.6559384942054749,
|
|
"num_tokens": 1059795366.0,
|
|
"step": 11470
|
|
},
|
|
{
|
|
"entropy": 1.6421875,
|
|
"epoch": 1.5689490228235616,
|
|
"grad_norm": 0.25595951592520366,
|
|
"learning_rate": 1.111385092292518e-06,
|
|
"loss": 1.6352,
|
|
"mean_token_accuracy": 0.6575400233268738,
|
|
"num_tokens": 1060705740.0,
|
|
"step": 11480
|
|
},
|
|
{
|
|
"entropy": 1.55546875,
|
|
"epoch": 1.5703157031570316,
|
|
"grad_norm": 0.1946595152497268,
|
|
"learning_rate": 1.1078624771030013e-06,
|
|
"loss": 1.571,
|
|
"mean_token_accuracy": 0.6667645514011383,
|
|
"num_tokens": 1061629577.0,
|
|
"step": 11490
|
|
},
|
|
{
|
|
"entropy": 1.6390625,
|
|
"epoch": 1.5716823834905016,
|
|
"grad_norm": 0.21607471770844994,
|
|
"learning_rate": 1.1043398619134846e-06,
|
|
"loss": 1.6574,
|
|
"mean_token_accuracy": 0.6525882601737976,
|
|
"num_tokens": 1062588746.0,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"entropy": 1.62265625,
|
|
"epoch": 1.5730490638239716,
|
|
"grad_norm": 0.23077835876147224,
|
|
"learning_rate": 1.100817246723968e-06,
|
|
"loss": 1.6262,
|
|
"mean_token_accuracy": 0.6585572838783265,
|
|
"num_tokens": 1063494979.0,
|
|
"step": 11510
|
|
},
|
|
{
|
|
"entropy": 1.6265625,
|
|
"epoch": 1.5744157441574416,
|
|
"grad_norm": 0.20467289609626066,
|
|
"learning_rate": 1.0972946315344513e-06,
|
|
"loss": 1.6455,
|
|
"mean_token_accuracy": 0.6573407292366028,
|
|
"num_tokens": 1064451243.0,
|
|
"step": 11520
|
|
},
|
|
{
|
|
"entropy": 1.5625,
|
|
"epoch": 1.5757824244909115,
|
|
"grad_norm": 0.1810548044438284,
|
|
"learning_rate": 1.0937720163449346e-06,
|
|
"loss": 1.5799,
|
|
"mean_token_accuracy": 0.6683111190795898,
|
|
"num_tokens": 1065364647.0,
|
|
"step": 11530
|
|
},
|
|
{
|
|
"entropy": 1.6125,
|
|
"epoch": 1.5771491048243815,
|
|
"grad_norm": 0.19008237383303744,
|
|
"learning_rate": 1.0902494011554177e-06,
|
|
"loss": 1.6172,
|
|
"mean_token_accuracy": 0.6561345636844635,
|
|
"num_tokens": 1066297609.0,
|
|
"step": 11540
|
|
},
|
|
{
|
|
"entropy": 1.59296875,
|
|
"epoch": 1.5785157851578515,
|
|
"grad_norm": 0.23085643413267073,
|
|
"learning_rate": 1.086726785965901e-06,
|
|
"loss": 1.6005,
|
|
"mean_token_accuracy": 0.6642379820346832,
|
|
"num_tokens": 1067268015.0,
|
|
"step": 11550
|
|
},
|
|
{
|
|
"entropy": 1.6171875,
|
|
"epoch": 1.5798824654913215,
|
|
"grad_norm": 0.2075343427556431,
|
|
"learning_rate": 1.0832041707763844e-06,
|
|
"loss": 1.6186,
|
|
"mean_token_accuracy": 0.6608716785907746,
|
|
"num_tokens": 1068201628.0,
|
|
"step": 11560
|
|
},
|
|
{
|
|
"entropy": 1.6125,
|
|
"epoch": 1.5812491458247915,
|
|
"grad_norm": 0.18785457745772574,
|
|
"learning_rate": 1.0796815555868678e-06,
|
|
"loss": 1.6277,
|
|
"mean_token_accuracy": 0.6599371492862701,
|
|
"num_tokens": 1069191944.0,
|
|
"step": 11570
|
|
},
|
|
{
|
|
"entropy": 1.6625,
|
|
"epoch": 1.5826158261582615,
|
|
"grad_norm": 0.1775445672202799,
|
|
"learning_rate": 1.076158940397351e-06,
|
|
"loss": 1.6797,
|
|
"mean_token_accuracy": 0.650270527601242,
|
|
"num_tokens": 1070119180.0,
|
|
"step": 11580
|
|
},
|
|
{
|
|
"entropy": 1.5640625,
|
|
"epoch": 1.5839825064917314,
|
|
"grad_norm": 0.23824240815113018,
|
|
"learning_rate": 1.0726363252078344e-06,
|
|
"loss": 1.5656,
|
|
"mean_token_accuracy": 0.6695829153060913,
|
|
"num_tokens": 1071017805.0,
|
|
"step": 11590
|
|
},
|
|
{
|
|
"entropy": 1.56328125,
|
|
"epoch": 1.5853491868252014,
|
|
"grad_norm": 0.2255093461996791,
|
|
"learning_rate": 1.0691137100183178e-06,
|
|
"loss": 1.56,
|
|
"mean_token_accuracy": 0.6722392916679383,
|
|
"num_tokens": 1071931107.0,
|
|
"step": 11600
|
|
},
|
|
{
|
|
"entropy": 1.5921875,
|
|
"epoch": 1.5867158671586716,
|
|
"grad_norm": 0.16251749065932705,
|
|
"learning_rate": 1.065591094828801e-06,
|
|
"loss": 1.6041,
|
|
"mean_token_accuracy": 0.6597062766551971,
|
|
"num_tokens": 1072901075.0,
|
|
"step": 11610
|
|
},
|
|
{
|
|
"entropy": 1.628125,
|
|
"epoch": 1.5880825474921416,
|
|
"grad_norm": 0.20688186579348877,
|
|
"learning_rate": 1.0620684796392842e-06,
|
|
"loss": 1.6278,
|
|
"mean_token_accuracy": 0.6578494012355804,
|
|
"num_tokens": 1073768568.0,
|
|
"step": 11620
|
|
},
|
|
{
|
|
"entropy": 1.5234375,
|
|
"epoch": 1.5894492278256116,
|
|
"grad_norm": 0.18848954492460898,
|
|
"learning_rate": 1.0585458644497676e-06,
|
|
"loss": 1.5163,
|
|
"mean_token_accuracy": 0.6755284667015076,
|
|
"num_tokens": 1074652139.0,
|
|
"step": 11630
|
|
},
|
|
{
|
|
"entropy": 1.62265625,
|
|
"epoch": 1.5908159081590816,
|
|
"grad_norm": 0.17567695289825963,
|
|
"learning_rate": 1.0550232492602509e-06,
|
|
"loss": 1.6394,
|
|
"mean_token_accuracy": 0.6556938648223877,
|
|
"num_tokens": 1075557810.0,
|
|
"step": 11640
|
|
},
|
|
{
|
|
"entropy": 1.60546875,
|
|
"epoch": 1.5921825884925516,
|
|
"grad_norm": 0.14822318040779106,
|
|
"learning_rate": 1.0515006340707342e-06,
|
|
"loss": 1.6113,
|
|
"mean_token_accuracy": 0.6623947620391846,
|
|
"num_tokens": 1076466513.0,
|
|
"step": 11650
|
|
},
|
|
{
|
|
"entropy": 1.51015625,
|
|
"epoch": 1.5935492688260215,
|
|
"grad_norm": 0.20084616129846508,
|
|
"learning_rate": 1.0479780188812176e-06,
|
|
"loss": 1.5163,
|
|
"mean_token_accuracy": 0.6773155570030213,
|
|
"num_tokens": 1077346078.0,
|
|
"step": 11660
|
|
},
|
|
{
|
|
"entropy": 1.6203125,
|
|
"epoch": 1.5949159491594918,
|
|
"grad_norm": 0.20049291657086007,
|
|
"learning_rate": 1.0444554036917009e-06,
|
|
"loss": 1.6518,
|
|
"mean_token_accuracy": 0.6573941648006439,
|
|
"num_tokens": 1078280803.0,
|
|
"step": 11670
|
|
},
|
|
{
|
|
"entropy": 1.67578125,
|
|
"epoch": 1.5962826294929617,
|
|
"grad_norm": 0.19109052226865783,
|
|
"learning_rate": 1.0409327885021842e-06,
|
|
"loss": 1.6918,
|
|
"mean_token_accuracy": 0.6465728461742402,
|
|
"num_tokens": 1079222090.0,
|
|
"step": 11680
|
|
},
|
|
{
|
|
"entropy": 1.54921875,
|
|
"epoch": 1.5976493098264317,
|
|
"grad_norm": 0.2252621971697312,
|
|
"learning_rate": 1.0374101733126674e-06,
|
|
"loss": 1.5743,
|
|
"mean_token_accuracy": 0.6670605540275574,
|
|
"num_tokens": 1080146456.0,
|
|
"step": 11690
|
|
},
|
|
{
|
|
"entropy": 1.525,
|
|
"epoch": 1.5990159901599017,
|
|
"grad_norm": 0.20739569740519836,
|
|
"learning_rate": 1.0338875581231507e-06,
|
|
"loss": 1.5241,
|
|
"mean_token_accuracy": 0.6746061682701111,
|
|
"num_tokens": 1081014964.0,
|
|
"step": 11700
|
|
},
|
|
{
|
|
"entropy": 1.6828125,
|
|
"epoch": 1.6003826704933717,
|
|
"grad_norm": 0.204392087068521,
|
|
"learning_rate": 1.030364942933634e-06,
|
|
"loss": 1.6913,
|
|
"mean_token_accuracy": 0.648235559463501,
|
|
"num_tokens": 1081908974.0,
|
|
"step": 11710
|
|
},
|
|
{
|
|
"entropy": 1.621875,
|
|
"epoch": 1.6017493508268417,
|
|
"grad_norm": 0.16731126183466788,
|
|
"learning_rate": 1.0268423277441174e-06,
|
|
"loss": 1.6179,
|
|
"mean_token_accuracy": 0.6606800854206085,
|
|
"num_tokens": 1082806038.0,
|
|
"step": 11720
|
|
},
|
|
{
|
|
"entropy": 1.56953125,
|
|
"epoch": 1.6031160311603116,
|
|
"grad_norm": 0.21136069426210422,
|
|
"learning_rate": 1.0233197125546005e-06,
|
|
"loss": 1.5502,
|
|
"mean_token_accuracy": 0.6718994975090027,
|
|
"num_tokens": 1083679251.0,
|
|
"step": 11730
|
|
},
|
|
{
|
|
"entropy": 1.6125,
|
|
"epoch": 1.6044827114937816,
|
|
"grad_norm": 0.23136623619837168,
|
|
"learning_rate": 1.0197970973650838e-06,
|
|
"loss": 1.6204,
|
|
"mean_token_accuracy": 0.6600921213626861,
|
|
"num_tokens": 1084609037.0,
|
|
"step": 11740
|
|
},
|
|
{
|
|
"entropy": 1.5765625,
|
|
"epoch": 1.6058493918272516,
|
|
"grad_norm": 0.21300968862150563,
|
|
"learning_rate": 1.0162744821755674e-06,
|
|
"loss": 1.6007,
|
|
"mean_token_accuracy": 0.6628746807575225,
|
|
"num_tokens": 1085541497.0,
|
|
"step": 11750
|
|
},
|
|
{
|
|
"entropy": 1.59140625,
|
|
"epoch": 1.6072160721607216,
|
|
"grad_norm": 0.3044525142915778,
|
|
"learning_rate": 1.0127518669860505e-06,
|
|
"loss": 1.5987,
|
|
"mean_token_accuracy": 0.6651219129562378,
|
|
"num_tokens": 1086466921.0,
|
|
"step": 11760
|
|
},
|
|
{
|
|
"entropy": 1.64453125,
|
|
"epoch": 1.6085827524941916,
|
|
"grad_norm": 0.21501213364276184,
|
|
"learning_rate": 1.0092292517965338e-06,
|
|
"loss": 1.6567,
|
|
"mean_token_accuracy": 0.6552139222621918,
|
|
"num_tokens": 1087414908.0,
|
|
"step": 11770
|
|
},
|
|
{
|
|
"entropy": 1.6234375,
|
|
"epoch": 1.6099494328276616,
|
|
"grad_norm": 0.17837543808962122,
|
|
"learning_rate": 1.0057066366070172e-06,
|
|
"loss": 1.6142,
|
|
"mean_token_accuracy": 0.6597462713718414,
|
|
"num_tokens": 1088324654.0,
|
|
"step": 11780
|
|
},
|
|
{
|
|
"entropy": 1.52109375,
|
|
"epoch": 1.6113161131611315,
|
|
"grad_norm": 0.16557889837129997,
|
|
"learning_rate": 1.0021840214175005e-06,
|
|
"loss": 1.5239,
|
|
"mean_token_accuracy": 0.6751788794994354,
|
|
"num_tokens": 1089247381.0,
|
|
"step": 11790
|
|
},
|
|
{
|
|
"entropy": 1.5890625,
|
|
"epoch": 1.6126827934946015,
|
|
"grad_norm": 0.17453387380421678,
|
|
"learning_rate": 9.986614062279836e-07,
|
|
"loss": 1.5998,
|
|
"mean_token_accuracy": 0.664456444978714,
|
|
"num_tokens": 1090190408.0,
|
|
"step": 11800
|
|
},
|
|
{
|
|
"entropy": 1.53828125,
|
|
"epoch": 1.6140494738280715,
|
|
"grad_norm": 0.19930964158391806,
|
|
"learning_rate": 9.95138791038467e-07,
|
|
"loss": 1.5426,
|
|
"mean_token_accuracy": 0.6720292747020722,
|
|
"num_tokens": 1091130111.0,
|
|
"step": 11810
|
|
},
|
|
{
|
|
"entropy": 1.56171875,
|
|
"epoch": 1.6154161541615415,
|
|
"grad_norm": 0.1717096377120702,
|
|
"learning_rate": 9.916161758489503e-07,
|
|
"loss": 1.561,
|
|
"mean_token_accuracy": 0.6713361799716949,
|
|
"num_tokens": 1092059654.0,
|
|
"step": 11820
|
|
},
|
|
{
|
|
"entropy": 1.65546875,
|
|
"epoch": 1.6167828344950115,
|
|
"grad_norm": 0.21571800392553153,
|
|
"learning_rate": 9.880935606594336e-07,
|
|
"loss": 1.6794,
|
|
"mean_token_accuracy": 0.6515142440795898,
|
|
"num_tokens": 1093036733.0,
|
|
"step": 11830
|
|
},
|
|
{
|
|
"entropy": 1.578125,
|
|
"epoch": 1.6181495148284815,
|
|
"grad_norm": 0.24418286019619603,
|
|
"learning_rate": 9.84570945469917e-07,
|
|
"loss": 1.5896,
|
|
"mean_token_accuracy": 0.666592663526535,
|
|
"num_tokens": 1093962793.0,
|
|
"step": 11840
|
|
},
|
|
{
|
|
"entropy": 1.6421875,
|
|
"epoch": 1.6195161951619517,
|
|
"grad_norm": 0.21464734308147776,
|
|
"learning_rate": 9.810483302804003e-07,
|
|
"loss": 1.6435,
|
|
"mean_token_accuracy": 0.6554070889949799,
|
|
"num_tokens": 1094875076.0,
|
|
"step": 11850
|
|
},
|
|
{
|
|
"entropy": 1.59921875,
|
|
"epoch": 1.6208828754954216,
|
|
"grad_norm": 0.24902878003599171,
|
|
"learning_rate": 9.775257150908836e-07,
|
|
"loss": 1.6006,
|
|
"mean_token_accuracy": 0.6630610942840576,
|
|
"num_tokens": 1095768682.0,
|
|
"step": 11860
|
|
},
|
|
{
|
|
"entropy": 1.59453125,
|
|
"epoch": 1.6222495558288916,
|
|
"grad_norm": 0.19956231292627505,
|
|
"learning_rate": 9.740030999013668e-07,
|
|
"loss": 1.6149,
|
|
"mean_token_accuracy": 0.661281019449234,
|
|
"num_tokens": 1096669997.0,
|
|
"step": 11870
|
|
},
|
|
{
|
|
"entropy": 1.528125,
|
|
"epoch": 1.6236162361623616,
|
|
"grad_norm": 0.17983542028237712,
|
|
"learning_rate": 9.7048048471185e-07,
|
|
"loss": 1.5314,
|
|
"mean_token_accuracy": 0.6699572205543518,
|
|
"num_tokens": 1097565885.0,
|
|
"step": 11880
|
|
},
|
|
{
|
|
"entropy": 1.603125,
|
|
"epoch": 1.6249829164958316,
|
|
"grad_norm": 0.191029321316628,
|
|
"learning_rate": 9.669578695223334e-07,
|
|
"loss": 1.6182,
|
|
"mean_token_accuracy": 0.6607585370540618,
|
|
"num_tokens": 1098508905.0,
|
|
"step": 11890
|
|
},
|
|
{
|
|
"entropy": 1.5984375,
|
|
"epoch": 1.6263495968293016,
|
|
"grad_norm": 0.20095941674614995,
|
|
"learning_rate": 9.634352543328168e-07,
|
|
"loss": 1.5909,
|
|
"mean_token_accuracy": 0.6633926331996918,
|
|
"num_tokens": 1099403928.0,
|
|
"step": 11900
|
|
},
|
|
{
|
|
"entropy": 1.59453125,
|
|
"epoch": 1.6277162771627718,
|
|
"grad_norm": 0.20731995954925186,
|
|
"learning_rate": 9.599126391433e-07,
|
|
"loss": 1.6065,
|
|
"mean_token_accuracy": 0.6618742227554322,
|
|
"num_tokens": 1100318405.0,
|
|
"step": 11910
|
|
},
|
|
{
|
|
"entropy": 1.62890625,
|
|
"epoch": 1.6290829574962418,
|
|
"grad_norm": 0.15435159643848076,
|
|
"learning_rate": 9.563900239537834e-07,
|
|
"loss": 1.6327,
|
|
"mean_token_accuracy": 0.6583415806293488,
|
|
"num_tokens": 1101230396.0,
|
|
"step": 11920
|
|
},
|
|
{
|
|
"entropy": 1.5640625,
|
|
"epoch": 1.6304496378297118,
|
|
"grad_norm": 0.31003286029699295,
|
|
"learning_rate": 9.528674087642667e-07,
|
|
"loss": 1.5894,
|
|
"mean_token_accuracy": 0.6647438108921051,
|
|
"num_tokens": 1102182518.0,
|
|
"step": 11930
|
|
},
|
|
{
|
|
"entropy": 1.5484375,
|
|
"epoch": 1.6318163181631817,
|
|
"grad_norm": 0.17653218455219272,
|
|
"learning_rate": 9.4934479357475e-07,
|
|
"loss": 1.5398,
|
|
"mean_token_accuracy": 0.674804937839508,
|
|
"num_tokens": 1103119257.0,
|
|
"step": 11940
|
|
},
|
|
{
|
|
"entropy": 1.69140625,
|
|
"epoch": 1.6331829984966517,
|
|
"grad_norm": 0.24764814035293664,
|
|
"learning_rate": 9.458221783852332e-07,
|
|
"loss": 1.6851,
|
|
"mean_token_accuracy": 0.6490575730800628,
|
|
"num_tokens": 1104080020.0,
|
|
"step": 11950
|
|
},
|
|
{
|
|
"entropy": 1.56171875,
|
|
"epoch": 1.6345496788301217,
|
|
"grad_norm": 0.25756815822935986,
|
|
"learning_rate": 9.422995631957166e-07,
|
|
"loss": 1.5663,
|
|
"mean_token_accuracy": 0.6680497765541077,
|
|
"num_tokens": 1104967189.0,
|
|
"step": 11960
|
|
},
|
|
{
|
|
"entropy": 1.64453125,
|
|
"epoch": 1.6359163591635917,
|
|
"grad_norm": 0.21644793213660204,
|
|
"learning_rate": 9.387769480061999e-07,
|
|
"loss": 1.6517,
|
|
"mean_token_accuracy": 0.6553209722042084,
|
|
"num_tokens": 1105882821.0,
|
|
"step": 11970
|
|
},
|
|
{
|
|
"entropy": 1.615625,
|
|
"epoch": 1.6372830394970617,
|
|
"grad_norm": 0.2106678325682779,
|
|
"learning_rate": 9.352543328166831e-07,
|
|
"loss": 1.6133,
|
|
"mean_token_accuracy": 0.6597493886947632,
|
|
"num_tokens": 1106845479.0,
|
|
"step": 11980
|
|
},
|
|
{
|
|
"entropy": 1.584375,
|
|
"epoch": 1.6386497198305316,
|
|
"grad_norm": 0.23930248070130997,
|
|
"learning_rate": 9.317317176271665e-07,
|
|
"loss": 1.5966,
|
|
"mean_token_accuracy": 0.6645245969295501,
|
|
"num_tokens": 1107727592.0,
|
|
"step": 11990
|
|
},
|
|
{
|
|
"entropy": 1.6609375,
|
|
"epoch": 1.6400164001640016,
|
|
"grad_norm": 0.25412410052434875,
|
|
"learning_rate": 9.282091024376498e-07,
|
|
"loss": 1.683,
|
|
"mean_token_accuracy": 0.6473673462867737,
|
|
"num_tokens": 1108708720.0,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"entropy": 1.70078125,
|
|
"epoch": 1.6413830804974716,
|
|
"grad_norm": 0.23487987983852207,
|
|
"learning_rate": 9.246864872481331e-07,
|
|
"loss": 1.7155,
|
|
"mean_token_accuracy": 0.6423250079154968,
|
|
"num_tokens": 1109590011.0,
|
|
"step": 12010
|
|
},
|
|
{
|
|
"entropy": 1.58359375,
|
|
"epoch": 1.6427497608309416,
|
|
"grad_norm": 0.20474398039884828,
|
|
"learning_rate": 9.211638720586164e-07,
|
|
"loss": 1.6038,
|
|
"mean_token_accuracy": 0.6606861174106597,
|
|
"num_tokens": 1110474456.0,
|
|
"step": 12020
|
|
},
|
|
{
|
|
"entropy": 1.5671875,
|
|
"epoch": 1.6441164411644116,
|
|
"grad_norm": 0.29763728158999003,
|
|
"learning_rate": 9.176412568690997e-07,
|
|
"loss": 1.564,
|
|
"mean_token_accuracy": 0.6670376181602478,
|
|
"num_tokens": 1111396411.0,
|
|
"step": 12030
|
|
},
|
|
{
|
|
"entropy": 1.61875,
|
|
"epoch": 1.6454831214978816,
|
|
"grad_norm": 0.20414316736055527,
|
|
"learning_rate": 9.14118641679583e-07,
|
|
"loss": 1.622,
|
|
"mean_token_accuracy": 0.6572720944881439,
|
|
"num_tokens": 1112331738.0,
|
|
"step": 12040
|
|
},
|
|
{
|
|
"entropy": 1.5796875,
|
|
"epoch": 1.6468498018313515,
|
|
"grad_norm": 0.21136218065830445,
|
|
"learning_rate": 9.105960264900663e-07,
|
|
"loss": 1.5807,
|
|
"mean_token_accuracy": 0.6667589128017426,
|
|
"num_tokens": 1113208751.0,
|
|
"step": 12050
|
|
},
|
|
{
|
|
"entropy": 1.58671875,
|
|
"epoch": 1.6482164821648215,
|
|
"grad_norm": 0.24804782719819995,
|
|
"learning_rate": 9.070734113005496e-07,
|
|
"loss": 1.6075,
|
|
"mean_token_accuracy": 0.6626385092735291,
|
|
"num_tokens": 1114186388.0,
|
|
"step": 12060
|
|
},
|
|
{
|
|
"entropy": 1.659375,
|
|
"epoch": 1.6495831624982915,
|
|
"grad_norm": 0.19875346602129376,
|
|
"learning_rate": 9.035507961110329e-07,
|
|
"loss": 1.6572,
|
|
"mean_token_accuracy": 0.6528082489967346,
|
|
"num_tokens": 1115090693.0,
|
|
"step": 12070
|
|
},
|
|
{
|
|
"entropy": 1.57421875,
|
|
"epoch": 1.6509498428317615,
|
|
"grad_norm": 0.23130745011138876,
|
|
"learning_rate": 9.000281809215163e-07,
|
|
"loss": 1.599,
|
|
"mean_token_accuracy": 0.6638910412788391,
|
|
"num_tokens": 1116004420.0,
|
|
"step": 12080
|
|
},
|
|
{
|
|
"entropy": 1.53984375,
|
|
"epoch": 1.6523165231652317,
|
|
"grad_norm": 0.18715113621526835,
|
|
"learning_rate": 8.965055657319995e-07,
|
|
"loss": 1.5459,
|
|
"mean_token_accuracy": 0.6725758254528046,
|
|
"num_tokens": 1116921054.0,
|
|
"step": 12090
|
|
},
|
|
{
|
|
"entropy": 1.584375,
|
|
"epoch": 1.6536832034987017,
|
|
"grad_norm": 0.1601648241528968,
|
|
"learning_rate": 8.929829505424828e-07,
|
|
"loss": 1.5843,
|
|
"mean_token_accuracy": 0.6649998307228089,
|
|
"num_tokens": 1117846888.0,
|
|
"step": 12100
|
|
},
|
|
{
|
|
"entropy": 1.55625,
|
|
"epoch": 1.6550498838321717,
|
|
"grad_norm": 0.21828688227948922,
|
|
"learning_rate": 8.894603353529662e-07,
|
|
"loss": 1.5528,
|
|
"mean_token_accuracy": 0.6691407024860382,
|
|
"num_tokens": 1118759725.0,
|
|
"step": 12110
|
|
},
|
|
{
|
|
"entropy": 1.58671875,
|
|
"epoch": 1.6564165641656416,
|
|
"grad_norm": 0.22075318419184353,
|
|
"learning_rate": 8.859377201634495e-07,
|
|
"loss": 1.5967,
|
|
"mean_token_accuracy": 0.6648205995559693,
|
|
"num_tokens": 1119722311.0,
|
|
"step": 12120
|
|
},
|
|
{
|
|
"entropy": 1.6484375,
|
|
"epoch": 1.6577832444991116,
|
|
"grad_norm": 0.19238358090143018,
|
|
"learning_rate": 8.824151049739326e-07,
|
|
"loss": 1.6524,
|
|
"mean_token_accuracy": 0.6503417193889618,
|
|
"num_tokens": 1120656795.0,
|
|
"step": 12130
|
|
},
|
|
{
|
|
"entropy": 1.5578125,
|
|
"epoch": 1.6591499248325816,
|
|
"grad_norm": 0.1710276022408864,
|
|
"learning_rate": 8.788924897844161e-07,
|
|
"loss": 1.5766,
|
|
"mean_token_accuracy": 0.6675157308578491,
|
|
"num_tokens": 1121615991.0,
|
|
"step": 12140
|
|
},
|
|
{
|
|
"entropy": 1.57265625,
|
|
"epoch": 1.6605166051660518,
|
|
"grad_norm": 0.1825562052897187,
|
|
"learning_rate": 8.753698745948994e-07,
|
|
"loss": 1.5725,
|
|
"mean_token_accuracy": 0.6676976680755615,
|
|
"num_tokens": 1122563385.0,
|
|
"step": 12150
|
|
},
|
|
{
|
|
"entropy": 1.55078125,
|
|
"epoch": 1.6618832854995218,
|
|
"grad_norm": 0.21472295062525476,
|
|
"learning_rate": 8.718472594053825e-07,
|
|
"loss": 1.5377,
|
|
"mean_token_accuracy": 0.6717290282249451,
|
|
"num_tokens": 1123441128.0,
|
|
"step": 12160
|
|
},
|
|
{
|
|
"entropy": 1.5890625,
|
|
"epoch": 1.6632499658329918,
|
|
"grad_norm": 0.19717315756496903,
|
|
"learning_rate": 8.683246442158659e-07,
|
|
"loss": 1.5885,
|
|
"mean_token_accuracy": 0.6649854063987732,
|
|
"num_tokens": 1124337945.0,
|
|
"step": 12170
|
|
},
|
|
{
|
|
"entropy": 1.56015625,
|
|
"epoch": 1.6646166461664618,
|
|
"grad_norm": 0.18907111669473484,
|
|
"learning_rate": 8.648020290263492e-07,
|
|
"loss": 1.5604,
|
|
"mean_token_accuracy": 0.6690690577030182,
|
|
"num_tokens": 1125221478.0,
|
|
"step": 12180
|
|
},
|
|
{
|
|
"entropy": 1.6015625,
|
|
"epoch": 1.6659833264999317,
|
|
"grad_norm": 0.19198219551599371,
|
|
"learning_rate": 8.612794138368325e-07,
|
|
"loss": 1.606,
|
|
"mean_token_accuracy": 0.661073100566864,
|
|
"num_tokens": 1126153979.0,
|
|
"step": 12190
|
|
},
|
|
{
|
|
"entropy": 1.6125,
|
|
"epoch": 1.6673500068334017,
|
|
"grad_norm": 0.19662324826248465,
|
|
"learning_rate": 8.577567986473158e-07,
|
|
"loss": 1.6166,
|
|
"mean_token_accuracy": 0.6612836599349976,
|
|
"num_tokens": 1127042335.0,
|
|
"step": 12200
|
|
},
|
|
{
|
|
"entropy": 1.5546875,
|
|
"epoch": 1.6687166871668717,
|
|
"grad_norm": 0.1804822211280158,
|
|
"learning_rate": 8.542341834577991e-07,
|
|
"loss": 1.5576,
|
|
"mean_token_accuracy": 0.6699792087078095,
|
|
"num_tokens": 1127950316.0,
|
|
"step": 12210
|
|
},
|
|
{
|
|
"entropy": 1.5375,
|
|
"epoch": 1.6700833675003417,
|
|
"grad_norm": 0.18277290236598295,
|
|
"learning_rate": 8.507115682682824e-07,
|
|
"loss": 1.5416,
|
|
"mean_token_accuracy": 0.6712436020374298,
|
|
"num_tokens": 1128858134.0,
|
|
"step": 12220
|
|
},
|
|
{
|
|
"entropy": 1.66796875,
|
|
"epoch": 1.6714500478338117,
|
|
"grad_norm": 0.1990840130895324,
|
|
"learning_rate": 8.471889530787658e-07,
|
|
"loss": 1.676,
|
|
"mean_token_accuracy": 0.6529980540275574,
|
|
"num_tokens": 1129819995.0,
|
|
"step": 12230
|
|
},
|
|
{
|
|
"entropy": 1.59375,
|
|
"epoch": 1.6728167281672817,
|
|
"grad_norm": 0.1773585063915157,
|
|
"learning_rate": 8.43666337889249e-07,
|
|
"loss": 1.607,
|
|
"mean_token_accuracy": 0.6630298912525177,
|
|
"num_tokens": 1130730510.0,
|
|
"step": 12240
|
|
},
|
|
{
|
|
"entropy": 1.60703125,
|
|
"epoch": 1.6741834085007516,
|
|
"grad_norm": 0.2289353665861745,
|
|
"learning_rate": 8.401437226997323e-07,
|
|
"loss": 1.6276,
|
|
"mean_token_accuracy": 0.6618961989879608,
|
|
"num_tokens": 1131677329.0,
|
|
"step": 12250
|
|
},
|
|
{
|
|
"entropy": 1.58203125,
|
|
"epoch": 1.6755500888342216,
|
|
"grad_norm": 0.19504524392442238,
|
|
"learning_rate": 8.366211075102157e-07,
|
|
"loss": 1.595,
|
|
"mean_token_accuracy": 0.6622853398323059,
|
|
"num_tokens": 1132542780.0,
|
|
"step": 12260
|
|
},
|
|
{
|
|
"entropy": 1.57890625,
|
|
"epoch": 1.6769167691676916,
|
|
"grad_norm": 0.21912663930928175,
|
|
"learning_rate": 8.330984923206989e-07,
|
|
"loss": 1.574,
|
|
"mean_token_accuracy": 0.6656503677368164,
|
|
"num_tokens": 1133492252.0,
|
|
"step": 12270
|
|
},
|
|
{
|
|
"entropy": 1.63828125,
|
|
"epoch": 1.6782834495011616,
|
|
"grad_norm": 0.2078077449310967,
|
|
"learning_rate": 8.295758771311822e-07,
|
|
"loss": 1.6554,
|
|
"mean_token_accuracy": 0.6522331535816193,
|
|
"num_tokens": 1134355802.0,
|
|
"step": 12280
|
|
},
|
|
{
|
|
"entropy": 1.5890625,
|
|
"epoch": 1.6796501298346316,
|
|
"grad_norm": 0.22995624166531836,
|
|
"learning_rate": 8.260532619416656e-07,
|
|
"loss": 1.5888,
|
|
"mean_token_accuracy": 0.6645109057426453,
|
|
"num_tokens": 1135251414.0,
|
|
"step": 12290
|
|
},
|
|
{
|
|
"entropy": 1.596875,
|
|
"epoch": 1.6810168101681016,
|
|
"grad_norm": 0.17625849613479255,
|
|
"learning_rate": 8.225306467521489e-07,
|
|
"loss": 1.5965,
|
|
"mean_token_accuracy": 0.6641799747943878,
|
|
"num_tokens": 1136173543.0,
|
|
"step": 12300
|
|
},
|
|
{
|
|
"entropy": 1.61015625,
|
|
"epoch": 1.6823834905015715,
|
|
"grad_norm": 0.18089859203346176,
|
|
"learning_rate": 8.190080315626321e-07,
|
|
"loss": 1.6445,
|
|
"mean_token_accuracy": 0.6559728801250457,
|
|
"num_tokens": 1137067628.0,
|
|
"step": 12310
|
|
},
|
|
{
|
|
"entropy": 1.56171875,
|
|
"epoch": 1.6837501708350415,
|
|
"grad_norm": 0.23264152126977783,
|
|
"learning_rate": 8.154854163731155e-07,
|
|
"loss": 1.5716,
|
|
"mean_token_accuracy": 0.668320769071579,
|
|
"num_tokens": 1138014738.0,
|
|
"step": 12320
|
|
},
|
|
{
|
|
"entropy": 1.62890625,
|
|
"epoch": 1.6851168511685117,
|
|
"grad_norm": 0.26503508914986773,
|
|
"learning_rate": 8.119628011835988e-07,
|
|
"loss": 1.6395,
|
|
"mean_token_accuracy": 0.6530840218067169,
|
|
"num_tokens": 1138919184.0,
|
|
"step": 12330
|
|
},
|
|
{
|
|
"entropy": 1.5828125,
|
|
"epoch": 1.6864835315019817,
|
|
"grad_norm": 0.16280717630335798,
|
|
"learning_rate": 8.084401859940821e-07,
|
|
"loss": 1.5895,
|
|
"mean_token_accuracy": 0.6640872418880462,
|
|
"num_tokens": 1139812263.0,
|
|
"step": 12340
|
|
},
|
|
{
|
|
"entropy": 1.61171875,
|
|
"epoch": 1.6878502118354517,
|
|
"grad_norm": 0.16842646740177503,
|
|
"learning_rate": 8.049175708045654e-07,
|
|
"loss": 1.6351,
|
|
"mean_token_accuracy": 0.6560622870922088,
|
|
"num_tokens": 1140745965.0,
|
|
"step": 12350
|
|
},
|
|
{
|
|
"entropy": 1.55390625,
|
|
"epoch": 1.6892168921689217,
|
|
"grad_norm": 0.1590244152160721,
|
|
"learning_rate": 8.013949556150487e-07,
|
|
"loss": 1.5568,
|
|
"mean_token_accuracy": 0.6699958503246307,
|
|
"num_tokens": 1141621825.0,
|
|
"step": 12360
|
|
},
|
|
{
|
|
"entropy": 1.5359375,
|
|
"epoch": 1.6905835725023917,
|
|
"grad_norm": 0.15901623595283215,
|
|
"learning_rate": 7.97872340425532e-07,
|
|
"loss": 1.5345,
|
|
"mean_token_accuracy": 0.6718881368637085,
|
|
"num_tokens": 1142557212.0,
|
|
"step": 12370
|
|
},
|
|
{
|
|
"entropy": 1.58515625,
|
|
"epoch": 1.6919502528358616,
|
|
"grad_norm": 0.20215883814821803,
|
|
"learning_rate": 7.943497252360153e-07,
|
|
"loss": 1.5923,
|
|
"mean_token_accuracy": 0.6610711336135864,
|
|
"num_tokens": 1143456791.0,
|
|
"step": 12380
|
|
},
|
|
{
|
|
"entropy": 1.64140625,
|
|
"epoch": 1.6933169331693319,
|
|
"grad_norm": 0.19029394285875895,
|
|
"learning_rate": 7.908271100464986e-07,
|
|
"loss": 1.6513,
|
|
"mean_token_accuracy": 0.6518423378467559,
|
|
"num_tokens": 1144407838.0,
|
|
"step": 12390
|
|
},
|
|
{
|
|
"entropy": 1.615625,
|
|
"epoch": 1.6946836135028018,
|
|
"grad_norm": 0.18305344631197631,
|
|
"learning_rate": 7.873044948569819e-07,
|
|
"loss": 1.6411,
|
|
"mean_token_accuracy": 0.6581804871559143,
|
|
"num_tokens": 1145334341.0,
|
|
"step": 12400
|
|
},
|
|
{
|
|
"entropy": 1.58515625,
|
|
"epoch": 1.6960502938362718,
|
|
"grad_norm": 0.24776703779861786,
|
|
"learning_rate": 7.837818796674653e-07,
|
|
"loss": 1.5879,
|
|
"mean_token_accuracy": 0.6655481457710266,
|
|
"num_tokens": 1146270493.0,
|
|
"step": 12410
|
|
},
|
|
{
|
|
"entropy": 1.60859375,
|
|
"epoch": 1.6974169741697418,
|
|
"grad_norm": 0.1913288777687939,
|
|
"learning_rate": 7.802592644779484e-07,
|
|
"loss": 1.6108,
|
|
"mean_token_accuracy": 0.6588037967681885,
|
|
"num_tokens": 1147229318.0,
|
|
"step": 12420
|
|
},
|
|
{
|
|
"entropy": 1.60078125,
|
|
"epoch": 1.6987836545032118,
|
|
"grad_norm": 0.18373011445623505,
|
|
"learning_rate": 7.767366492884317e-07,
|
|
"loss": 1.6138,
|
|
"mean_token_accuracy": 0.660340142250061,
|
|
"num_tokens": 1148138002.0,
|
|
"step": 12430
|
|
},
|
|
{
|
|
"entropy": 1.59609375,
|
|
"epoch": 1.7001503348366818,
|
|
"grad_norm": 0.23242716835669033,
|
|
"learning_rate": 7.732140340989152e-07,
|
|
"loss": 1.5958,
|
|
"mean_token_accuracy": 0.6626798987388611,
|
|
"num_tokens": 1149099682.0,
|
|
"step": 12440
|
|
},
|
|
{
|
|
"entropy": 1.6265625,
|
|
"epoch": 1.7015170151701517,
|
|
"grad_norm": 0.23400281753964694,
|
|
"learning_rate": 7.696914189093985e-07,
|
|
"loss": 1.6333,
|
|
"mean_token_accuracy": 0.6581875920295716,
|
|
"num_tokens": 1150012324.0,
|
|
"step": 12450
|
|
},
|
|
{
|
|
"entropy": 1.5546875,
|
|
"epoch": 1.7028836955036217,
|
|
"grad_norm": 0.19765960073994845,
|
|
"learning_rate": 7.661688037198816e-07,
|
|
"loss": 1.5611,
|
|
"mean_token_accuracy": 0.6676224589347839,
|
|
"num_tokens": 1150948654.0,
|
|
"step": 12460
|
|
},
|
|
{
|
|
"entropy": 1.53671875,
|
|
"epoch": 1.7042503758370917,
|
|
"grad_norm": 0.17905982094416031,
|
|
"learning_rate": 7.62646188530365e-07,
|
|
"loss": 1.5464,
|
|
"mean_token_accuracy": 0.6709006667137146,
|
|
"num_tokens": 1151863543.0,
|
|
"step": 12470
|
|
},
|
|
{
|
|
"entropy": 1.51640625,
|
|
"epoch": 1.7056170561705617,
|
|
"grad_norm": 0.23066982225798302,
|
|
"learning_rate": 7.591235733408483e-07,
|
|
"loss": 1.5135,
|
|
"mean_token_accuracy": 0.6761376976966857,
|
|
"num_tokens": 1152801173.0,
|
|
"step": 12480
|
|
},
|
|
{
|
|
"entropy": 1.559375,
|
|
"epoch": 1.7069837365040317,
|
|
"grad_norm": 0.2701633513513373,
|
|
"learning_rate": 7.556009581513315e-07,
|
|
"loss": 1.5547,
|
|
"mean_token_accuracy": 0.6696928560733795,
|
|
"num_tokens": 1153710113.0,
|
|
"step": 12490
|
|
},
|
|
{
|
|
"entropy": 1.50859375,
|
|
"epoch": 1.7083504168375017,
|
|
"grad_norm": 0.1886991030102688,
|
|
"learning_rate": 7.520783429618149e-07,
|
|
"loss": 1.5379,
|
|
"mean_token_accuracy": 0.6729556798934937,
|
|
"num_tokens": 1154664729.0,
|
|
"step": 12500
|
|
},
|
|
{
|
|
"entropy": 1.55625,
|
|
"epoch": 1.7097170971709716,
|
|
"grad_norm": 0.22633656015868706,
|
|
"learning_rate": 7.485557277722982e-07,
|
|
"loss": 1.5714,
|
|
"mean_token_accuracy": 0.6685551166534424,
|
|
"num_tokens": 1155599905.0,
|
|
"step": 12510
|
|
},
|
|
{
|
|
"entropy": 1.60703125,
|
|
"epoch": 1.7110837775044416,
|
|
"grad_norm": 0.20934948658841465,
|
|
"learning_rate": 7.450331125827815e-07,
|
|
"loss": 1.6138,
|
|
"mean_token_accuracy": 0.6596962034702301,
|
|
"num_tokens": 1156535011.0,
|
|
"step": 12520
|
|
},
|
|
{
|
|
"entropy": 1.6109375,
|
|
"epoch": 1.7124504578379116,
|
|
"grad_norm": 0.21596124851931792,
|
|
"learning_rate": 7.415104973932648e-07,
|
|
"loss": 1.6214,
|
|
"mean_token_accuracy": 0.6583315134048462,
|
|
"num_tokens": 1157473902.0,
|
|
"step": 12530
|
|
},
|
|
{
|
|
"entropy": 1.52421875,
|
|
"epoch": 1.7138171381713816,
|
|
"grad_norm": 0.2680935938428816,
|
|
"learning_rate": 7.379878822037481e-07,
|
|
"loss": 1.5217,
|
|
"mean_token_accuracy": 0.6736512303352356,
|
|
"num_tokens": 1158338308.0,
|
|
"step": 12540
|
|
},
|
|
{
|
|
"entropy": 1.62578125,
|
|
"epoch": 1.7151838185048516,
|
|
"grad_norm": 0.18831251936807292,
|
|
"learning_rate": 7.344652670142314e-07,
|
|
"loss": 1.6249,
|
|
"mean_token_accuracy": 0.6577916324138642,
|
|
"num_tokens": 1159271667.0,
|
|
"step": 12550
|
|
},
|
|
{
|
|
"entropy": 1.653125,
|
|
"epoch": 1.7165504988383216,
|
|
"grad_norm": 0.16847625219812565,
|
|
"learning_rate": 7.309426518247147e-07,
|
|
"loss": 1.6441,
|
|
"mean_token_accuracy": 0.6559228479862214,
|
|
"num_tokens": 1160216897.0,
|
|
"step": 12560
|
|
},
|
|
{
|
|
"entropy": 1.58203125,
|
|
"epoch": 1.7179171791717918,
|
|
"grad_norm": 0.17637903965281157,
|
|
"learning_rate": 7.27420036635198e-07,
|
|
"loss": 1.6026,
|
|
"mean_token_accuracy": 0.6642269492149353,
|
|
"num_tokens": 1161117254.0,
|
|
"step": 12570
|
|
},
|
|
{
|
|
"entropy": 1.6734375,
|
|
"epoch": 1.7192838595052617,
|
|
"grad_norm": 0.22522891995105443,
|
|
"learning_rate": 7.238974214456813e-07,
|
|
"loss": 1.6756,
|
|
"mean_token_accuracy": 0.6492803275585175,
|
|
"num_tokens": 1162064119.0,
|
|
"step": 12580
|
|
},
|
|
{
|
|
"entropy": 1.603125,
|
|
"epoch": 1.7206505398387317,
|
|
"grad_norm": 0.2262108642753715,
|
|
"learning_rate": 7.203748062561647e-07,
|
|
"loss": 1.6063,
|
|
"mean_token_accuracy": 0.6634253799915314,
|
|
"num_tokens": 1163059446.0,
|
|
"step": 12590
|
|
},
|
|
{
|
|
"entropy": 1.58828125,
|
|
"epoch": 1.7220172201722017,
|
|
"grad_norm": 0.17459613657611076,
|
|
"learning_rate": 7.168521910666479e-07,
|
|
"loss": 1.6053,
|
|
"mean_token_accuracy": 0.6631161153316498,
|
|
"num_tokens": 1163980127.0,
|
|
"step": 12600
|
|
},
|
|
{
|
|
"entropy": 1.59375,
|
|
"epoch": 1.7233839005056717,
|
|
"grad_norm": 0.20207876424226326,
|
|
"learning_rate": 7.133295758771312e-07,
|
|
"loss": 1.6169,
|
|
"mean_token_accuracy": 0.6612747251987457,
|
|
"num_tokens": 1164906350.0,
|
|
"step": 12610
|
|
},
|
|
{
|
|
"entropy": 1.58671875,
|
|
"epoch": 1.7247505808391417,
|
|
"grad_norm": 0.20829880538751913,
|
|
"learning_rate": 7.098069606876146e-07,
|
|
"loss": 1.5951,
|
|
"mean_token_accuracy": 0.6637654662132263,
|
|
"num_tokens": 1165812503.0,
|
|
"step": 12620
|
|
},
|
|
{
|
|
"entropy": 1.5296875,
|
|
"epoch": 1.7261172611726119,
|
|
"grad_norm": 0.1971349818794153,
|
|
"learning_rate": 7.062843454980979e-07,
|
|
"loss": 1.5351,
|
|
"mean_token_accuracy": 0.6741915702819824,
|
|
"num_tokens": 1166709758.0,
|
|
"step": 12630
|
|
},
|
|
{
|
|
"entropy": 1.53515625,
|
|
"epoch": 1.7274839415060819,
|
|
"grad_norm": 0.16645855837277812,
|
|
"learning_rate": 7.027617303085811e-07,
|
|
"loss": 1.5406,
|
|
"mean_token_accuracy": 0.6714229464530945,
|
|
"num_tokens": 1167654236.0,
|
|
"step": 12640
|
|
},
|
|
{
|
|
"entropy": 1.56171875,
|
|
"epoch": 1.7288506218395518,
|
|
"grad_norm": 0.2005810622059388,
|
|
"learning_rate": 6.992391151190645e-07,
|
|
"loss": 1.5805,
|
|
"mean_token_accuracy": 0.6663422107696533,
|
|
"num_tokens": 1168593602.0,
|
|
"step": 12650
|
|
},
|
|
{
|
|
"entropy": 1.62265625,
|
|
"epoch": 1.7302173021730218,
|
|
"grad_norm": 0.2264180257152066,
|
|
"learning_rate": 6.957164999295478e-07,
|
|
"loss": 1.6284,
|
|
"mean_token_accuracy": 0.6558364689350128,
|
|
"num_tokens": 1169473218.0,
|
|
"step": 12660
|
|
},
|
|
{
|
|
"entropy": 1.58515625,
|
|
"epoch": 1.7315839825064918,
|
|
"grad_norm": 0.2077458114458553,
|
|
"learning_rate": 6.921938847400309e-07,
|
|
"loss": 1.5819,
|
|
"mean_token_accuracy": 0.6649188578128815,
|
|
"num_tokens": 1170351244.0,
|
|
"step": 12670
|
|
},
|
|
{
|
|
"entropy": 1.5796875,
|
|
"epoch": 1.7329506628399618,
|
|
"grad_norm": 0.26597813428831274,
|
|
"learning_rate": 6.886712695505144e-07,
|
|
"loss": 1.5994,
|
|
"mean_token_accuracy": 0.6636836588382721,
|
|
"num_tokens": 1171300655.0,
|
|
"step": 12680
|
|
},
|
|
{
|
|
"entropy": 1.6609375,
|
|
"epoch": 1.7343173431734318,
|
|
"grad_norm": 0.21715400519618214,
|
|
"learning_rate": 6.851486543609977e-07,
|
|
"loss": 1.6958,
|
|
"mean_token_accuracy": 0.6470343172550201,
|
|
"num_tokens": 1172234312.0,
|
|
"step": 12690
|
|
},
|
|
{
|
|
"entropy": 1.5890625,
|
|
"epoch": 1.7356840235069018,
|
|
"grad_norm": 0.20152628424374033,
|
|
"learning_rate": 6.816260391714811e-07,
|
|
"loss": 1.6234,
|
|
"mean_token_accuracy": 0.6583557784557342,
|
|
"num_tokens": 1173141399.0,
|
|
"step": 12700
|
|
},
|
|
{
|
|
"entropy": 1.6046875,
|
|
"epoch": 1.7370507038403717,
|
|
"grad_norm": 0.1899937495979984,
|
|
"learning_rate": 6.781034239819642e-07,
|
|
"loss": 1.6125,
|
|
"mean_token_accuracy": 0.6595246195793152,
|
|
"num_tokens": 1174070017.0,
|
|
"step": 12710
|
|
},
|
|
{
|
|
"entropy": 1.59296875,
|
|
"epoch": 1.7384173841738417,
|
|
"grad_norm": 0.17109194781148604,
|
|
"learning_rate": 6.745808087924475e-07,
|
|
"loss": 1.5853,
|
|
"mean_token_accuracy": 0.6642462193965912,
|
|
"num_tokens": 1174982591.0,
|
|
"step": 12720
|
|
},
|
|
{
|
|
"entropy": 1.621875,
|
|
"epoch": 1.7397840645073117,
|
|
"grad_norm": 0.23800944815329225,
|
|
"learning_rate": 6.710581936029308e-07,
|
|
"loss": 1.6342,
|
|
"mean_token_accuracy": 0.6597916126251221,
|
|
"num_tokens": 1175885528.0,
|
|
"step": 12730
|
|
},
|
|
{
|
|
"entropy": 1.5890625,
|
|
"epoch": 1.7411507448407817,
|
|
"grad_norm": 0.1806014730959099,
|
|
"learning_rate": 6.675355784134143e-07,
|
|
"loss": 1.6022,
|
|
"mean_token_accuracy": 0.6617702841758728,
|
|
"num_tokens": 1176839849.0,
|
|
"step": 12740
|
|
},
|
|
{
|
|
"entropy": 1.56875,
|
|
"epoch": 1.7425174251742517,
|
|
"grad_norm": 0.20353154986923586,
|
|
"learning_rate": 6.640129632238974e-07,
|
|
"loss": 1.5647,
|
|
"mean_token_accuracy": 0.6718940794467926,
|
|
"num_tokens": 1177767258.0,
|
|
"step": 12750
|
|
},
|
|
{
|
|
"entropy": 1.621875,
|
|
"epoch": 1.7438841055077217,
|
|
"grad_norm": 0.19705217788860874,
|
|
"learning_rate": 6.604903480343807e-07,
|
|
"loss": 1.6414,
|
|
"mean_token_accuracy": 0.6537597417831421,
|
|
"num_tokens": 1178711333.0,
|
|
"step": 12760
|
|
},
|
|
{
|
|
"entropy": 1.584375,
|
|
"epoch": 1.7452507858411916,
|
|
"grad_norm": 0.22541024433437443,
|
|
"learning_rate": 6.569677328448641e-07,
|
|
"loss": 1.5746,
|
|
"mean_token_accuracy": 0.6701597571372986,
|
|
"num_tokens": 1179611785.0,
|
|
"step": 12770
|
|
},
|
|
{
|
|
"entropy": 1.53203125,
|
|
"epoch": 1.7466174661746616,
|
|
"grad_norm": 0.2990856446747329,
|
|
"learning_rate": 6.534451176553473e-07,
|
|
"loss": 1.5214,
|
|
"mean_token_accuracy": 0.6785311818122863,
|
|
"num_tokens": 1180541044.0,
|
|
"step": 12780
|
|
},
|
|
{
|
|
"entropy": 1.57578125,
|
|
"epoch": 1.7479841465081316,
|
|
"grad_norm": 0.18226234663972565,
|
|
"learning_rate": 6.499225024658306e-07,
|
|
"loss": 1.5887,
|
|
"mean_token_accuracy": 0.6640773713588715,
|
|
"num_tokens": 1181487390.0,
|
|
"step": 12790
|
|
},
|
|
{
|
|
"entropy": 1.53359375,
|
|
"epoch": 1.7493508268416016,
|
|
"grad_norm": 0.18957329432683448,
|
|
"learning_rate": 6.46399887276314e-07,
|
|
"loss": 1.5335,
|
|
"mean_token_accuracy": 0.672640460729599,
|
|
"num_tokens": 1182365237.0,
|
|
"step": 12800
|
|
},
|
|
{
|
|
"entropy": 1.68203125,
|
|
"epoch": 1.7507175071750718,
|
|
"grad_norm": 0.19250748928329858,
|
|
"learning_rate": 6.428772720867973e-07,
|
|
"loss": 1.6807,
|
|
"mean_token_accuracy": 0.6493535876274109,
|
|
"num_tokens": 1183274421.0,
|
|
"step": 12810
|
|
},
|
|
{
|
|
"entropy": 1.5859375,
|
|
"epoch": 1.7520841875085418,
|
|
"grad_norm": 0.1701639029224005,
|
|
"learning_rate": 6.393546568972805e-07,
|
|
"loss": 1.5876,
|
|
"mean_token_accuracy": 0.6640045464038848,
|
|
"num_tokens": 1184164745.0,
|
|
"step": 12820
|
|
},
|
|
{
|
|
"entropy": 1.60703125,
|
|
"epoch": 1.7534508678420118,
|
|
"grad_norm": 0.1512151221201853,
|
|
"learning_rate": 6.358320417077639e-07,
|
|
"loss": 1.6063,
|
|
"mean_token_accuracy": 0.6615230202674866,
|
|
"num_tokens": 1185080229.0,
|
|
"step": 12830
|
|
},
|
|
{
|
|
"entropy": 1.57109375,
|
|
"epoch": 1.7548175481754817,
|
|
"grad_norm": 0.19251454246325592,
|
|
"learning_rate": 6.323094265182472e-07,
|
|
"loss": 1.5902,
|
|
"mean_token_accuracy": 0.6635743021965027,
|
|
"num_tokens": 1185957851.0,
|
|
"step": 12840
|
|
},
|
|
{
|
|
"entropy": 1.559375,
|
|
"epoch": 1.7561842285089517,
|
|
"grad_norm": 0.21600073157513214,
|
|
"learning_rate": 6.287868113287306e-07,
|
|
"loss": 1.5523,
|
|
"mean_token_accuracy": 0.6708834230899811,
|
|
"num_tokens": 1186910273.0,
|
|
"step": 12850
|
|
},
|
|
{
|
|
"entropy": 1.5859375,
|
|
"epoch": 1.7575509088424217,
|
|
"grad_norm": 0.16827132450787602,
|
|
"learning_rate": 6.252641961392138e-07,
|
|
"loss": 1.6004,
|
|
"mean_token_accuracy": 0.6620803415775299,
|
|
"num_tokens": 1187856813.0,
|
|
"step": 12860
|
|
},
|
|
{
|
|
"entropy": 1.52109375,
|
|
"epoch": 1.758917589175892,
|
|
"grad_norm": 0.18041158835896104,
|
|
"learning_rate": 6.217415809496971e-07,
|
|
"loss": 1.5399,
|
|
"mean_token_accuracy": 0.6722652971744537,
|
|
"num_tokens": 1188780811.0,
|
|
"step": 12870
|
|
},
|
|
{
|
|
"entropy": 1.58671875,
|
|
"epoch": 1.760284269509362,
|
|
"grad_norm": 0.2204069560330548,
|
|
"learning_rate": 6.182189657601804e-07,
|
|
"loss": 1.6173,
|
|
"mean_token_accuracy": 0.6605358004570008,
|
|
"num_tokens": 1189731780.0,
|
|
"step": 12880
|
|
},
|
|
{
|
|
"entropy": 1.5796875,
|
|
"epoch": 1.7616509498428319,
|
|
"grad_norm": 0.18655596621313106,
|
|
"learning_rate": 6.146963505706637e-07,
|
|
"loss": 1.5984,
|
|
"mean_token_accuracy": 0.6635606169700623,
|
|
"num_tokens": 1190654772.0,
|
|
"step": 12890
|
|
},
|
|
{
|
|
"entropy": 1.58046875,
|
|
"epoch": 1.7630176301763019,
|
|
"grad_norm": 0.16706143646885735,
|
|
"learning_rate": 6.11173735381147e-07,
|
|
"loss": 1.5773,
|
|
"mean_token_accuracy": 0.6675014853477478,
|
|
"num_tokens": 1191548398.0,
|
|
"step": 12900
|
|
},
|
|
{
|
|
"entropy": 1.6125,
|
|
"epoch": 1.7643843105097718,
|
|
"grad_norm": 0.2027141044477451,
|
|
"learning_rate": 6.076511201916304e-07,
|
|
"loss": 1.6401,
|
|
"mean_token_accuracy": 0.6583770155906677,
|
|
"num_tokens": 1192458043.0,
|
|
"step": 12910
|
|
},
|
|
{
|
|
"entropy": 1.60390625,
|
|
"epoch": 1.7657509908432418,
|
|
"grad_norm": 0.21500965547836562,
|
|
"learning_rate": 6.041285050021136e-07,
|
|
"loss": 1.6058,
|
|
"mean_token_accuracy": 0.6598479449748993,
|
|
"num_tokens": 1193315213.0,
|
|
"step": 12920
|
|
},
|
|
{
|
|
"entropy": 1.64140625,
|
|
"epoch": 1.7671176711767118,
|
|
"grad_norm": 0.20936132498609553,
|
|
"learning_rate": 6.006058898125969e-07,
|
|
"loss": 1.6331,
|
|
"mean_token_accuracy": 0.6577678859233856,
|
|
"num_tokens": 1194279440.0,
|
|
"step": 12930
|
|
},
|
|
{
|
|
"entropy": 1.63671875,
|
|
"epoch": 1.7684843515101818,
|
|
"grad_norm": 0.23692067655869045,
|
|
"learning_rate": 5.970832746230803e-07,
|
|
"loss": 1.6622,
|
|
"mean_token_accuracy": 0.6534433126449585,
|
|
"num_tokens": 1195278492.0,
|
|
"step": 12940
|
|
},
|
|
{
|
|
"entropy": 1.59140625,
|
|
"epoch": 1.7698510318436518,
|
|
"grad_norm": 0.1969012457534531,
|
|
"learning_rate": 5.935606594335636e-07,
|
|
"loss": 1.6067,
|
|
"mean_token_accuracy": 0.662521916627884,
|
|
"num_tokens": 1196254491.0,
|
|
"step": 12950
|
|
},
|
|
{
|
|
"entropy": 1.6140625,
|
|
"epoch": 1.7712177121771218,
|
|
"grad_norm": 0.3106090436757807,
|
|
"learning_rate": 5.900380442440468e-07,
|
|
"loss": 1.6277,
|
|
"mean_token_accuracy": 0.659892463684082,
|
|
"num_tokens": 1197174657.0,
|
|
"step": 12960
|
|
},
|
|
{
|
|
"entropy": 1.68671875,
|
|
"epoch": 1.7725843925105917,
|
|
"grad_norm": 0.36738431630718676,
|
|
"learning_rate": 5.8651542905453e-07,
|
|
"loss": 1.6936,
|
|
"mean_token_accuracy": 0.6484021365642547,
|
|
"num_tokens": 1198113827.0,
|
|
"step": 12970
|
|
},
|
|
{
|
|
"entropy": 1.56640625,
|
|
"epoch": 1.7739510728440617,
|
|
"grad_norm": 0.20952655233061362,
|
|
"learning_rate": 5.829928138650135e-07,
|
|
"loss": 1.5827,
|
|
"mean_token_accuracy": 0.667050689458847,
|
|
"num_tokens": 1199026537.0,
|
|
"step": 12980
|
|
},
|
|
{
|
|
"entropy": 1.58671875,
|
|
"epoch": 1.7753177531775317,
|
|
"grad_norm": 0.22725645676969966,
|
|
"learning_rate": 5.794701986754967e-07,
|
|
"loss": 1.593,
|
|
"mean_token_accuracy": 0.6624612987041474,
|
|
"num_tokens": 1199939942.0,
|
|
"step": 12990
|
|
},
|
|
{
|
|
"entropy": 1.58984375,
|
|
"epoch": 1.7766844335110017,
|
|
"grad_norm": 0.1722351675840386,
|
|
"learning_rate": 5.759475834859801e-07,
|
|
"loss": 1.6022,
|
|
"mean_token_accuracy": 0.6642412722110749,
|
|
"num_tokens": 1200865912.0,
|
|
"step": 13000
|
|
},
|
|
{
|
|
"entropy": 1.65234375,
|
|
"epoch": 1.7780511138444717,
|
|
"grad_norm": 0.19317793974981864,
|
|
"learning_rate": 5.724249682964633e-07,
|
|
"loss": 1.6726,
|
|
"mean_token_accuracy": 0.6519534707069397,
|
|
"num_tokens": 1201757639.0,
|
|
"step": 13010
|
|
},
|
|
{
|
|
"entropy": 1.61640625,
|
|
"epoch": 1.7794177941779417,
|
|
"grad_norm": 0.22022652445141744,
|
|
"learning_rate": 5.689023531069466e-07,
|
|
"loss": 1.6155,
|
|
"mean_token_accuracy": 0.6601132392883301,
|
|
"num_tokens": 1202729072.0,
|
|
"step": 13020
|
|
},
|
|
{
|
|
"entropy": 1.59609375,
|
|
"epoch": 1.7807844745114116,
|
|
"grad_norm": 0.2256630520266229,
|
|
"learning_rate": 5.6537973791743e-07,
|
|
"loss": 1.6222,
|
|
"mean_token_accuracy": 0.6619493067264557,
|
|
"num_tokens": 1203644151.0,
|
|
"step": 13030
|
|
},
|
|
{
|
|
"entropy": 1.525,
|
|
"epoch": 1.7821511548448816,
|
|
"grad_norm": 0.17980486957821132,
|
|
"learning_rate": 5.618571227279133e-07,
|
|
"loss": 1.5364,
|
|
"mean_token_accuracy": 0.6743068218231201,
|
|
"num_tokens": 1204596233.0,
|
|
"step": 13040
|
|
},
|
|
{
|
|
"entropy": 1.54296875,
|
|
"epoch": 1.7835178351783518,
|
|
"grad_norm": 0.18059282560814685,
|
|
"learning_rate": 5.583345075383965e-07,
|
|
"loss": 1.5505,
|
|
"mean_token_accuracy": 0.6707299530506134,
|
|
"num_tokens": 1205549129.0,
|
|
"step": 13050
|
|
},
|
|
{
|
|
"entropy": 1.55078125,
|
|
"epoch": 1.7848845155118218,
|
|
"grad_norm": 0.22903943442391242,
|
|
"learning_rate": 5.548118923488799e-07,
|
|
"loss": 1.5459,
|
|
"mean_token_accuracy": 0.6708059012889862,
|
|
"num_tokens": 1206428344.0,
|
|
"step": 13060
|
|
},
|
|
{
|
|
"entropy": 1.57578125,
|
|
"epoch": 1.7862511958452918,
|
|
"grad_norm": 0.16423385667353876,
|
|
"learning_rate": 5.512892771593632e-07,
|
|
"loss": 1.5796,
|
|
"mean_token_accuracy": 0.6677222192287445,
|
|
"num_tokens": 1207361595.0,
|
|
"step": 13070
|
|
},
|
|
{
|
|
"entropy": 1.61953125,
|
|
"epoch": 1.7876178761787618,
|
|
"grad_norm": 0.22946661498052673,
|
|
"learning_rate": 5.477666619698464e-07,
|
|
"loss": 1.634,
|
|
"mean_token_accuracy": 0.6595315754413604,
|
|
"num_tokens": 1208303084.0,
|
|
"step": 13080
|
|
},
|
|
{
|
|
"entropy": 1.57265625,
|
|
"epoch": 1.7889845565122318,
|
|
"grad_norm": 0.1885585857166039,
|
|
"learning_rate": 5.442440467803298e-07,
|
|
"loss": 1.5873,
|
|
"mean_token_accuracy": 0.6630272924900055,
|
|
"num_tokens": 1209214170.0,
|
|
"step": 13090
|
|
},
|
|
{
|
|
"entropy": 1.60859375,
|
|
"epoch": 1.7903512368457017,
|
|
"grad_norm": 0.19478369717896066,
|
|
"learning_rate": 5.407214315908131e-07,
|
|
"loss": 1.6077,
|
|
"mean_token_accuracy": 0.6602996408939361,
|
|
"num_tokens": 1210155792.0,
|
|
"step": 13100
|
|
},
|
|
{
|
|
"entropy": 1.65390625,
|
|
"epoch": 1.791717917179172,
|
|
"grad_norm": 0.26100891848785723,
|
|
"learning_rate": 5.371988164012964e-07,
|
|
"loss": 1.6695,
|
|
"mean_token_accuracy": 0.6500359952449799,
|
|
"num_tokens": 1211114415.0,
|
|
"step": 13110
|
|
},
|
|
{
|
|
"entropy": 1.52265625,
|
|
"epoch": 1.793084597512642,
|
|
"grad_norm": 0.20557656640461874,
|
|
"learning_rate": 5.336762012117797e-07,
|
|
"loss": 1.5401,
|
|
"mean_token_accuracy": 0.6733694970607758,
|
|
"num_tokens": 1212056236.0,
|
|
"step": 13120
|
|
},
|
|
{
|
|
"entropy": 1.6296875,
|
|
"epoch": 1.794451277846112,
|
|
"grad_norm": 0.219826598877685,
|
|
"learning_rate": 5.30153586022263e-07,
|
|
"loss": 1.6374,
|
|
"mean_token_accuracy": 0.6554373800754547,
|
|
"num_tokens": 1212998331.0,
|
|
"step": 13130
|
|
},
|
|
{
|
|
"entropy": 1.58671875,
|
|
"epoch": 1.795817958179582,
|
|
"grad_norm": 0.17821766122150473,
|
|
"learning_rate": 5.266309708327462e-07,
|
|
"loss": 1.5741,
|
|
"mean_token_accuracy": 0.6679556906223297,
|
|
"num_tokens": 1213912667.0,
|
|
"step": 13140
|
|
},
|
|
{
|
|
"entropy": 1.53046875,
|
|
"epoch": 1.7971846385130519,
|
|
"grad_norm": 0.19152408383383432,
|
|
"learning_rate": 5.231083556432296e-07,
|
|
"loss": 1.5509,
|
|
"mean_token_accuracy": 0.6736198723316192,
|
|
"num_tokens": 1214819461.0,
|
|
"step": 13150
|
|
},
|
|
{
|
|
"entropy": 1.5890625,
|
|
"epoch": 1.7985513188465219,
|
|
"grad_norm": 0.15473731571304475,
|
|
"learning_rate": 5.195857404537129e-07,
|
|
"loss": 1.6153,
|
|
"mean_token_accuracy": 0.6624296188354493,
|
|
"num_tokens": 1215734191.0,
|
|
"step": 13160
|
|
},
|
|
{
|
|
"entropy": 1.59921875,
|
|
"epoch": 1.7999179991799918,
|
|
"grad_norm": 0.2116611714228843,
|
|
"learning_rate": 5.160631252641961e-07,
|
|
"loss": 1.5759,
|
|
"mean_token_accuracy": 0.6674711287021637,
|
|
"num_tokens": 1216631169.0,
|
|
"step": 13170
|
|
},
|
|
{
|
|
"entropy": 1.5578125,
|
|
"epoch": 1.8012846795134618,
|
|
"grad_norm": 0.2567374386578471,
|
|
"learning_rate": 5.125405100746795e-07,
|
|
"loss": 1.5762,
|
|
"mean_token_accuracy": 0.6687758147716523,
|
|
"num_tokens": 1217582632.0,
|
|
"step": 13180
|
|
},
|
|
{
|
|
"entropy": 1.58984375,
|
|
"epoch": 1.8026513598469318,
|
|
"grad_norm": 0.16085578760521982,
|
|
"learning_rate": 5.090178948851628e-07,
|
|
"loss": 1.6083,
|
|
"mean_token_accuracy": 0.6577802777290345,
|
|
"num_tokens": 1218533416.0,
|
|
"step": 13190
|
|
},
|
|
{
|
|
"entropy": 1.671875,
|
|
"epoch": 1.8040180401804018,
|
|
"grad_norm": 0.19047475506831232,
|
|
"learning_rate": 5.054952796956461e-07,
|
|
"loss": 1.6676,
|
|
"mean_token_accuracy": 0.6492537438869477,
|
|
"num_tokens": 1219435180.0,
|
|
"step": 13200
|
|
},
|
|
{
|
|
"entropy": 1.534375,
|
|
"epoch": 1.8053847205138718,
|
|
"grad_norm": 0.18035958885301928,
|
|
"learning_rate": 5.019726645061294e-07,
|
|
"loss": 1.5345,
|
|
"mean_token_accuracy": 0.6757398068904876,
|
|
"num_tokens": 1220380712.0,
|
|
"step": 13210
|
|
},
|
|
{
|
|
"entropy": 1.640625,
|
|
"epoch": 1.8067514008473418,
|
|
"grad_norm": 0.18591641206991025,
|
|
"learning_rate": 4.984500493166127e-07,
|
|
"loss": 1.6438,
|
|
"mean_token_accuracy": 0.6583085358142853,
|
|
"num_tokens": 1221318857.0,
|
|
"step": 13220
|
|
},
|
|
{
|
|
"entropy": 1.521875,
|
|
"epoch": 1.8081180811808117,
|
|
"grad_norm": 0.1730561485715138,
|
|
"learning_rate": 4.94927434127096e-07,
|
|
"loss": 1.5318,
|
|
"mean_token_accuracy": 0.6745397627353669,
|
|
"num_tokens": 1222225082.0,
|
|
"step": 13230
|
|
},
|
|
{
|
|
"entropy": 1.55703125,
|
|
"epoch": 1.8094847615142817,
|
|
"grad_norm": 0.19403889605810323,
|
|
"learning_rate": 4.914048189375794e-07,
|
|
"loss": 1.5601,
|
|
"mean_token_accuracy": 0.6679836511611938,
|
|
"num_tokens": 1223137093.0,
|
|
"step": 13240
|
|
},
|
|
{
|
|
"entropy": 1.5890625,
|
|
"epoch": 1.8108514418477517,
|
|
"grad_norm": 0.16939491088178552,
|
|
"learning_rate": 4.878822037480626e-07,
|
|
"loss": 1.5809,
|
|
"mean_token_accuracy": 0.6658199191093445,
|
|
"num_tokens": 1223975428.0,
|
|
"step": 13250
|
|
},
|
|
{
|
|
"entropy": 1.58359375,
|
|
"epoch": 1.8122181221812217,
|
|
"grad_norm": 0.21206613450496112,
|
|
"learning_rate": 4.843595885585459e-07,
|
|
"loss": 1.5942,
|
|
"mean_token_accuracy": 0.6627681136131287,
|
|
"num_tokens": 1224874166.0,
|
|
"step": 13260
|
|
},
|
|
{
|
|
"entropy": 1.65,
|
|
"epoch": 1.8135848025146917,
|
|
"grad_norm": 0.27918481868616807,
|
|
"learning_rate": 4.808369733690292e-07,
|
|
"loss": 1.6733,
|
|
"mean_token_accuracy": 0.6511035263538361,
|
|
"num_tokens": 1225823221.0,
|
|
"step": 13270
|
|
},
|
|
{
|
|
"entropy": 1.678125,
|
|
"epoch": 1.8149514828481617,
|
|
"grad_norm": 0.18898884929849963,
|
|
"learning_rate": 4.773143581795125e-07,
|
|
"loss": 1.6853,
|
|
"mean_token_accuracy": 0.6488528430461884,
|
|
"num_tokens": 1226797421.0,
|
|
"step": 13280
|
|
},
|
|
{
|
|
"entropy": 1.52734375,
|
|
"epoch": 1.8163181631816319,
|
|
"grad_norm": 0.23986721179145223,
|
|
"learning_rate": 4.7379174298999583e-07,
|
|
"loss": 1.5307,
|
|
"mean_token_accuracy": 0.6732611179351806,
|
|
"num_tokens": 1227706857.0,
|
|
"step": 13290
|
|
},
|
|
{
|
|
"entropy": 1.625,
|
|
"epoch": 1.8176848435151018,
|
|
"grad_norm": 0.20712083705510143,
|
|
"learning_rate": 4.702691278004791e-07,
|
|
"loss": 1.6363,
|
|
"mean_token_accuracy": 0.6590192139148712,
|
|
"num_tokens": 1228668339.0,
|
|
"step": 13300
|
|
},
|
|
{
|
|
"entropy": 1.58125,
|
|
"epoch": 1.8190515238485718,
|
|
"grad_norm": 0.17907364381627633,
|
|
"learning_rate": 4.6674651261096245e-07,
|
|
"loss": 1.5919,
|
|
"mean_token_accuracy": 0.6631990253925324,
|
|
"num_tokens": 1229596401.0,
|
|
"step": 13310
|
|
},
|
|
{
|
|
"entropy": 1.540625,
|
|
"epoch": 1.8204182041820418,
|
|
"grad_norm": 0.21721784991005635,
|
|
"learning_rate": 4.632238974214457e-07,
|
|
"loss": 1.551,
|
|
"mean_token_accuracy": 0.6708670198917389,
|
|
"num_tokens": 1230526295.0,
|
|
"step": 13320
|
|
},
|
|
{
|
|
"entropy": 1.5703125,
|
|
"epoch": 1.8217848845155118,
|
|
"grad_norm": 0.22051970100143858,
|
|
"learning_rate": 4.5970128223192907e-07,
|
|
"loss": 1.5674,
|
|
"mean_token_accuracy": 0.6693876445293426,
|
|
"num_tokens": 1231452551.0,
|
|
"step": 13330
|
|
},
|
|
{
|
|
"entropy": 1.5234375,
|
|
"epoch": 1.8231515648489818,
|
|
"grad_norm": 0.2240452140234634,
|
|
"learning_rate": 4.561786670424123e-07,
|
|
"loss": 1.5296,
|
|
"mean_token_accuracy": 0.6715866029262543,
|
|
"num_tokens": 1232356562.0,
|
|
"step": 13340
|
|
},
|
|
{
|
|
"entropy": 1.61875,
|
|
"epoch": 1.824518245182452,
|
|
"grad_norm": 0.2122290045237586,
|
|
"learning_rate": 4.5265605185289563e-07,
|
|
"loss": 1.6457,
|
|
"mean_token_accuracy": 0.6575959682464599,
|
|
"num_tokens": 1233307570.0,
|
|
"step": 13350
|
|
},
|
|
{
|
|
"entropy": 1.55703125,
|
|
"epoch": 1.825884925515922,
|
|
"grad_norm": 0.2181221586344587,
|
|
"learning_rate": 4.491334366633789e-07,
|
|
"loss": 1.5488,
|
|
"mean_token_accuracy": 0.6720501482486725,
|
|
"num_tokens": 1234250828.0,
|
|
"step": 13360
|
|
},
|
|
{
|
|
"entropy": 1.55625,
|
|
"epoch": 1.827251605849392,
|
|
"grad_norm": 0.26884560316138806,
|
|
"learning_rate": 4.456108214738622e-07,
|
|
"loss": 1.5599,
|
|
"mean_token_accuracy": 0.6689077317714691,
|
|
"num_tokens": 1235179294.0,
|
|
"step": 13370
|
|
},
|
|
{
|
|
"entropy": 1.60546875,
|
|
"epoch": 1.828618286182862,
|
|
"grad_norm": 0.1790470759603954,
|
|
"learning_rate": 4.4208820628434553e-07,
|
|
"loss": 1.612,
|
|
"mean_token_accuracy": 0.6613450288772583,
|
|
"num_tokens": 1236101583.0,
|
|
"step": 13380
|
|
},
|
|
{
|
|
"entropy": 1.63203125,
|
|
"epoch": 1.829984966516332,
|
|
"grad_norm": 0.17839775527769813,
|
|
"learning_rate": 4.385655910948288e-07,
|
|
"loss": 1.6291,
|
|
"mean_token_accuracy": 0.6569505035877228,
|
|
"num_tokens": 1237028915.0,
|
|
"step": 13390
|
|
},
|
|
{
|
|
"entropy": 1.603125,
|
|
"epoch": 1.831351646849802,
|
|
"grad_norm": 0.30846593563182256,
|
|
"learning_rate": 4.3504297590531215e-07,
|
|
"loss": 1.6277,
|
|
"mean_token_accuracy": 0.6580079674720765,
|
|
"num_tokens": 1237966756.0,
|
|
"step": 13400
|
|
},
|
|
{
|
|
"entropy": 1.54453125,
|
|
"epoch": 1.8327183271832719,
|
|
"grad_norm": 0.23413818641799083,
|
|
"learning_rate": 4.3152036071579543e-07,
|
|
"loss": 1.5883,
|
|
"mean_token_accuracy": 0.6667121767997741,
|
|
"num_tokens": 1238915363.0,
|
|
"step": 13410
|
|
},
|
|
{
|
|
"entropy": 1.53125,
|
|
"epoch": 1.8340850075167419,
|
|
"grad_norm": 0.15588770283650633,
|
|
"learning_rate": 4.2799774552627877e-07,
|
|
"loss": 1.5457,
|
|
"mean_token_accuracy": 0.6711512565612793,
|
|
"num_tokens": 1239860741.0,
|
|
"step": 13420
|
|
},
|
|
{
|
|
"entropy": 1.56015625,
|
|
"epoch": 1.8354516878502118,
|
|
"grad_norm": 0.21211469560608173,
|
|
"learning_rate": 4.2447513033676205e-07,
|
|
"loss": 1.5713,
|
|
"mean_token_accuracy": 0.6675057291984559,
|
|
"num_tokens": 1240770863.0,
|
|
"step": 13430
|
|
},
|
|
{
|
|
"entropy": 1.56640625,
|
|
"epoch": 1.8368183681836818,
|
|
"grad_norm": 0.24647241690827157,
|
|
"learning_rate": 4.209525151472454e-07,
|
|
"loss": 1.5544,
|
|
"mean_token_accuracy": 0.6702269852161408,
|
|
"num_tokens": 1241672741.0,
|
|
"step": 13440
|
|
},
|
|
{
|
|
"entropy": 1.61484375,
|
|
"epoch": 1.8381850485171518,
|
|
"grad_norm": 0.2188330802835415,
|
|
"learning_rate": 4.1742989995772867e-07,
|
|
"loss": 1.6201,
|
|
"mean_token_accuracy": 0.656230491399765,
|
|
"num_tokens": 1242622289.0,
|
|
"step": 13450
|
|
},
|
|
{
|
|
"entropy": 1.653125,
|
|
"epoch": 1.8395517288506218,
|
|
"grad_norm": 0.174473569567192,
|
|
"learning_rate": 4.13907284768212e-07,
|
|
"loss": 1.6741,
|
|
"mean_token_accuracy": 0.6514579474925994,
|
|
"num_tokens": 1243514162.0,
|
|
"step": 13460
|
|
},
|
|
{
|
|
"entropy": 1.6171875,
|
|
"epoch": 1.8409184091840918,
|
|
"grad_norm": 0.20468397688310902,
|
|
"learning_rate": 4.1038466957869523e-07,
|
|
"loss": 1.6115,
|
|
"mean_token_accuracy": 0.6622476220130921,
|
|
"num_tokens": 1244412950.0,
|
|
"step": 13470
|
|
},
|
|
{
|
|
"entropy": 1.65234375,
|
|
"epoch": 1.8422850895175618,
|
|
"grad_norm": 0.1646700614967265,
|
|
"learning_rate": 4.068620543891785e-07,
|
|
"loss": 1.6552,
|
|
"mean_token_accuracy": 0.6570702195167542,
|
|
"num_tokens": 1245408793.0,
|
|
"step": 13480
|
|
},
|
|
{
|
|
"entropy": 1.60078125,
|
|
"epoch": 1.8436517698510317,
|
|
"grad_norm": 0.2042223428199208,
|
|
"learning_rate": 4.0333943919966185e-07,
|
|
"loss": 1.6215,
|
|
"mean_token_accuracy": 0.6604909241199494,
|
|
"num_tokens": 1246389239.0,
|
|
"step": 13490
|
|
},
|
|
{
|
|
"entropy": 1.51796875,
|
|
"epoch": 1.8450184501845017,
|
|
"grad_norm": 0.18564743201204542,
|
|
"learning_rate": 3.9981682401014513e-07,
|
|
"loss": 1.5186,
|
|
"mean_token_accuracy": 0.6765841007232666,
|
|
"num_tokens": 1247357460.0,
|
|
"step": 13500
|
|
},
|
|
{
|
|
"entropy": 1.5703125,
|
|
"epoch": 1.8463851305179717,
|
|
"grad_norm": 0.23163201635083255,
|
|
"learning_rate": 3.9629420882062847e-07,
|
|
"loss": 1.5789,
|
|
"mean_token_accuracy": 0.6670846462249755,
|
|
"num_tokens": 1248274845.0,
|
|
"step": 13510
|
|
},
|
|
{
|
|
"entropy": 1.6390625,
|
|
"epoch": 1.8477518108514417,
|
|
"grad_norm": 0.20655510156952622,
|
|
"learning_rate": 3.9277159363111175e-07,
|
|
"loss": 1.6215,
|
|
"mean_token_accuracy": 0.6613776803016662,
|
|
"num_tokens": 1249208955.0,
|
|
"step": 13520
|
|
},
|
|
{
|
|
"entropy": 1.603125,
|
|
"epoch": 1.849118491184912,
|
|
"grad_norm": 0.18043884234283863,
|
|
"learning_rate": 3.892489784415951e-07,
|
|
"loss": 1.6127,
|
|
"mean_token_accuracy": 0.6591443538665771,
|
|
"num_tokens": 1250118611.0,
|
|
"step": 13530
|
|
},
|
|
{
|
|
"entropy": 1.6046875,
|
|
"epoch": 1.8504851715183819,
|
|
"grad_norm": 0.1676509378090482,
|
|
"learning_rate": 3.8572636325207837e-07,
|
|
"loss": 1.6284,
|
|
"mean_token_accuracy": 0.6571697175502778,
|
|
"num_tokens": 1250971078.0,
|
|
"step": 13540
|
|
},
|
|
{
|
|
"entropy": 1.6671875,
|
|
"epoch": 1.8518518518518519,
|
|
"grad_norm": 0.18306344281809114,
|
|
"learning_rate": 3.822037480625617e-07,
|
|
"loss": 1.6688,
|
|
"mean_token_accuracy": 0.6529488086700439,
|
|
"num_tokens": 1251895588.0,
|
|
"step": 13550
|
|
},
|
|
{
|
|
"entropy": 1.52578125,
|
|
"epoch": 1.8532185321853218,
|
|
"grad_norm": 0.18337586757775345,
|
|
"learning_rate": 3.78681132873045e-07,
|
|
"loss": 1.5263,
|
|
"mean_token_accuracy": 0.673325389623642,
|
|
"num_tokens": 1252889387.0,
|
|
"step": 13560
|
|
},
|
|
{
|
|
"entropy": 1.65,
|
|
"epoch": 1.8545852125187918,
|
|
"grad_norm": 0.2423580794921671,
|
|
"learning_rate": 3.7515851768352827e-07,
|
|
"loss": 1.6382,
|
|
"mean_token_accuracy": 0.6563648223876953,
|
|
"num_tokens": 1253803906.0,
|
|
"step": 13570
|
|
},
|
|
{
|
|
"entropy": 1.6234375,
|
|
"epoch": 1.8559518928522618,
|
|
"grad_norm": 0.2285896704007995,
|
|
"learning_rate": 3.716359024940116e-07,
|
|
"loss": 1.646,
|
|
"mean_token_accuracy": 0.653173816204071,
|
|
"num_tokens": 1254691171.0,
|
|
"step": 13580
|
|
},
|
|
{
|
|
"entropy": 1.63046875,
|
|
"epoch": 1.857318573185732,
|
|
"grad_norm": 0.21231356667193776,
|
|
"learning_rate": 3.6811328730449484e-07,
|
|
"loss": 1.6304,
|
|
"mean_token_accuracy": 0.6568703889846802,
|
|
"num_tokens": 1255630525.0,
|
|
"step": 13590
|
|
},
|
|
{
|
|
"entropy": 1.58515625,
|
|
"epoch": 1.858685253519202,
|
|
"grad_norm": 0.17456972660826545,
|
|
"learning_rate": 3.645906721149782e-07,
|
|
"loss": 1.5871,
|
|
"mean_token_accuracy": 0.664020711183548,
|
|
"num_tokens": 1256499117.0,
|
|
"step": 13600
|
|
},
|
|
{
|
|
"entropy": 1.62734375,
|
|
"epoch": 1.860051933852672,
|
|
"grad_norm": 0.3684696650591512,
|
|
"learning_rate": 3.6106805692546145e-07,
|
|
"loss": 1.6453,
|
|
"mean_token_accuracy": 0.6549951910972596,
|
|
"num_tokens": 1257465470.0,
|
|
"step": 13610
|
|
},
|
|
{
|
|
"entropy": 1.65546875,
|
|
"epoch": 1.861418614186142,
|
|
"grad_norm": 0.2085888018955089,
|
|
"learning_rate": 3.575454417359448e-07,
|
|
"loss": 1.6564,
|
|
"mean_token_accuracy": 0.6528635859489441,
|
|
"num_tokens": 1258365809.0,
|
|
"step": 13620
|
|
},
|
|
{
|
|
"entropy": 1.58671875,
|
|
"epoch": 1.862785294519612,
|
|
"grad_norm": 0.18350406836816352,
|
|
"learning_rate": 3.5402282654642807e-07,
|
|
"loss": 1.5749,
|
|
"mean_token_accuracy": 0.6671357393264771,
|
|
"num_tokens": 1259268861.0,
|
|
"step": 13630
|
|
},
|
|
{
|
|
"entropy": 1.6171875,
|
|
"epoch": 1.864151974853082,
|
|
"grad_norm": 0.2071140473643246,
|
|
"learning_rate": 3.505002113569114e-07,
|
|
"loss": 1.6282,
|
|
"mean_token_accuracy": 0.6569033980369567,
|
|
"num_tokens": 1260212246.0,
|
|
"step": 13640
|
|
},
|
|
{
|
|
"entropy": 1.59453125,
|
|
"epoch": 1.865518655186552,
|
|
"grad_norm": 0.29469319277179656,
|
|
"learning_rate": 3.469775961673947e-07,
|
|
"loss": 1.6248,
|
|
"mean_token_accuracy": 0.660232937335968,
|
|
"num_tokens": 1261098884.0,
|
|
"step": 13650
|
|
},
|
|
{
|
|
"entropy": 1.534375,
|
|
"epoch": 1.866885335520022,
|
|
"grad_norm": 0.20375744000902432,
|
|
"learning_rate": 3.43454980977878e-07,
|
|
"loss": 1.5455,
|
|
"mean_token_accuracy": 0.6755730628967285,
|
|
"num_tokens": 1262006418.0,
|
|
"step": 13660
|
|
},
|
|
{
|
|
"entropy": 1.63203125,
|
|
"epoch": 1.8682520158534919,
|
|
"grad_norm": 0.25589769405092333,
|
|
"learning_rate": 3.399323657883613e-07,
|
|
"loss": 1.6254,
|
|
"mean_token_accuracy": 0.6572736322879791,
|
|
"num_tokens": 1262919407.0,
|
|
"step": 13670
|
|
},
|
|
{
|
|
"entropy": 1.6390625,
|
|
"epoch": 1.8696186961869619,
|
|
"grad_norm": 0.19189878633225557,
|
|
"learning_rate": 3.364097505988446e-07,
|
|
"loss": 1.6508,
|
|
"mean_token_accuracy": 0.6541313767433167,
|
|
"num_tokens": 1263876950.0,
|
|
"step": 13680
|
|
},
|
|
{
|
|
"entropy": 1.64609375,
|
|
"epoch": 1.8709853765204318,
|
|
"grad_norm": 0.220054851130556,
|
|
"learning_rate": 3.328871354093279e-07,
|
|
"loss": 1.6596,
|
|
"mean_token_accuracy": 0.6516327142715455,
|
|
"num_tokens": 1264833722.0,
|
|
"step": 13690
|
|
},
|
|
{
|
|
"entropy": 1.59453125,
|
|
"epoch": 1.8723520568539018,
|
|
"grad_norm": 0.21574694747707096,
|
|
"learning_rate": 3.293645202198112e-07,
|
|
"loss": 1.6009,
|
|
"mean_token_accuracy": 0.6608662784099579,
|
|
"num_tokens": 1265766741.0,
|
|
"step": 13700
|
|
},
|
|
{
|
|
"entropy": 1.6015625,
|
|
"epoch": 1.8737187371873718,
|
|
"grad_norm": 0.1877619101767416,
|
|
"learning_rate": 3.2584190503029454e-07,
|
|
"loss": 1.6099,
|
|
"mean_token_accuracy": 0.660447895526886,
|
|
"num_tokens": 1266654049.0,
|
|
"step": 13710
|
|
},
|
|
{
|
|
"entropy": 1.56015625,
|
|
"epoch": 1.8750854175208418,
|
|
"grad_norm": 0.26279291020009216,
|
|
"learning_rate": 3.223192898407778e-07,
|
|
"loss": 1.5786,
|
|
"mean_token_accuracy": 0.6674093425273895,
|
|
"num_tokens": 1267587731.0,
|
|
"step": 13720
|
|
},
|
|
{
|
|
"entropy": 1.59375,
|
|
"epoch": 1.8764520978543118,
|
|
"grad_norm": 0.43859352213674524,
|
|
"learning_rate": 3.1879667465126116e-07,
|
|
"loss": 1.6064,
|
|
"mean_token_accuracy": 0.6660899519920349,
|
|
"num_tokens": 1268527378.0,
|
|
"step": 13730
|
|
},
|
|
{
|
|
"entropy": 1.60234375,
|
|
"epoch": 1.8778187781877818,
|
|
"grad_norm": 0.19954105102960068,
|
|
"learning_rate": 3.152740594617444e-07,
|
|
"loss": 1.6144,
|
|
"mean_token_accuracy": 0.663240784406662,
|
|
"num_tokens": 1269507909.0,
|
|
"step": 13740
|
|
},
|
|
{
|
|
"entropy": 1.62109375,
|
|
"epoch": 1.8791854585212517,
|
|
"grad_norm": 0.20051495861511984,
|
|
"learning_rate": 3.117514442722277e-07,
|
|
"loss": 1.6185,
|
|
"mean_token_accuracy": 0.6626424193382263,
|
|
"num_tokens": 1270461305.0,
|
|
"step": 13750
|
|
},
|
|
{
|
|
"entropy": 1.5078125,
|
|
"epoch": 1.8805521388547217,
|
|
"grad_norm": 0.19974259970366962,
|
|
"learning_rate": 3.08228829082711e-07,
|
|
"loss": 1.5185,
|
|
"mean_token_accuracy": 0.6746940791606904,
|
|
"num_tokens": 1271430824.0,
|
|
"step": 13760
|
|
},
|
|
{
|
|
"entropy": 1.6125,
|
|
"epoch": 1.881918819188192,
|
|
"grad_norm": 0.1874118284612634,
|
|
"learning_rate": 3.0470621389319434e-07,
|
|
"loss": 1.6128,
|
|
"mean_token_accuracy": 0.6615552186965943,
|
|
"num_tokens": 1272332978.0,
|
|
"step": 13770
|
|
},
|
|
{
|
|
"entropy": 1.54921875,
|
|
"epoch": 1.883285499521662,
|
|
"grad_norm": 0.16718472535621864,
|
|
"learning_rate": 3.0118359870367763e-07,
|
|
"loss": 1.5648,
|
|
"mean_token_accuracy": 0.6703563153743743,
|
|
"num_tokens": 1273213909.0,
|
|
"step": 13780
|
|
},
|
|
{
|
|
"entropy": 1.57265625,
|
|
"epoch": 1.884652179855132,
|
|
"grad_norm": 0.19848501613922304,
|
|
"learning_rate": 2.9766098351416096e-07,
|
|
"loss": 1.5596,
|
|
"mean_token_accuracy": 0.6706547796726227,
|
|
"num_tokens": 1274148256.0,
|
|
"step": 13790
|
|
},
|
|
{
|
|
"entropy": 1.609375,
|
|
"epoch": 1.8860188601886019,
|
|
"grad_norm": 0.16311149556567267,
|
|
"learning_rate": 2.9413836832464424e-07,
|
|
"loss": 1.6287,
|
|
"mean_token_accuracy": 0.6580433249473572,
|
|
"num_tokens": 1275110674.0,
|
|
"step": 13800
|
|
},
|
|
{
|
|
"entropy": 1.6328125,
|
|
"epoch": 1.8873855405220719,
|
|
"grad_norm": 0.18108201022906265,
|
|
"learning_rate": 2.906157531351276e-07,
|
|
"loss": 1.6454,
|
|
"mean_token_accuracy": 0.6575561940670014,
|
|
"num_tokens": 1276044667.0,
|
|
"step": 13810
|
|
},
|
|
{
|
|
"entropy": 1.60546875,
|
|
"epoch": 1.8887522208555418,
|
|
"grad_norm": 0.20580243386759592,
|
|
"learning_rate": 2.8709313794561086e-07,
|
|
"loss": 1.6045,
|
|
"mean_token_accuracy": 0.6645366847515106,
|
|
"num_tokens": 1276996454.0,
|
|
"step": 13820
|
|
},
|
|
{
|
|
"entropy": 1.609375,
|
|
"epoch": 1.890118901189012,
|
|
"grad_norm": 0.17091470155458993,
|
|
"learning_rate": 2.8357052275609415e-07,
|
|
"loss": 1.6112,
|
|
"mean_token_accuracy": 0.6604999423027038,
|
|
"num_tokens": 1277875012.0,
|
|
"step": 13830
|
|
},
|
|
{
|
|
"entropy": 1.68515625,
|
|
"epoch": 1.891485581522482,
|
|
"grad_norm": 0.2380226824336483,
|
|
"learning_rate": 2.8004790756657743e-07,
|
|
"loss": 1.6994,
|
|
"mean_token_accuracy": 0.6512650489807129,
|
|
"num_tokens": 1278801045.0,
|
|
"step": 13840
|
|
},
|
|
{
|
|
"entropy": 1.60546875,
|
|
"epoch": 1.892852261855952,
|
|
"grad_norm": 0.20696331398673376,
|
|
"learning_rate": 2.7652529237706076e-07,
|
|
"loss": 1.5949,
|
|
"mean_token_accuracy": 0.6643336594104767,
|
|
"num_tokens": 1279726793.0,
|
|
"step": 13850
|
|
},
|
|
{
|
|
"entropy": 1.53046875,
|
|
"epoch": 1.894218942189422,
|
|
"grad_norm": 0.20372715663290789,
|
|
"learning_rate": 2.7300267718754405e-07,
|
|
"loss": 1.5382,
|
|
"mean_token_accuracy": 0.6729359567165375,
|
|
"num_tokens": 1280621790.0,
|
|
"step": 13860
|
|
},
|
|
{
|
|
"entropy": 1.49765625,
|
|
"epoch": 1.895585622522892,
|
|
"grad_norm": 0.1778382304169913,
|
|
"learning_rate": 2.694800619980274e-07,
|
|
"loss": 1.5126,
|
|
"mean_token_accuracy": 0.6790998935699463,
|
|
"num_tokens": 1281550903.0,
|
|
"step": 13870
|
|
},
|
|
{
|
|
"entropy": 1.56953125,
|
|
"epoch": 1.896952302856362,
|
|
"grad_norm": 0.16614691006515772,
|
|
"learning_rate": 2.6595744680851066e-07,
|
|
"loss": 1.5782,
|
|
"mean_token_accuracy": 0.6650060594081879,
|
|
"num_tokens": 1282507927.0,
|
|
"step": 13880
|
|
},
|
|
{
|
|
"entropy": 1.61796875,
|
|
"epoch": 1.898318983189832,
|
|
"grad_norm": 0.18056240529269157,
|
|
"learning_rate": 2.6243483161899395e-07,
|
|
"loss": 1.6476,
|
|
"mean_token_accuracy": 0.6561441123485565,
|
|
"num_tokens": 1283413609.0,
|
|
"step": 13890
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 1.899685663523302,
|
|
"grad_norm": 0.1780168509962306,
|
|
"learning_rate": 2.589122164294773e-07,
|
|
"loss": 1.6349,
|
|
"mean_token_accuracy": 0.6577652752399444,
|
|
"num_tokens": 1284385423.0,
|
|
"step": 13900
|
|
},
|
|
{
|
|
"entropy": 1.6203125,
|
|
"epoch": 1.901052343856772,
|
|
"grad_norm": 0.22825142251670708,
|
|
"learning_rate": 2.5538960123996056e-07,
|
|
"loss": 1.6474,
|
|
"mean_token_accuracy": 0.6571348905563354,
|
|
"num_tokens": 1285302893.0,
|
|
"step": 13910
|
|
},
|
|
{
|
|
"entropy": 1.58046875,
|
|
"epoch": 1.902419024190242,
|
|
"grad_norm": 0.20123285835716856,
|
|
"learning_rate": 2.518669860504439e-07,
|
|
"loss": 1.5924,
|
|
"mean_token_accuracy": 0.6602917015552521,
|
|
"num_tokens": 1286175430.0,
|
|
"step": 13920
|
|
},
|
|
{
|
|
"entropy": 1.609375,
|
|
"epoch": 1.9037857045237119,
|
|
"grad_norm": 0.17693987520934487,
|
|
"learning_rate": 2.483443708609272e-07,
|
|
"loss": 1.6204,
|
|
"mean_token_accuracy": 0.6570366322994232,
|
|
"num_tokens": 1287080554.0,
|
|
"step": 13930
|
|
},
|
|
{
|
|
"entropy": 1.5421875,
|
|
"epoch": 1.9051523848571819,
|
|
"grad_norm": 0.17649384921953787,
|
|
"learning_rate": 2.4482175567141046e-07,
|
|
"loss": 1.5518,
|
|
"mean_token_accuracy": 0.6690324127674103,
|
|
"num_tokens": 1288020909.0,
|
|
"step": 13940
|
|
},
|
|
{
|
|
"entropy": 1.659375,
|
|
"epoch": 1.9065190651906518,
|
|
"grad_norm": 0.19959281661652564,
|
|
"learning_rate": 2.4129914048189375e-07,
|
|
"loss": 1.6679,
|
|
"mean_token_accuracy": 0.6511024951934814,
|
|
"num_tokens": 1288988728.0,
|
|
"step": 13950
|
|
},
|
|
{
|
|
"entropy": 1.61953125,
|
|
"epoch": 1.9078857455241218,
|
|
"grad_norm": 0.19091521502532116,
|
|
"learning_rate": 2.3777652529237708e-07,
|
|
"loss": 1.6489,
|
|
"mean_token_accuracy": 0.6576599836349487,
|
|
"num_tokens": 1289927004.0,
|
|
"step": 13960
|
|
},
|
|
{
|
|
"entropy": 1.6296875,
|
|
"epoch": 1.9092524258575918,
|
|
"grad_norm": 0.1923662315801649,
|
|
"learning_rate": 2.342539101028604e-07,
|
|
"loss": 1.6537,
|
|
"mean_token_accuracy": 0.6565833926200867,
|
|
"num_tokens": 1290895048.0,
|
|
"step": 13970
|
|
},
|
|
{
|
|
"entropy": 1.58671875,
|
|
"epoch": 1.9106191061910618,
|
|
"grad_norm": 0.22187525002066852,
|
|
"learning_rate": 2.307312949133437e-07,
|
|
"loss": 1.5933,
|
|
"mean_token_accuracy": 0.6626911044120789,
|
|
"num_tokens": 1291803545.0,
|
|
"step": 13980
|
|
},
|
|
{
|
|
"entropy": 1.5515625,
|
|
"epoch": 1.9119857865245318,
|
|
"grad_norm": 0.20453831153166022,
|
|
"learning_rate": 2.2720867972382698e-07,
|
|
"loss": 1.5655,
|
|
"mean_token_accuracy": 0.668301236629486,
|
|
"num_tokens": 1292722877.0,
|
|
"step": 13990
|
|
},
|
|
{
|
|
"entropy": 1.61875,
|
|
"epoch": 1.9133524668580018,
|
|
"grad_norm": 0.16290253730134585,
|
|
"learning_rate": 2.236860645343103e-07,
|
|
"loss": 1.6006,
|
|
"mean_token_accuracy": 0.6626541554927826,
|
|
"num_tokens": 1293621674.0,
|
|
"step": 14000
|
|
},
|
|
{
|
|
"entropy": 1.57265625,
|
|
"epoch": 1.914719147191472,
|
|
"grad_norm": 0.2504158069436708,
|
|
"learning_rate": 2.201634493447936e-07,
|
|
"loss": 1.579,
|
|
"mean_token_accuracy": 0.6658330678939819,
|
|
"num_tokens": 1294515623.0,
|
|
"step": 14010
|
|
},
|
|
{
|
|
"entropy": 1.5609375,
|
|
"epoch": 1.916085827524942,
|
|
"grad_norm": 0.173289034889824,
|
|
"learning_rate": 2.166408341552769e-07,
|
|
"loss": 1.5828,
|
|
"mean_token_accuracy": 0.6647621154785156,
|
|
"num_tokens": 1295417785.0,
|
|
"step": 14020
|
|
},
|
|
{
|
|
"entropy": 1.6015625,
|
|
"epoch": 1.917452507858412,
|
|
"grad_norm": 0.1865315591546125,
|
|
"learning_rate": 2.131182189657602e-07,
|
|
"loss": 1.5984,
|
|
"mean_token_accuracy": 0.6636813104152679,
|
|
"num_tokens": 1296331237.0,
|
|
"step": 14030
|
|
},
|
|
{
|
|
"entropy": 1.58828125,
|
|
"epoch": 1.918819188191882,
|
|
"grad_norm": 0.17724532075831428,
|
|
"learning_rate": 2.095956037762435e-07,
|
|
"loss": 1.6077,
|
|
"mean_token_accuracy": 0.6597939550876617,
|
|
"num_tokens": 1297254031.0,
|
|
"step": 14040
|
|
},
|
|
{
|
|
"entropy": 1.6015625,
|
|
"epoch": 1.920185868525352,
|
|
"grad_norm": 0.20014189138151955,
|
|
"learning_rate": 2.0607298858672678e-07,
|
|
"loss": 1.6168,
|
|
"mean_token_accuracy": 0.6638847529888153,
|
|
"num_tokens": 1298186927.0,
|
|
"step": 14050
|
|
},
|
|
{
|
|
"entropy": 1.5546875,
|
|
"epoch": 1.9215525488588219,
|
|
"grad_norm": 0.19500709782722564,
|
|
"learning_rate": 2.025503733972101e-07,
|
|
"loss": 1.5763,
|
|
"mean_token_accuracy": 0.6684407532215119,
|
|
"num_tokens": 1299121298.0,
|
|
"step": 14060
|
|
},
|
|
{
|
|
"entropy": 1.67734375,
|
|
"epoch": 1.922919229192292,
|
|
"grad_norm": 0.21102415304965325,
|
|
"learning_rate": 1.990277582076934e-07,
|
|
"loss": 1.6846,
|
|
"mean_token_accuracy": 0.6504097223281861,
|
|
"num_tokens": 1300078906.0,
|
|
"step": 14070
|
|
},
|
|
{
|
|
"entropy": 1.559375,
|
|
"epoch": 1.924285909525762,
|
|
"grad_norm": 0.20007360684967018,
|
|
"learning_rate": 1.955051430181767e-07,
|
|
"loss": 1.5473,
|
|
"mean_token_accuracy": 0.667876273393631,
|
|
"num_tokens": 1300947261.0,
|
|
"step": 14080
|
|
},
|
|
{
|
|
"entropy": 1.5359375,
|
|
"epoch": 1.925652589859232,
|
|
"grad_norm": 0.17220930842739238,
|
|
"learning_rate": 1.9198252782866002e-07,
|
|
"loss": 1.5359,
|
|
"mean_token_accuracy": 0.6744569003582,
|
|
"num_tokens": 1301870071.0,
|
|
"step": 14090
|
|
},
|
|
{
|
|
"entropy": 1.56640625,
|
|
"epoch": 1.927019270192702,
|
|
"grad_norm": 0.15586182663995404,
|
|
"learning_rate": 1.8845991263914333e-07,
|
|
"loss": 1.579,
|
|
"mean_token_accuracy": 0.6664115786552429,
|
|
"num_tokens": 1302800684.0,
|
|
"step": 14100
|
|
},
|
|
{
|
|
"entropy": 1.60703125,
|
|
"epoch": 1.928385950526172,
|
|
"grad_norm": 0.16203502293085192,
|
|
"learning_rate": 1.8493729744962664e-07,
|
|
"loss": 1.601,
|
|
"mean_token_accuracy": 0.6625315606594085,
|
|
"num_tokens": 1303648903.0,
|
|
"step": 14110
|
|
},
|
|
{
|
|
"entropy": 1.609375,
|
|
"epoch": 1.929752630859642,
|
|
"grad_norm": 0.20134685235316593,
|
|
"learning_rate": 1.8141468226010995e-07,
|
|
"loss": 1.6282,
|
|
"mean_token_accuracy": 0.6591674745082855,
|
|
"num_tokens": 1304591020.0,
|
|
"step": 14120
|
|
},
|
|
{
|
|
"entropy": 1.55703125,
|
|
"epoch": 1.931119311193112,
|
|
"grad_norm": 0.18828295327028655,
|
|
"learning_rate": 1.778920670705932e-07,
|
|
"loss": 1.5494,
|
|
"mean_token_accuracy": 0.6720081806182862,
|
|
"num_tokens": 1305519698.0,
|
|
"step": 14130
|
|
},
|
|
{
|
|
"entropy": 1.58828125,
|
|
"epoch": 1.932485991526582,
|
|
"grad_norm": 0.15206595646303409,
|
|
"learning_rate": 1.743694518810765e-07,
|
|
"loss": 1.5969,
|
|
"mean_token_accuracy": 0.6623228967189789,
|
|
"num_tokens": 1306464150.0,
|
|
"step": 14140
|
|
},
|
|
{
|
|
"entropy": 1.60859375,
|
|
"epoch": 1.933852671860052,
|
|
"grad_norm": 0.22060934143040256,
|
|
"learning_rate": 1.7084683669155982e-07,
|
|
"loss": 1.5919,
|
|
"mean_token_accuracy": 0.6649036169052124,
|
|
"num_tokens": 1307399985.0,
|
|
"step": 14150
|
|
},
|
|
{
|
|
"entropy": 1.6234375,
|
|
"epoch": 1.935219352193522,
|
|
"grad_norm": 0.2712591335590226,
|
|
"learning_rate": 1.6732422150204313e-07,
|
|
"loss": 1.6196,
|
|
"mean_token_accuracy": 0.6600660026073456,
|
|
"num_tokens": 1308298839.0,
|
|
"step": 14160
|
|
},
|
|
{
|
|
"entropy": 1.5796875,
|
|
"epoch": 1.936586032526992,
|
|
"grad_norm": 0.22782972462015547,
|
|
"learning_rate": 1.6380160631252644e-07,
|
|
"loss": 1.5853,
|
|
"mean_token_accuracy": 0.6641750752925872,
|
|
"num_tokens": 1309268385.0,
|
|
"step": 14170
|
|
},
|
|
{
|
|
"entropy": 1.53828125,
|
|
"epoch": 1.937952712860462,
|
|
"grad_norm": 0.1880108459375757,
|
|
"learning_rate": 1.6027899112300975e-07,
|
|
"loss": 1.5331,
|
|
"mean_token_accuracy": 0.6735607028007508,
|
|
"num_tokens": 1310180662.0,
|
|
"step": 14180
|
|
},
|
|
{
|
|
"entropy": 1.565625,
|
|
"epoch": 1.9393193931939319,
|
|
"grad_norm": 0.19907403609225133,
|
|
"learning_rate": 1.5675637593349303e-07,
|
|
"loss": 1.5597,
|
|
"mean_token_accuracy": 0.6715898513793945,
|
|
"num_tokens": 1311138156.0,
|
|
"step": 14190
|
|
},
|
|
{
|
|
"entropy": 1.59609375,
|
|
"epoch": 1.9406860735274019,
|
|
"grad_norm": 0.2927753854262742,
|
|
"learning_rate": 1.5323376074397634e-07,
|
|
"loss": 1.5878,
|
|
"mean_token_accuracy": 0.6641757309436798,
|
|
"num_tokens": 1312085366.0,
|
|
"step": 14200
|
|
},
|
|
{
|
|
"entropy": 1.68984375,
|
|
"epoch": 1.9420527538608718,
|
|
"grad_norm": 0.1967243310210184,
|
|
"learning_rate": 1.4971114555445965e-07,
|
|
"loss": 1.7036,
|
|
"mean_token_accuracy": 0.6439588487148284,
|
|
"num_tokens": 1313017864.0,
|
|
"step": 14210
|
|
},
|
|
{
|
|
"entropy": 1.59609375,
|
|
"epoch": 1.9434194341943418,
|
|
"grad_norm": 0.20988964970242935,
|
|
"learning_rate": 1.4618853036494293e-07,
|
|
"loss": 1.6268,
|
|
"mean_token_accuracy": 0.658420592546463,
|
|
"num_tokens": 1313917714.0,
|
|
"step": 14220
|
|
},
|
|
{
|
|
"entropy": 1.61015625,
|
|
"epoch": 1.9447861145278118,
|
|
"grad_norm": 0.1881608851731163,
|
|
"learning_rate": 1.4266591517542624e-07,
|
|
"loss": 1.6204,
|
|
"mean_token_accuracy": 0.6608135640621186,
|
|
"num_tokens": 1314814153.0,
|
|
"step": 14230
|
|
},
|
|
{
|
|
"entropy": 1.5734375,
|
|
"epoch": 1.9461527948612818,
|
|
"grad_norm": 0.1700057522949756,
|
|
"learning_rate": 1.3914329998590955e-07,
|
|
"loss": 1.568,
|
|
"mean_token_accuracy": 0.6688362717628479,
|
|
"num_tokens": 1315678497.0,
|
|
"step": 14240
|
|
},
|
|
{
|
|
"entropy": 1.57265625,
|
|
"epoch": 1.947519475194752,
|
|
"grad_norm": 0.23715229105809987,
|
|
"learning_rate": 1.3562068479639286e-07,
|
|
"loss": 1.5805,
|
|
"mean_token_accuracy": 0.6661898553371429,
|
|
"num_tokens": 1316642694.0,
|
|
"step": 14250
|
|
},
|
|
{
|
|
"entropy": 1.53125,
|
|
"epoch": 1.948886155528222,
|
|
"grad_norm": 0.2193501033218001,
|
|
"learning_rate": 1.3209806960687614e-07,
|
|
"loss": 1.5324,
|
|
"mean_token_accuracy": 0.6730566084384918,
|
|
"num_tokens": 1317528943.0,
|
|
"step": 14260
|
|
},
|
|
{
|
|
"entropy": 1.57890625,
|
|
"epoch": 1.950252835861692,
|
|
"grad_norm": 0.2508442540933123,
|
|
"learning_rate": 1.2857545441735945e-07,
|
|
"loss": 1.5927,
|
|
"mean_token_accuracy": 0.6635078370571137,
|
|
"num_tokens": 1318438623.0,
|
|
"step": 14270
|
|
},
|
|
{
|
|
"entropy": 1.62109375,
|
|
"epoch": 1.951619516195162,
|
|
"grad_norm": 0.21422300122921564,
|
|
"learning_rate": 1.2505283922784276e-07,
|
|
"loss": 1.6215,
|
|
"mean_token_accuracy": 0.6621395528316498,
|
|
"num_tokens": 1319396453.0,
|
|
"step": 14280
|
|
},
|
|
{
|
|
"entropy": 1.54453125,
|
|
"epoch": 1.952986196528632,
|
|
"grad_norm": 0.16996452253159886,
|
|
"learning_rate": 1.2153022403832607e-07,
|
|
"loss": 1.5661,
|
|
"mean_token_accuracy": 0.6673184752464294,
|
|
"num_tokens": 1320331782.0,
|
|
"step": 14290
|
|
},
|
|
{
|
|
"entropy": 1.6078125,
|
|
"epoch": 1.954352876862102,
|
|
"grad_norm": 0.19462969692307205,
|
|
"learning_rate": 1.1800760884880937e-07,
|
|
"loss": 1.6189,
|
|
"mean_token_accuracy": 0.6583600342273712,
|
|
"num_tokens": 1321296681.0,
|
|
"step": 14300
|
|
},
|
|
{
|
|
"entropy": 1.578125,
|
|
"epoch": 1.9557195571955721,
|
|
"grad_norm": 0.21687093416562161,
|
|
"learning_rate": 1.1448499365929266e-07,
|
|
"loss": 1.5852,
|
|
"mean_token_accuracy": 0.6634842216968536,
|
|
"num_tokens": 1322226830.0,
|
|
"step": 14310
|
|
},
|
|
{
|
|
"entropy": 1.61875,
|
|
"epoch": 1.957086237529042,
|
|
"grad_norm": 0.18840438857225728,
|
|
"learning_rate": 1.1096237846977597e-07,
|
|
"loss": 1.621,
|
|
"mean_token_accuracy": 0.6604576289653779,
|
|
"num_tokens": 1323142549.0,
|
|
"step": 14320
|
|
},
|
|
{
|
|
"entropy": 1.62421875,
|
|
"epoch": 1.958452917862512,
|
|
"grad_norm": 0.258106491733413,
|
|
"learning_rate": 1.0743976328025928e-07,
|
|
"loss": 1.6236,
|
|
"mean_token_accuracy": 0.6576037228107452,
|
|
"num_tokens": 1324071845.0,
|
|
"step": 14330
|
|
},
|
|
{
|
|
"entropy": 1.56171875,
|
|
"epoch": 1.959819598195982,
|
|
"grad_norm": 0.16902066136167695,
|
|
"learning_rate": 1.0391714809074258e-07,
|
|
"loss": 1.5634,
|
|
"mean_token_accuracy": 0.6699787378311157,
|
|
"num_tokens": 1325008686.0,
|
|
"step": 14340
|
|
},
|
|
{
|
|
"entropy": 1.56875,
|
|
"epoch": 1.961186278529452,
|
|
"grad_norm": 0.23952617446980967,
|
|
"learning_rate": 1.0039453290122588e-07,
|
|
"loss": 1.5896,
|
|
"mean_token_accuracy": 0.6626493990421295,
|
|
"num_tokens": 1325919816.0,
|
|
"step": 14350
|
|
},
|
|
{
|
|
"entropy": 1.5109375,
|
|
"epoch": 1.962552958862922,
|
|
"grad_norm": 0.19474531855125474,
|
|
"learning_rate": 9.687191771170918e-08,
|
|
"loss": 1.5155,
|
|
"mean_token_accuracy": 0.6752154707908631,
|
|
"num_tokens": 1326822653.0,
|
|
"step": 14360
|
|
},
|
|
{
|
|
"entropy": 1.6265625,
|
|
"epoch": 1.963919639196392,
|
|
"grad_norm": 0.22734918545953667,
|
|
"learning_rate": 9.334930252219248e-08,
|
|
"loss": 1.6451,
|
|
"mean_token_accuracy": 0.6562467515468597,
|
|
"num_tokens": 1327784552.0,
|
|
"step": 14370
|
|
},
|
|
{
|
|
"entropy": 1.57578125,
|
|
"epoch": 1.965286319529862,
|
|
"grad_norm": 0.18111711314461873,
|
|
"learning_rate": 8.982668733267578e-08,
|
|
"loss": 1.5881,
|
|
"mean_token_accuracy": 0.6648685336112976,
|
|
"num_tokens": 1328729233.0,
|
|
"step": 14380
|
|
},
|
|
{
|
|
"entropy": 1.53828125,
|
|
"epoch": 1.966652999863332,
|
|
"grad_norm": 0.22100077693894224,
|
|
"learning_rate": 8.630407214315909e-08,
|
|
"loss": 1.547,
|
|
"mean_token_accuracy": 0.6730900883674622,
|
|
"num_tokens": 1329645452.0,
|
|
"step": 14390
|
|
},
|
|
{
|
|
"entropy": 1.67734375,
|
|
"epoch": 1.968019680196802,
|
|
"grad_norm": 0.2312601272290707,
|
|
"learning_rate": 8.27814569536424e-08,
|
|
"loss": 1.6797,
|
|
"mean_token_accuracy": 0.6534142136573792,
|
|
"num_tokens": 1330580950.0,
|
|
"step": 14400
|
|
},
|
|
{
|
|
"entropy": 1.60859375,
|
|
"epoch": 1.969386360530272,
|
|
"grad_norm": 0.22500923649551335,
|
|
"learning_rate": 7.925884176412568e-08,
|
|
"loss": 1.6337,
|
|
"mean_token_accuracy": 0.6567195653915405,
|
|
"num_tokens": 1331484980.0,
|
|
"step": 14410
|
|
},
|
|
{
|
|
"entropy": 1.6546875,
|
|
"epoch": 1.970753040863742,
|
|
"grad_norm": 0.18436137698587865,
|
|
"learning_rate": 7.573622657460899e-08,
|
|
"loss": 1.6676,
|
|
"mean_token_accuracy": 0.6530327796936035,
|
|
"num_tokens": 1332406292.0,
|
|
"step": 14420
|
|
},
|
|
{
|
|
"entropy": 1.52734375,
|
|
"epoch": 1.972119721197212,
|
|
"grad_norm": 0.17750602342652314,
|
|
"learning_rate": 7.22136113850923e-08,
|
|
"loss": 1.5269,
|
|
"mean_token_accuracy": 0.67576105594635,
|
|
"num_tokens": 1333339325.0,
|
|
"step": 14430
|
|
},
|
|
{
|
|
"entropy": 1.54609375,
|
|
"epoch": 1.9734864015306819,
|
|
"grad_norm": 0.23006346112396664,
|
|
"learning_rate": 6.869099619557561e-08,
|
|
"loss": 1.5536,
|
|
"mean_token_accuracy": 0.6691858351230622,
|
|
"num_tokens": 1334260464.0,
|
|
"step": 14440
|
|
},
|
|
{
|
|
"entropy": 1.55703125,
|
|
"epoch": 1.9748530818641519,
|
|
"grad_norm": 0.22349399957462918,
|
|
"learning_rate": 6.51683810060589e-08,
|
|
"loss": 1.578,
|
|
"mean_token_accuracy": 0.665081912279129,
|
|
"num_tokens": 1335220260.0,
|
|
"step": 14450
|
|
},
|
|
{
|
|
"entropy": 1.6390625,
|
|
"epoch": 1.9762197621976219,
|
|
"grad_norm": 0.22176382045530482,
|
|
"learning_rate": 6.164576581654221e-08,
|
|
"loss": 1.6476,
|
|
"mean_token_accuracy": 0.6546904861927032,
|
|
"num_tokens": 1336105959.0,
|
|
"step": 14460
|
|
},
|
|
{
|
|
"entropy": 1.57421875,
|
|
"epoch": 1.9775864425310918,
|
|
"grad_norm": 0.20148849683266837,
|
|
"learning_rate": 5.8123150627025515e-08,
|
|
"loss": 1.584,
|
|
"mean_token_accuracy": 0.664056408405304,
|
|
"num_tokens": 1337004485.0,
|
|
"step": 14470
|
|
},
|
|
{
|
|
"entropy": 1.55390625,
|
|
"epoch": 1.9789531228645618,
|
|
"grad_norm": 0.18235816765741228,
|
|
"learning_rate": 5.460053543750881e-08,
|
|
"loss": 1.565,
|
|
"mean_token_accuracy": 0.6694414019584656,
|
|
"num_tokens": 1337867969.0,
|
|
"step": 14480
|
|
},
|
|
{
|
|
"entropy": 1.63828125,
|
|
"epoch": 1.980319803198032,
|
|
"grad_norm": 0.1740978664661024,
|
|
"learning_rate": 5.107792024799211e-08,
|
|
"loss": 1.6574,
|
|
"mean_token_accuracy": 0.6537964284420014,
|
|
"num_tokens": 1338787574.0,
|
|
"step": 14490
|
|
},
|
|
{
|
|
"entropy": 1.65546875,
|
|
"epoch": 1.981686483531502,
|
|
"grad_norm": 0.24986967867334348,
|
|
"learning_rate": 4.7555305058475415e-08,
|
|
"loss": 1.6679,
|
|
"mean_token_accuracy": 0.6512709975242614,
|
|
"num_tokens": 1339672672.0,
|
|
"step": 14500
|
|
},
|
|
{
|
|
"entropy": 1.584375,
|
|
"epoch": 1.983053163864972,
|
|
"grad_norm": 0.24420579067323261,
|
|
"learning_rate": 4.403268986895872e-08,
|
|
"loss": 1.6128,
|
|
"mean_token_accuracy": 0.6614326238632202,
|
|
"num_tokens": 1340639556.0,
|
|
"step": 14510
|
|
},
|
|
{
|
|
"entropy": 1.63203125,
|
|
"epoch": 1.984419844198442,
|
|
"grad_norm": 0.20493863369098633,
|
|
"learning_rate": 4.0510074679442026e-08,
|
|
"loss": 1.6514,
|
|
"mean_token_accuracy": 0.6546407103538513,
|
|
"num_tokens": 1341565520.0,
|
|
"step": 14520
|
|
},
|
|
{
|
|
"entropy": 1.58359375,
|
|
"epoch": 1.985786524531912,
|
|
"grad_norm": 0.21423481604991568,
|
|
"learning_rate": 3.698745948992532e-08,
|
|
"loss": 1.5814,
|
|
"mean_token_accuracy": 0.6636654675006867,
|
|
"num_tokens": 1342423360.0,
|
|
"step": 14530
|
|
},
|
|
{
|
|
"entropy": 1.534375,
|
|
"epoch": 1.987153204865382,
|
|
"grad_norm": 0.21501410483647534,
|
|
"learning_rate": 3.3464844300408624e-08,
|
|
"loss": 1.5366,
|
|
"mean_token_accuracy": 0.6696887612342834,
|
|
"num_tokens": 1343301024.0,
|
|
"step": 14540
|
|
},
|
|
{
|
|
"entropy": 1.54296875,
|
|
"epoch": 1.9885198851988521,
|
|
"grad_norm": 0.18200876564244256,
|
|
"learning_rate": 2.994222911089193e-08,
|
|
"loss": 1.5381,
|
|
"mean_token_accuracy": 0.6738729357719422,
|
|
"num_tokens": 1344252584.0,
|
|
"step": 14550
|
|
},
|
|
{
|
|
"entropy": 1.60078125,
|
|
"epoch": 1.9898865655323221,
|
|
"grad_norm": 0.3346998546110333,
|
|
"learning_rate": 2.6419613921375232e-08,
|
|
"loss": 1.6157,
|
|
"mean_token_accuracy": 0.6608076810836792,
|
|
"num_tokens": 1345208093.0,
|
|
"step": 14560
|
|
},
{
"entropy": 1.56875,
"epoch": 1.991253245865792,
"grad_norm": 0.33188890922057707,
"learning_rate": 2.289699873185853e-08,
"loss": 1.5711,
"mean_token_accuracy": 0.6633019864559173,
"num_tokens": 1346125444.0,
"step": 14570
},
{
"entropy": 1.65078125,
"epoch": 1.992619926199262,
"grad_norm": 0.19700221219551378,
"learning_rate": 1.9374383542341837e-08,
"loss": 1.6631,
"mean_token_accuracy": 0.6515511393547058,
"num_tokens": 1347010932.0,
"step": 14580
},
{
"entropy": 1.51953125,
"epoch": 1.993986606532732,
"grad_norm": 0.1871187773318557,
"learning_rate": 1.585176835282514e-08,
"loss": 1.507,
"mean_token_accuracy": 0.6784947454929352,
"num_tokens": 1347934049.0,
"step": 14590
},
{
"entropy": 1.6890625,
"epoch": 1.995353286866202,
"grad_norm": 0.1931860987240444,
"learning_rate": 1.2329153163308442e-08,
"loss": 1.6969,
"mean_token_accuracy": 0.6486114084720611,
"num_tokens": 1348871583.0,
"step": 14600
},
{
"entropy": 1.5765625,
"epoch": 1.996719967199672,
"grad_norm": 0.17930549906939344,
"learning_rate": 8.806537973791744e-09,
"loss": 1.5788,
"mean_token_accuracy": 0.6657912135124207,
"num_tokens": 1349832279.0,
"step": 14610
},
{
"entropy": 1.59375,
"epoch": 1.998086647533142,
"grad_norm": 0.18460242769164104,
"learning_rate": 5.2839227842750465e-09,
"loss": 1.6091,
"mean_token_accuracy": 0.6605981290340424,
"num_tokens": 1350784833.0,
"step": 14620
},
{
"entropy": 1.59375,
"epoch": 1.999453327866612,
"grad_norm": 0.26549271326611246,
"learning_rate": 1.7613075947583486e-09,
"loss": 1.5849,
"mean_token_accuracy": 0.6646957159042358,
"num_tokens": 1351710512.0,
"step": 14630
},
{
"entropy": 1.599609375,
"epoch": 2.0,
"mean_token_accuracy": 0.6595872640609741,
"num_tokens": 1352093209.0,
"step": 14634,
"total_flos": 2710644215513088.0,
"train_loss": 1.6633207928015101,
"train_runtime": 25736.6854,
"train_samples_per_second": 72.776,
"train_steps_per_second": 0.569
}
],
"logging_steps": 10,
"max_steps": 14634,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2710644215513088.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}