5384 lines
150 KiB
JSON
5384 lines
150 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 6.0,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 534,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"entropy": 0.5478973388671875,
|
||
|
|
"epoch": 0.011235955056179775,
|
||
|
|
"grad_norm": 383.7328462293848,
|
||
|
|
"learning_rate": 0.0,
|
||
|
|
"loss": 8.3388,
|
||
|
|
"mean_token_accuracy": 0.0,
|
||
|
|
"num_tokens": 844265.0,
|
||
|
|
"step": 1
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5487899780273438,
|
||
|
|
"epoch": 0.02247191011235955,
|
||
|
|
"grad_norm": 382.2250379061165,
|
||
|
|
"learning_rate": 1.8518518518518518e-07,
|
||
|
|
"loss": 8.3331,
|
||
|
|
"mean_token_accuracy": 0.0,
|
||
|
|
"num_tokens": 1688860.0,
|
||
|
|
"step": 2
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5494003295898438,
|
||
|
|
"epoch": 0.033707865168539325,
|
||
|
|
"grad_norm": 385.6801519392013,
|
||
|
|
"learning_rate": 3.7037037037037036e-07,
|
||
|
|
"loss": 8.2895,
|
||
|
|
"mean_token_accuracy": 0.0,
|
||
|
|
"num_tokens": 2512910.0,
|
||
|
|
"step": 3
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5540695190429688,
|
||
|
|
"epoch": 0.0449438202247191,
|
||
|
|
"grad_norm": 387.5163435160337,
|
||
|
|
"learning_rate": 5.555555555555555e-07,
|
||
|
|
"loss": 8.2596,
|
||
|
|
"mean_token_accuracy": 0.0,
|
||
|
|
"num_tokens": 3345813.0,
|
||
|
|
"step": 4
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5646514892578125,
|
||
|
|
"epoch": 0.056179775280898875,
|
||
|
|
"grad_norm": 390.56658814859105,
|
||
|
|
"learning_rate": 7.407407407407407e-07,
|
||
|
|
"loss": 8.1342,
|
||
|
|
"mean_token_accuracy": 0.0,
|
||
|
|
"num_tokens": 4158244.0,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.56011962890625,
|
||
|
|
"epoch": 0.06741573033707865,
|
||
|
|
"grad_norm": 396.82031188523996,
|
||
|
|
"learning_rate": 9.259259259259259e-07,
|
||
|
|
"loss": 8.0144,
|
||
|
|
"mean_token_accuracy": 0.0,
|
||
|
|
"num_tokens": 4967109.0,
|
||
|
|
"step": 6
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5560760498046875,
|
||
|
|
"epoch": 0.07865168539325842,
|
||
|
|
"grad_norm": 399.44225638760815,
|
||
|
|
"learning_rate": 1.111111111111111e-06,
|
||
|
|
"loss": 7.4644,
|
||
|
|
"mean_token_accuracy": 0.0,
|
||
|
|
"num_tokens": 5797482.0,
|
||
|
|
"step": 7
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5502700805664062,
|
||
|
|
"epoch": 0.0898876404494382,
|
||
|
|
"grad_norm": 271.48936847645507,
|
||
|
|
"learning_rate": 1.2962962962962962e-06,
|
||
|
|
"loss": 5.8786,
|
||
|
|
"mean_token_accuracy": 0.0026041667442768812,
|
||
|
|
"num_tokens": 6640065.0,
|
||
|
|
"step": 8
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.54205322265625,
|
||
|
|
"epoch": 0.10112359550561797,
|
||
|
|
"grad_norm": 230.51967558204245,
|
||
|
|
"learning_rate": 1.4814814814814815e-06,
|
||
|
|
"loss": 5.5918,
|
||
|
|
"mean_token_accuracy": 0.006510416860692203,
|
||
|
|
"num_tokens": 7494647.0,
|
||
|
|
"step": 9
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.553466796875,
|
||
|
|
"epoch": 0.11235955056179775,
|
||
|
|
"grad_norm": 186.8557668882384,
|
||
|
|
"learning_rate": 1.6666666666666667e-06,
|
||
|
|
"loss": 5.264,
|
||
|
|
"mean_token_accuracy": 0.01953125058207661,
|
||
|
|
"num_tokens": 8336619.0,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5583953857421875,
|
||
|
|
"epoch": 0.12359550561797752,
|
||
|
|
"grad_norm": 102.72564448300426,
|
||
|
|
"learning_rate": 1.8518518518518519e-06,
|
||
|
|
"loss": 4.112,
|
||
|
|
"mean_token_accuracy": 0.5247395989717916,
|
||
|
|
"num_tokens": 9153404.0,
|
||
|
|
"step": 11
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.550445556640625,
|
||
|
|
"epoch": 0.1348314606741573,
|
||
|
|
"grad_norm": 96.89583143635592,
|
||
|
|
"learning_rate": 2.037037037037037e-06,
|
||
|
|
"loss": 4.0343,
|
||
|
|
"mean_token_accuracy": 0.5078125151339918,
|
||
|
|
"num_tokens": 10007098.0,
|
||
|
|
"step": 12
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5568389892578125,
|
||
|
|
"epoch": 0.14606741573033707,
|
||
|
|
"grad_norm": 82.87420696019375,
|
||
|
|
"learning_rate": 2.222222222222222e-06,
|
||
|
|
"loss": 3.8298,
|
||
|
|
"mean_token_accuracy": 0.5117187652504072,
|
||
|
|
"num_tokens": 10832783.0,
|
||
|
|
"step": 13
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5591354370117188,
|
||
|
|
"epoch": 0.15730337078651685,
|
||
|
|
"grad_norm": 74.53414115193272,
|
||
|
|
"learning_rate": 2.4074074074074075e-06,
|
||
|
|
"loss": 3.7077,
|
||
|
|
"mean_token_accuracy": 0.5299479324603453,
|
||
|
|
"num_tokens": 11666567.0,
|
||
|
|
"step": 14
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5399703979492188,
|
||
|
|
"epoch": 0.16853932584269662,
|
||
|
|
"grad_norm": 59.49474774838589,
|
||
|
|
"learning_rate": 2.5925925925925925e-06,
|
||
|
|
"loss": 3.2713,
|
||
|
|
"mean_token_accuracy": 0.4973958481568843,
|
||
|
|
"num_tokens": 12505279.0,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5487747192382812,
|
||
|
|
"epoch": 0.1797752808988764,
|
||
|
|
"grad_norm": 58.34965057030908,
|
||
|
|
"learning_rate": 2.7777777777777783e-06,
|
||
|
|
"loss": 3.2007,
|
||
|
|
"mean_token_accuracy": 0.5299479324603453,
|
||
|
|
"num_tokens": 13330043.0,
|
||
|
|
"step": 16
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.561126708984375,
|
||
|
|
"epoch": 0.19101123595505617,
|
||
|
|
"grad_norm": 57.55503720354528,
|
||
|
|
"learning_rate": 2.962962962962963e-06,
|
||
|
|
"loss": 3.1543,
|
||
|
|
"mean_token_accuracy": 0.5169270987389609,
|
||
|
|
"num_tokens": 14127916.0,
|
||
|
|
"step": 17
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5448684692382812,
|
||
|
|
"epoch": 0.20224719101123595,
|
||
|
|
"grad_norm": 57.669979135570635,
|
||
|
|
"learning_rate": 3.1481481481481483e-06,
|
||
|
|
"loss": 3.0899,
|
||
|
|
"mean_token_accuracy": 0.537760432693176,
|
||
|
|
"num_tokens": 14969669.0,
|
||
|
|
"step": 18
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.56536865234375,
|
||
|
|
"epoch": 0.21348314606741572,
|
||
|
|
"grad_norm": 57.626889014580236,
|
||
|
|
"learning_rate": 3.3333333333333333e-06,
|
||
|
|
"loss": 3.0513,
|
||
|
|
"mean_token_accuracy": 0.5273437657160684,
|
||
|
|
"num_tokens": 15742573.0,
|
||
|
|
"step": 19
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.536224365234375,
|
||
|
|
"epoch": 0.2247191011235955,
|
||
|
|
"grad_norm": 57.83925364642696,
|
||
|
|
"learning_rate": 3.5185185185185187e-06,
|
||
|
|
"loss": 2.9626,
|
||
|
|
"mean_token_accuracy": 0.5403645994374529,
|
||
|
|
"num_tokens": 16604330.0,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5584945678710938,
|
||
|
|
"epoch": 0.23595505617977527,
|
||
|
|
"grad_norm": 57.73861838272076,
|
||
|
|
"learning_rate": 3.7037037037037037e-06,
|
||
|
|
"loss": 2.9248,
|
||
|
|
"mean_token_accuracy": 0.5416666828095913,
|
||
|
|
"num_tokens": 17401202.0,
|
||
|
|
"step": 21
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5339431762695312,
|
||
|
|
"epoch": 0.24719101123595505,
|
||
|
|
"grad_norm": 58.12584008334574,
|
||
|
|
"learning_rate": 3.88888888888889e-06,
|
||
|
|
"loss": 2.9143,
|
||
|
|
"mean_token_accuracy": 0.5325520992046222,
|
||
|
|
"num_tokens": 18227760.0,
|
||
|
|
"step": 22
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5297012329101562,
|
||
|
|
"epoch": 0.25842696629213485,
|
||
|
|
"grad_norm": 63.216344809460054,
|
||
|
|
"learning_rate": 4.074074074074074e-06,
|
||
|
|
"loss": 2.9183,
|
||
|
|
"mean_token_accuracy": 0.5364583493210375,
|
||
|
|
"num_tokens": 19085561.0,
|
||
|
|
"step": 23
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.538482666015625,
|
||
|
|
"epoch": 0.2696629213483146,
|
||
|
|
"grad_norm": 56.95197027840473,
|
||
|
|
"learning_rate": 4.2592592592592596e-06,
|
||
|
|
"loss": 2.8617,
|
||
|
|
"mean_token_accuracy": 0.5429687661817297,
|
||
|
|
"num_tokens": 19905165.0,
|
||
|
|
"step": 24
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5329513549804688,
|
||
|
|
"epoch": 0.2808988764044944,
|
||
|
|
"grad_norm": 58.492963708867535,
|
||
|
|
"learning_rate": 4.444444444444444e-06,
|
||
|
|
"loss": 2.8492,
|
||
|
|
"mean_token_accuracy": 0.5364583493210375,
|
||
|
|
"num_tokens": 20746635.0,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5399398803710938,
|
||
|
|
"epoch": 0.29213483146067415,
|
||
|
|
"grad_norm": 57.60957116637501,
|
||
|
|
"learning_rate": 4.62962962962963e-06,
|
||
|
|
"loss": 2.815,
|
||
|
|
"mean_token_accuracy": 0.5390625160653144,
|
||
|
|
"num_tokens": 21555958.0,
|
||
|
|
"step": 26
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5479583740234375,
|
||
|
|
"epoch": 0.30337078651685395,
|
||
|
|
"grad_norm": 57.30072476472811,
|
||
|
|
"learning_rate": 4.814814814814815e-06,
|
||
|
|
"loss": 2.7716,
|
||
|
|
"mean_token_accuracy": 0.5651041835080832,
|
||
|
|
"num_tokens": 22363221.0,
|
||
|
|
"step": 27
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5298919677734375,
|
||
|
|
"epoch": 0.3146067415730337,
|
||
|
|
"grad_norm": 57.24926645812191,
|
||
|
|
"learning_rate": 5e-06,
|
||
|
|
"loss": 2.7507,
|
||
|
|
"mean_token_accuracy": 0.5442708495538682,
|
||
|
|
"num_tokens": 23200357.0,
|
||
|
|
"step": 28
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5341720581054688,
|
||
|
|
"epoch": 0.3258426966292135,
|
||
|
|
"grad_norm": 57.30241919840906,
|
||
|
|
"learning_rate": 4.999952005391863e-06,
|
||
|
|
"loss": 2.7141,
|
||
|
|
"mean_token_accuracy": 0.5520833497866988,
|
||
|
|
"num_tokens": 24032340.0,
|
||
|
|
"step": 29
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5347061157226562,
|
||
|
|
"epoch": 0.33707865168539325,
|
||
|
|
"grad_norm": 57.286490592876675,
|
||
|
|
"learning_rate": 4.999808023410233e-06,
|
||
|
|
"loss": 2.6785,
|
||
|
|
"mean_token_accuracy": 0.5546875165309757,
|
||
|
|
"num_tokens": 24875983.0,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.523590087890625,
|
||
|
|
"epoch": 0.34831460674157305,
|
||
|
|
"grad_norm": 57.68819839964531,
|
||
|
|
"learning_rate": 4.999568059583401e-06,
|
||
|
|
"loss": 2.6613,
|
||
|
|
"mean_token_accuracy": 0.5533854331588373,
|
||
|
|
"num_tokens": 25724705.0,
|
||
|
|
"step": 31
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5200119018554688,
|
||
|
|
"epoch": 0.3595505617977528,
|
||
|
|
"grad_norm": 58.4245124088433,
|
||
|
|
"learning_rate": 4.9992321231249425e-06,
|
||
|
|
"loss": 2.6243,
|
||
|
|
"mean_token_accuracy": 0.5638021001359448,
|
||
|
|
"num_tokens": 26582233.0,
|
||
|
|
"step": 32
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5226516723632812,
|
||
|
|
"epoch": 0.3707865168539326,
|
||
|
|
"grad_norm": 58.32722377916457,
|
||
|
|
"learning_rate": 4.998800226933367e-06,
|
||
|
|
"loss": 2.5931,
|
||
|
|
"mean_token_accuracy": 0.570312516996637,
|
||
|
|
"num_tokens": 27422365.0,
|
||
|
|
"step": 33
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.540557861328125,
|
||
|
|
"epoch": 0.38202247191011235,
|
||
|
|
"grad_norm": 58.734213041913115,
|
||
|
|
"learning_rate": 4.998272387591625e-06,
|
||
|
|
"loss": 2.5598,
|
||
|
|
"mean_token_accuracy": 0.5794271006016061,
|
||
|
|
"num_tokens": 28242002.0,
|
||
|
|
"step": 34
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5180587768554688,
|
||
|
|
"epoch": 0.39325842696629215,
|
||
|
|
"grad_norm": 59.22626114064095,
|
||
|
|
"learning_rate": 4.997648625366471e-06,
|
||
|
|
"loss": 2.5452,
|
||
|
|
"mean_token_accuracy": 0.5677083502523601,
|
||
|
|
"num_tokens": 29080530.0,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5167007446289062,
|
||
|
|
"epoch": 0.4044943820224719,
|
||
|
|
"grad_norm": 60.7804863069846,
|
||
|
|
"learning_rate": 4.996928964207685e-06,
|
||
|
|
"loss": 2.5519,
|
||
|
|
"mean_token_accuracy": 0.5651041835080832,
|
||
|
|
"num_tokens": 29911104.0,
|
||
|
|
"step": 36
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5139617919921875,
|
||
|
|
"epoch": 0.4157303370786517,
|
||
|
|
"grad_norm": 59.02823200509279,
|
||
|
|
"learning_rate": 4.99611343174715e-06,
|
||
|
|
"loss": 2.4802,
|
||
|
|
"mean_token_accuracy": 0.5664062668802217,
|
||
|
|
"num_tokens": 30785022.0,
|
||
|
|
"step": 37
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5275650024414062,
|
||
|
|
"epoch": 0.42696629213483145,
|
||
|
|
"grad_norm": 59.560232474144826,
|
||
|
|
"learning_rate": 4.995202059297795e-06,
|
||
|
|
"loss": 2.4654,
|
||
|
|
"mean_token_accuracy": 0.5729166837409139,
|
||
|
|
"num_tokens": 31611994.0,
|
||
|
|
"step": 38
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5351028442382812,
|
||
|
|
"epoch": 0.43820224719101125,
|
||
|
|
"grad_norm": 59.72600726311165,
|
||
|
|
"learning_rate": 4.99419488185239e-06,
|
||
|
|
"loss": 2.4412,
|
||
|
|
"mean_token_accuracy": 0.570312516996637,
|
||
|
|
"num_tokens": 32393343.0,
|
||
|
|
"step": 39
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.527740478515625,
|
||
|
|
"epoch": 0.449438202247191,
|
||
|
|
"grad_norm": 59.57641648822912,
|
||
|
|
"learning_rate": 4.993091938082206e-06,
|
||
|
|
"loss": 2.4243,
|
||
|
|
"mean_token_accuracy": 0.5690104336244985,
|
||
|
|
"num_tokens": 33198379.0,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.515106201171875,
|
||
|
|
"epoch": 0.4606741573033708,
|
||
|
|
"grad_norm": 60.12617134689506,
|
||
|
|
"learning_rate": 4.991893270335526e-06,
|
||
|
|
"loss": 2.4111,
|
||
|
|
"mean_token_accuracy": 0.558593766647391,
|
||
|
|
"num_tokens": 34054233.0,
|
||
|
|
"step": 41
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5322647094726562,
|
||
|
|
"epoch": 0.47191011235955055,
|
||
|
|
"grad_norm": 59.660173498647474,
|
||
|
|
"learning_rate": 4.990598924636019e-06,
|
||
|
|
"loss": 2.3815,
|
||
|
|
"mean_token_accuracy": 0.5625000167638063,
|
||
|
|
"num_tokens": 34878164.0,
|
||
|
|
"step": 42
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5286941528320312,
|
||
|
|
"epoch": 0.48314606741573035,
|
||
|
|
"grad_norm": 60.54371226870739,
|
||
|
|
"learning_rate": 4.989208950680979e-06,
|
||
|
|
"loss": 2.3666,
|
||
|
|
"mean_token_accuracy": 0.558593766647391,
|
||
|
|
"num_tokens": 35703689.0,
|
||
|
|
"step": 43
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5433731079101562,
|
||
|
|
"epoch": 0.4943820224719101,
|
||
|
|
"grad_norm": 60.28449067908698,
|
||
|
|
"learning_rate": 4.987723401839409e-06,
|
||
|
|
"loss": 2.3225,
|
||
|
|
"mean_token_accuracy": 0.5950521006016061,
|
||
|
|
"num_tokens": 36503596.0,
|
||
|
|
"step": 44
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5287551879882812,
|
||
|
|
"epoch": 0.5056179775280899,
|
||
|
|
"grad_norm": 60.062655344477925,
|
||
|
|
"learning_rate": 4.9861423351499786e-06,
|
||
|
|
"loss": 2.3121,
|
||
|
|
"mean_token_accuracy": 0.6861979308305308,
|
||
|
|
"num_tokens": 37321035.0,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5376815795898438,
|
||
|
|
"epoch": 0.5168539325842697,
|
||
|
|
"grad_norm": 61.069743149192924,
|
||
|
|
"learning_rate": 4.984465811318826e-06,
|
||
|
|
"loss": 2.2812,
|
||
|
|
"mean_token_accuracy": 0.826822925475426,
|
||
|
|
"num_tokens": 38143678.0,
|
||
|
|
"step": 46
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5267181396484375,
|
||
|
|
"epoch": 0.5280898876404494,
|
||
|
|
"grad_norm": 60.3671855103254,
|
||
|
|
"learning_rate": 4.982693894717237e-06,
|
||
|
|
"loss": 2.2576,
|
||
|
|
"mean_token_accuracy": 0.8984375060535967,
|
||
|
|
"num_tokens": 39005372.0,
|
||
|
|
"step": 47
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5442123413085938,
|
||
|
|
"epoch": 0.5393258426966292,
|
||
|
|
"grad_norm": 60.49661142976516,
|
||
|
|
"learning_rate": 4.980826653379163e-06,
|
||
|
|
"loss": 2.2092,
|
||
|
|
"mean_token_accuracy": 0.9283854209352285,
|
||
|
|
"num_tokens": 39808850.0,
|
||
|
|
"step": 48
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.538360595703125,
|
||
|
|
"epoch": 0.550561797752809,
|
||
|
|
"grad_norm": 60.88409758608409,
|
||
|
|
"learning_rate": 4.97886415899862e-06,
|
||
|
|
"loss": 2.1876,
|
||
|
|
"mean_token_accuracy": 0.923177087912336,
|
||
|
|
"num_tokens": 40635450.0,
|
||
|
|
"step": 49
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5340042114257812,
|
||
|
|
"epoch": 0.5617977528089888,
|
||
|
|
"grad_norm": 60.57809081511029,
|
||
|
|
"learning_rate": 4.976806486926926e-06,
|
||
|
|
"loss": 2.176,
|
||
|
|
"mean_token_accuracy": 0.9166666716337204,
|
||
|
|
"num_tokens": 41464069.0,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5406875610351562,
|
||
|
|
"epoch": 0.5730337078651685,
|
||
|
|
"grad_norm": 61.75540998165706,
|
||
|
|
"learning_rate": 4.9746537161698125e-06,
|
||
|
|
"loss": 2.1636,
|
||
|
|
"mean_token_accuracy": 0.901041672565043,
|
||
|
|
"num_tokens": 42275662.0,
|
||
|
|
"step": 51
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.560394287109375,
|
||
|
|
"epoch": 0.5842696629213483,
|
||
|
|
"grad_norm": 60.16814853435,
|
||
|
|
"learning_rate": 4.972405929384391e-06,
|
||
|
|
"loss": 2.1153,
|
||
|
|
"mean_token_accuracy": 0.9114583386108279,
|
||
|
|
"num_tokens": 43057777.0,
|
||
|
|
"step": 52
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5278244018554688,
|
||
|
|
"epoch": 0.5955056179775281,
|
||
|
|
"grad_norm": 60.91973249877592,
|
||
|
|
"learning_rate": 4.970063212875979e-06,
|
||
|
|
"loss": 2.1079,
|
||
|
|
"mean_token_accuracy": 0.8984375060535967,
|
||
|
|
"num_tokens": 43898689.0,
|
||
|
|
"step": 53
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5439300537109375,
|
||
|
|
"epoch": 0.6067415730337079,
|
||
|
|
"grad_norm": 59.911507237292454,
|
||
|
|
"learning_rate": 4.967625656594782e-06,
|
||
|
|
"loss": 2.0699,
|
||
|
|
"mean_token_accuracy": 0.9140625051222742,
|
||
|
|
"num_tokens": 44744002.0,
|
||
|
|
"step": 54
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5342025756835938,
|
||
|
|
"epoch": 0.6179775280898876,
|
||
|
|
"grad_norm": 59.838647030374474,
|
||
|
|
"learning_rate": 4.965093354132451e-06,
|
||
|
|
"loss": 2.044,
|
||
|
|
"mean_token_accuracy": 0.9192708381451666,
|
||
|
|
"num_tokens": 45628240.0,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.536773681640625,
|
||
|
|
"epoch": 0.6292134831460674,
|
||
|
|
"grad_norm": 60.35729117488591,
|
||
|
|
"learning_rate": 4.962466402718475e-06,
|
||
|
|
"loss": 2.0351,
|
||
|
|
"mean_token_accuracy": 0.9114583386108279,
|
||
|
|
"num_tokens": 46468344.0,
|
||
|
|
"step": 56
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5527420043945312,
|
||
|
|
"epoch": 0.6404494382022472,
|
||
|
|
"grad_norm": 59.71609925946788,
|
||
|
|
"learning_rate": 4.959744903216458e-06,
|
||
|
|
"loss": 1.9982,
|
||
|
|
"mean_token_accuracy": 0.9140625051222742,
|
||
|
|
"num_tokens": 47283576.0,
|
||
|
|
"step": 57
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.53558349609375,
|
||
|
|
"epoch": 0.651685393258427,
|
||
|
|
"grad_norm": 59.63500869576208,
|
||
|
|
"learning_rate": 4.9569289601202405e-06,
|
||
|
|
"loss": 1.9785,
|
||
|
|
"mean_token_accuracy": 0.9023437558207661,
|
||
|
|
"num_tokens": 48111866.0,
|
||
|
|
"step": 58
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.551788330078125,
|
||
|
|
"epoch": 0.6629213483146067,
|
||
|
|
"grad_norm": 60.06109571239089,
|
||
|
|
"learning_rate": 4.954018681549891e-06,
|
||
|
|
"loss": 1.9583,
|
||
|
|
"mean_token_accuracy": 0.901041672565043,
|
||
|
|
"num_tokens": 48917746.0,
|
||
|
|
"step": 59
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.538818359375,
|
||
|
|
"epoch": 0.6741573033707865,
|
||
|
|
"grad_norm": 59.102981798917874,
|
||
|
|
"learning_rate": 4.951014179247555e-06,
|
||
|
|
"loss": 1.9142,
|
||
|
|
"mean_token_accuracy": 0.9322916707023978,
|
||
|
|
"num_tokens": 49747914.0,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5444412231445312,
|
||
|
|
"epoch": 0.6853932584269663,
|
||
|
|
"grad_norm": 59.269303577585205,
|
||
|
|
"learning_rate": 4.9479155685731595e-06,
|
||
|
|
"loss": 1.9104,
|
||
|
|
"mean_token_accuracy": 0.9062500055879354,
|
||
|
|
"num_tokens": 50576281.0,
|
||
|
|
"step": 61
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.543304443359375,
|
||
|
|
"epoch": 0.6966292134831461,
|
||
|
|
"grad_norm": 58.631307105889,
|
||
|
|
"learning_rate": 4.944722968499989e-06,
|
||
|
|
"loss": 1.8554,
|
||
|
|
"mean_token_accuracy": 0.9257812544237822,
|
||
|
|
"num_tokens": 51410056.0,
|
||
|
|
"step": 62
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5558624267578125,
|
||
|
|
"epoch": 0.7078651685393258,
|
||
|
|
"grad_norm": 58.765453799476205,
|
||
|
|
"learning_rate": 4.9414365016101144e-06,
|
||
|
|
"loss": 1.8217,
|
||
|
|
"mean_token_accuracy": 0.9375000037252903,
|
||
|
|
"num_tokens": 52208397.0,
|
||
|
|
"step": 63
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5417938232421875,
|
||
|
|
"epoch": 0.7191011235955056,
|
||
|
|
"grad_norm": 59.35803234115679,
|
||
|
|
"learning_rate": 4.938056294089689e-06,
|
||
|
|
"loss": 1.8217,
|
||
|
|
"mean_token_accuracy": 0.9179687548894435,
|
||
|
|
"num_tokens": 53054896.0,
|
||
|
|
"step": 64
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5324325561523438,
|
||
|
|
"epoch": 0.7303370786516854,
|
||
|
|
"grad_norm": 58.481973969356844,
|
||
|
|
"learning_rate": 4.934582475724101e-06,
|
||
|
|
"loss": 1.7979,
|
||
|
|
"mean_token_accuracy": 0.9218750046566129,
|
||
|
|
"num_tokens": 53923624.0,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5482559204101562,
|
||
|
|
"epoch": 0.7415730337078652,
|
||
|
|
"grad_norm": 59.26597816016183,
|
||
|
|
"learning_rate": 4.93101517989299e-06,
|
||
|
|
"loss": 1.7507,
|
||
|
|
"mean_token_accuracy": 0.9244791711680591,
|
||
|
|
"num_tokens": 54743941.0,
|
||
|
|
"step": 66
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.533050537109375,
|
||
|
|
"epoch": 0.7528089887640449,
|
||
|
|
"grad_norm": 58.16716667827391,
|
||
|
|
"learning_rate": 4.927354543565131e-06,
|
||
|
|
"loss": 1.7286,
|
||
|
|
"mean_token_accuracy": 0.9244791711680591,
|
||
|
|
"num_tokens": 55583113.0,
|
||
|
|
"step": 67
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5493240356445312,
|
||
|
|
"epoch": 0.7640449438202247,
|
||
|
|
"grad_norm": 58.77068748544961,
|
||
|
|
"learning_rate": 4.923600707293166e-06,
|
||
|
|
"loss": 1.7072,
|
||
|
|
"mean_token_accuracy": 0.9283854209352285,
|
||
|
|
"num_tokens": 56411372.0,
|
||
|
|
"step": 68
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5332260131835938,
|
||
|
|
"epoch": 0.7752808988764045,
|
||
|
|
"grad_norm": 58.030678964904006,
|
||
|
|
"learning_rate": 4.919753815208218e-06,
|
||
|
|
"loss": 1.6664,
|
||
|
|
"mean_token_accuracy": 0.9361979204695672,
|
||
|
|
"num_tokens": 57243688.0,
|
||
|
|
"step": 69
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5486373901367188,
|
||
|
|
"epoch": 0.7865168539325843,
|
||
|
|
"grad_norm": 58.850517970345265,
|
||
|
|
"learning_rate": 4.915814015014349e-06,
|
||
|
|
"loss": 1.6563,
|
||
|
|
"mean_token_accuracy": 0.9309895874466747,
|
||
|
|
"num_tokens": 58065169.0,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5496139526367188,
|
||
|
|
"epoch": 0.797752808988764,
|
||
|
|
"grad_norm": 58.53066867858742,
|
||
|
|
"learning_rate": 4.91178145798289e-06,
|
||
|
|
"loss": 1.6189,
|
||
|
|
"mean_token_accuracy": 0.9322916707023978,
|
||
|
|
"num_tokens": 58862994.0,
|
||
|
|
"step": 71
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.531982421875,
|
||
|
|
"epoch": 0.8089887640449438,
|
||
|
|
"grad_norm": 59.11337145551544,
|
||
|
|
"learning_rate": 4.90765629894664e-06,
|
||
|
|
"loss": 1.5988,
|
||
|
|
"mean_token_accuracy": 0.9257812544237822,
|
||
|
|
"num_tokens": 59712732.0,
|
||
|
|
"step": 72
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5450286865234375,
|
||
|
|
"epoch": 0.8202247191011236,
|
||
|
|
"grad_norm": 58.17233978561868,
|
||
|
|
"learning_rate": 4.90343869629391e-06,
|
||
|
|
"loss": 1.5525,
|
||
|
|
"mean_token_accuracy": 0.9401041702367365,
|
||
|
|
"num_tokens": 60540733.0,
|
||
|
|
"step": 73
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5311508178710938,
|
||
|
|
"epoch": 0.8314606741573034,
|
||
|
|
"grad_norm": 58.64402722570219,
|
||
|
|
"learning_rate": 4.89912881196245e-06,
|
||
|
|
"loss": 1.5378,
|
||
|
|
"mean_token_accuracy": 0.9270833376795053,
|
||
|
|
"num_tokens": 61392029.0,
|
||
|
|
"step": 74
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5337066650390625,
|
||
|
|
"epoch": 0.8426966292134831,
|
||
|
|
"grad_norm": 58.691322870736734,
|
||
|
|
"learning_rate": 4.8947268114332276e-06,
|
||
|
|
"loss": 1.5081,
|
||
|
|
"mean_token_accuracy": 0.9309895874466747,
|
||
|
|
"num_tokens": 62239350.0,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5332107543945312,
|
||
|
|
"epoch": 0.8539325842696629,
|
||
|
|
"grad_norm": 59.494275039320314,
|
||
|
|
"learning_rate": 4.890232863724075e-06,
|
||
|
|
"loss": 1.4997,
|
||
|
|
"mean_token_accuracy": 0.9218750046566129,
|
||
|
|
"num_tokens": 63070023.0,
|
||
|
|
"step": 76
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5474319458007812,
|
||
|
|
"epoch": 0.8651685393258427,
|
||
|
|
"grad_norm": 58.54978299815276,
|
||
|
|
"learning_rate": 4.8856471413831995e-06,
|
||
|
|
"loss": 1.4526,
|
||
|
|
"mean_token_accuracy": 0.9322916707023978,
|
||
|
|
"num_tokens": 63896515.0,
|
||
|
|
"step": 77
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5431747436523438,
|
||
|
|
"epoch": 0.8764044943820225,
|
||
|
|
"grad_norm": 58.6331170246829,
|
||
|
|
"learning_rate": 4.880969820482559e-06,
|
||
|
|
"loss": 1.4351,
|
||
|
|
"mean_token_accuracy": 0.9244791711680591,
|
||
|
|
"num_tokens": 64711098.0,
|
||
|
|
"step": 78
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5443191528320312,
|
||
|
|
"epoch": 0.8876404494382022,
|
||
|
|
"grad_norm": 60.77368585176177,
|
||
|
|
"learning_rate": 4.8762010806111e-06,
|
||
|
|
"loss": 1.4007,
|
||
|
|
"mean_token_accuracy": 0.9179687548894435,
|
||
|
|
"num_tokens": 65528767.0,
|
||
|
|
"step": 79
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5321044921875,
|
||
|
|
"epoch": 0.898876404494382,
|
||
|
|
"grad_norm": 58.41490381142874,
|
||
|
|
"learning_rate": 4.8713411048678635e-06,
|
||
|
|
"loss": 1.3736,
|
||
|
|
"mean_token_accuracy": 0.923177087912336,
|
||
|
|
"num_tokens": 66376396.0,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5301437377929688,
|
||
|
|
"epoch": 0.9101123595505618,
|
||
|
|
"grad_norm": 58.31714038287807,
|
||
|
|
"learning_rate": 4.866390079854956e-06,
|
||
|
|
"loss": 1.3632,
|
||
|
|
"mean_token_accuracy": 0.9153645883779973,
|
||
|
|
"num_tokens": 67234926.0,
|
||
|
|
"step": 81
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.54400634765625,
|
||
|
|
"epoch": 0.9213483146067416,
|
||
|
|
"grad_norm": 58.481418531052604,
|
||
|
|
"learning_rate": 4.861348195670381e-06,
|
||
|
|
"loss": 1.2982,
|
||
|
|
"mean_token_accuracy": 0.9427083367481828,
|
||
|
|
"num_tokens": 68053260.0,
|
||
|
|
"step": 82
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.534088134765625,
|
||
|
|
"epoch": 0.9325842696629213,
|
||
|
|
"grad_norm": 57.95432573120232,
|
||
|
|
"learning_rate": 4.856215645900742e-06,
|
||
|
|
"loss": 1.2632,
|
||
|
|
"mean_token_accuracy": 0.9492187530267984,
|
||
|
|
"num_tokens": 68867802.0,
|
||
|
|
"step": 83
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5389633178710938,
|
||
|
|
"epoch": 0.9438202247191011,
|
||
|
|
"grad_norm": 58.38827044496076,
|
||
|
|
"learning_rate": 4.850992627613812e-06,
|
||
|
|
"loss": 1.251,
|
||
|
|
"mean_token_accuracy": 0.9375000037252903,
|
||
|
|
"num_tokens": 69694869.0,
|
||
|
|
"step": 84
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5363616943359375,
|
||
|
|
"epoch": 0.9550561797752809,
|
||
|
|
"grad_norm": 57.860955575276655,
|
||
|
|
"learning_rate": 4.845679341350963e-06,
|
||
|
|
"loss": 1.2127,
|
||
|
|
"mean_token_accuracy": 0.9388020869810134,
|
||
|
|
"num_tokens": 70531155.0,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5427932739257812,
|
||
|
|
"epoch": 0.9662921348314607,
|
||
|
|
"grad_norm": 58.932378976256814,
|
||
|
|
"learning_rate": 4.8402759911194705e-06,
|
||
|
|
"loss": 1.1981,
|
||
|
|
"mean_token_accuracy": 0.9479166697710752,
|
||
|
|
"num_tokens": 71363290.0,
|
||
|
|
"step": 86
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5369033813476562,
|
||
|
|
"epoch": 0.9775280898876404,
|
||
|
|
"grad_norm": 59.60454994673109,
|
||
|
|
"learning_rate": 4.834782784384674e-06,
|
||
|
|
"loss": 1.1884,
|
||
|
|
"mean_token_accuracy": 0.9244791711680591,
|
||
|
|
"num_tokens": 72178708.0,
|
||
|
|
"step": 87
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5212631225585938,
|
||
|
|
"epoch": 0.9887640449438202,
|
||
|
|
"grad_norm": 57.158221171397074,
|
||
|
|
"learning_rate": 4.8291999320620185e-06,
|
||
|
|
"loss": 1.142,
|
||
|
|
"mean_token_accuracy": 0.9335937539581209,
|
||
|
|
"num_tokens": 73067536.0,
|
||
|
|
"step": 88
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.53387451171875,
|
||
|
|
"epoch": 1.0,
|
||
|
|
"grad_norm": 57.71387886521076,
|
||
|
|
"learning_rate": 4.823527648508951e-06,
|
||
|
|
"loss": 1.1127,
|
||
|
|
"mean_token_accuracy": 0.9440104200039059,
|
||
|
|
"num_tokens": 73887526.0,
|
||
|
|
"step": 89
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.54193115234375,
|
||
|
|
"epoch": 1.0112359550561798,
|
||
|
|
"grad_norm": 56.86485522019607,
|
||
|
|
"learning_rate": 4.817766151516693e-06,
|
||
|
|
"loss": 1.0862,
|
||
|
|
"mean_token_accuracy": 0.9322916707023978,
|
||
|
|
"num_tokens": 74697192.0,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5283203125,
|
||
|
|
"epoch": 1.0224719101123596,
|
||
|
|
"grad_norm": 57.46020827331217,
|
||
|
|
"learning_rate": 4.811915662301877e-06,
|
||
|
|
"loss": 1.0731,
|
||
|
|
"mean_token_accuracy": 0.9257812544237822,
|
||
|
|
"num_tokens": 75539407.0,
|
||
|
|
"step": 91
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5219268798828125,
|
||
|
|
"epoch": 1.0337078651685394,
|
||
|
|
"grad_norm": 56.91787187127946,
|
||
|
|
"learning_rate": 4.805976405498052e-06,
|
||
|
|
"loss": 1.0468,
|
||
|
|
"mean_token_accuracy": 0.9309895874466747,
|
||
|
|
"num_tokens": 76388888.0,
|
||
|
|
"step": 92
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5343704223632812,
|
||
|
|
"epoch": 1.0449438202247192,
|
||
|
|
"grad_norm": 57.27346521029154,
|
||
|
|
"learning_rate": 4.799948609147061e-06,
|
||
|
|
"loss": 1.0094,
|
||
|
|
"mean_token_accuracy": 0.9283854209352285,
|
||
|
|
"num_tokens": 77233062.0,
|
||
|
|
"step": 93
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.527801513671875,
|
||
|
|
"epoch": 1.0561797752808988,
|
||
|
|
"grad_norm": 56.33009259771239,
|
||
|
|
"learning_rate": 4.793832504690283e-06,
|
||
|
|
"loss": 0.9796,
|
||
|
|
"mean_token_accuracy": 0.9388020869810134,
|
||
|
|
"num_tokens": 78066352.0,
|
||
|
|
"step": 94
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.52886962890625,
|
||
|
|
"epoch": 1.0674157303370786,
|
||
|
|
"grad_norm": 57.343778352318104,
|
||
|
|
"learning_rate": 4.787628326959747e-06,
|
||
|
|
"loss": 0.9711,
|
||
|
|
"mean_token_accuracy": 0.9309895874466747,
|
||
|
|
"num_tokens": 78900853.0,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5222854614257812,
|
||
|
|
"epoch": 1.0786516853932584,
|
||
|
|
"grad_norm": 56.24835996693155,
|
||
|
|
"learning_rate": 4.7813363141691166e-06,
|
||
|
|
"loss": 0.947,
|
||
|
|
"mean_token_accuracy": 0.9388020869810134,
|
||
|
|
"num_tokens": 79755419.0,
|
||
|
|
"step": 96
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5308151245117188,
|
||
|
|
"epoch": 1.0898876404494382,
|
||
|
|
"grad_norm": 55.743629737936764,
|
||
|
|
"learning_rate": 4.774956707904542e-06,
|
||
|
|
"loss": 0.905,
|
||
|
|
"mean_token_accuracy": 0.945312503259629,
|
||
|
|
"num_tokens": 80595236.0,
|
||
|
|
"step": 97
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.53717041015625,
|
||
|
|
"epoch": 1.101123595505618,
|
||
|
|
"grad_norm": 55.52186768937809,
|
||
|
|
"learning_rate": 4.768489753115386e-06,
|
||
|
|
"loss": 0.8817,
|
||
|
|
"mean_token_accuracy": 0.9440104200039059,
|
||
|
|
"num_tokens": 81420449.0,
|
||
|
|
"step": 98
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5405044555664062,
|
||
|
|
"epoch": 1.1123595505617978,
|
||
|
|
"grad_norm": 54.95236799373694,
|
||
|
|
"learning_rate": 4.761935698104817e-06,
|
||
|
|
"loss": 0.852,
|
||
|
|
"mean_token_accuracy": 0.9479166697710752,
|
||
|
|
"num_tokens": 82231257.0,
|
||
|
|
"step": 99
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5181732177734375,
|
||
|
|
"epoch": 1.1235955056179776,
|
||
|
|
"grad_norm": 54.78774457440945,
|
||
|
|
"learning_rate": 4.755294794520277e-06,
|
||
|
|
"loss": 0.859,
|
||
|
|
"mean_token_accuracy": 0.9348958372138441,
|
||
|
|
"num_tokens": 83098626.0,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5196533203125,
|
||
|
|
"epoch": 1.1348314606741572,
|
||
|
|
"grad_norm": 54.27301765434061,
|
||
|
|
"learning_rate": 4.7485672973438175e-06,
|
||
|
|
"loss": 0.805,
|
||
|
|
"mean_token_accuracy": 0.9518229195382446,
|
||
|
|
"num_tokens": 83964827.0,
|
||
|
|
"step": 101
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5354232788085938,
|
||
|
|
"epoch": 1.146067415730337,
|
||
|
|
"grad_norm": 53.98544893534557,
|
||
|
|
"learning_rate": 4.741753464882312e-06,
|
||
|
|
"loss": 0.8019,
|
||
|
|
"mean_token_accuracy": 0.9335937539581209,
|
||
|
|
"num_tokens": 84783212.0,
|
||
|
|
"step": 102
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5196075439453125,
|
||
|
|
"epoch": 1.1573033707865168,
|
||
|
|
"grad_norm": 53.76533745223255,
|
||
|
|
"learning_rate": 4.734853558757534e-06,
|
||
|
|
"loss": 0.7712,
|
||
|
|
"mean_token_accuracy": 0.945312503259629,
|
||
|
|
"num_tokens": 85630983.0,
|
||
|
|
"step": 103
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5174789428710938,
|
||
|
|
"epoch": 1.1685393258426966,
|
||
|
|
"grad_norm": 53.14821983286506,
|
||
|
|
"learning_rate": 4.727867843896116e-06,
|
||
|
|
"loss": 0.7418,
|
||
|
|
"mean_token_accuracy": 0.9518229195382446,
|
||
|
|
"num_tokens": 86471990.0,
|
||
|
|
"step": 104
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.535003662109375,
|
||
|
|
"epoch": 1.1797752808988764,
|
||
|
|
"grad_norm": 52.63736880368936,
|
||
|
|
"learning_rate": 4.72079658851938e-06,
|
||
|
|
"loss": 0.722,
|
||
|
|
"mean_token_accuracy": 0.9466145865153521,
|
||
|
|
"num_tokens": 87254432.0,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5264739990234375,
|
||
|
|
"epoch": 1.1910112359550562,
|
||
|
|
"grad_norm": 51.9971299352231,
|
||
|
|
"learning_rate": 4.7136400641330245e-06,
|
||
|
|
"loss": 0.6939,
|
||
|
|
"mean_token_accuracy": 0.945312503259629,
|
||
|
|
"num_tokens": 88074800.0,
|
||
|
|
"step": 106
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.524566650390625,
|
||
|
|
"epoch": 1.202247191011236,
|
||
|
|
"grad_norm": 51.764163888935016,
|
||
|
|
"learning_rate": 4.706398545516722e-06,
|
||
|
|
"loss": 0.6962,
|
||
|
|
"mean_token_accuracy": 0.9427083367481828,
|
||
|
|
"num_tokens": 88905071.0,
|
||
|
|
"step": 107
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.52813720703125,
|
||
|
|
"epoch": 1.2134831460674158,
|
||
|
|
"grad_norm": 52.44796000632313,
|
||
|
|
"learning_rate": 4.6990723107135475e-06,
|
||
|
|
"loss": 0.6476,
|
||
|
|
"mean_token_accuracy": 0.9570312525611371,
|
||
|
|
"num_tokens": 89698224.0,
|
||
|
|
"step": 108
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5216827392578125,
|
||
|
|
"epoch": 1.2247191011235956,
|
||
|
|
"grad_norm": 53.80256967623094,
|
||
|
|
"learning_rate": 4.691661641019316e-06,
|
||
|
|
"loss": 0.6913,
|
||
|
|
"mean_token_accuracy": 0.9192708381451666,
|
||
|
|
"num_tokens": 90537702.0,
|
||
|
|
"step": 109
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5336227416992188,
|
||
|
|
"epoch": 1.2359550561797752,
|
||
|
|
"grad_norm": 49.47024184934409,
|
||
|
|
"learning_rate": 4.684166820971779e-06,
|
||
|
|
"loss": 0.6087,
|
||
|
|
"mean_token_accuracy": 0.9505208362825215,
|
||
|
|
"num_tokens": 91352264.0,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5501785278320312,
|
||
|
|
"epoch": 1.247191011235955,
|
||
|
|
"grad_norm": 47.99596183660331,
|
||
|
|
"learning_rate": 4.6765881383396985e-06,
|
||
|
|
"loss": 0.5901,
|
||
|
|
"mean_token_accuracy": 0.9531250027939677,
|
||
|
|
"num_tokens": 92109673.0,
|
||
|
|
"step": 111
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.542724609375,
|
||
|
|
"epoch": 1.2584269662921348,
|
||
|
|
"grad_norm": 46.25282102422423,
|
||
|
|
"learning_rate": 4.6689258841117946e-06,
|
||
|
|
"loss": 0.5642,
|
||
|
|
"mean_token_accuracy": 0.9609375023283064,
|
||
|
|
"num_tokens": 92900953.0,
|
||
|
|
"step": 112
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5264129638671875,
|
||
|
|
"epoch": 1.2696629213483146,
|
||
|
|
"grad_norm": 45.38074896415223,
|
||
|
|
"learning_rate": 4.6611803524855805e-06,
|
||
|
|
"loss": 0.5528,
|
||
|
|
"mean_token_accuracy": 0.9518229195382446,
|
||
|
|
"num_tokens": 93718876.0,
|
||
|
|
"step": 113
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5328903198242188,
|
||
|
|
"epoch": 1.2808988764044944,
|
||
|
|
"grad_norm": 44.871573881233765,
|
||
|
|
"learning_rate": 4.65335184085606e-06,
|
||
|
|
"loss": 0.5572,
|
||
|
|
"mean_token_accuracy": 0.9270833376795053,
|
||
|
|
"num_tokens": 94529005.0,
|
||
|
|
"step": 114
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5391159057617188,
|
||
|
|
"epoch": 1.2921348314606742,
|
||
|
|
"grad_norm": 44.02274928854262,
|
||
|
|
"learning_rate": 4.64544064980431e-06,
|
||
|
|
"loss": 0.507,
|
||
|
|
"mean_token_accuracy": 0.9583333358168602,
|
||
|
|
"num_tokens": 95341154.0,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5222015380859375,
|
||
|
|
"epoch": 1.303370786516854,
|
||
|
|
"grad_norm": 42.75676583668566,
|
||
|
|
"learning_rate": 4.637447083085944e-06,
|
||
|
|
"loss": 0.5046,
|
||
|
|
"mean_token_accuracy": 0.9466145865153521,
|
||
|
|
"num_tokens": 96167635.0,
|
||
|
|
"step": 116
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5244522094726562,
|
||
|
|
"epoch": 1.3146067415730336,
|
||
|
|
"grad_norm": 45.15532414937935,
|
||
|
|
"learning_rate": 4.629371447619443e-06,
|
||
|
|
"loss": 0.4978,
|
||
|
|
"mean_token_accuracy": 0.9440104200039059,
|
||
|
|
"num_tokens": 97010200.0,
|
||
|
|
"step": 117
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5244598388671875,
|
||
|
|
"epoch": 1.3258426966292136,
|
||
|
|
"grad_norm": 56.13362888266017,
|
||
|
|
"learning_rate": 4.621214053474374e-06,
|
||
|
|
"loss": 0.6082,
|
||
|
|
"mean_token_accuracy": 0.8971354227978736,
|
||
|
|
"num_tokens": 97852254.0,
|
||
|
|
"step": 118
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.539215087890625,
|
||
|
|
"epoch": 1.3370786516853932,
|
||
|
|
"grad_norm": 38.98016965907191,
|
||
|
|
"learning_rate": 4.612975213859487e-06,
|
||
|
|
"loss": 0.4563,
|
||
|
|
"mean_token_accuracy": 0.9492187530267984,
|
||
|
|
"num_tokens": 98679091.0,
|
||
|
|
"step": 119
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5576171875,
|
||
|
|
"epoch": 1.348314606741573,
|
||
|
|
"grad_norm": 40.255740846718474,
|
||
|
|
"learning_rate": 4.604655245110684e-06,
|
||
|
|
"loss": 0.4792,
|
||
|
|
"mean_token_accuracy": 0.9257812544237822,
|
||
|
|
"num_tokens": 99483720.0,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5303115844726562,
|
||
|
|
"epoch": 1.3595505617977528,
|
||
|
|
"grad_norm": 35.28949683471456,
|
||
|
|
"learning_rate": 4.596254466678877e-06,
|
||
|
|
"loss": 0.4717,
|
||
|
|
"mean_token_accuracy": 0.9179687548894435,
|
||
|
|
"num_tokens": 100322701.0,
|
||
|
|
"step": 121
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5253524780273438,
|
||
|
|
"epoch": 1.3707865168539326,
|
||
|
|
"grad_norm": 35.972964160783164,
|
||
|
|
"learning_rate": 4.5877732011177215e-06,
|
||
|
|
"loss": 0.4626,
|
||
|
|
"mean_token_accuracy": 0.9023437558207661,
|
||
|
|
"num_tokens": 101165734.0,
|
||
|
|
"step": 122
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5372314453125,
|
||
|
|
"epoch": 1.3820224719101124,
|
||
|
|
"grad_norm": 34.071476491719324,
|
||
|
|
"learning_rate": 4.579211774071229e-06,
|
||
|
|
"loss": 0.4247,
|
||
|
|
"mean_token_accuracy": 0.9283854209352285,
|
||
|
|
"num_tokens": 101981751.0,
|
||
|
|
"step": 123
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.51861572265625,
|
||
|
|
"epoch": 1.3932584269662922,
|
||
|
|
"grad_norm": 33.00619895893129,
|
||
|
|
"learning_rate": 4.570570514261272e-06,
|
||
|
|
"loss": 0.4043,
|
||
|
|
"mean_token_accuracy": 0.9388020869810134,
|
||
|
|
"num_tokens": 102873816.0,
|
||
|
|
"step": 124
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5457382202148438,
|
||
|
|
"epoch": 1.404494382022472,
|
||
|
|
"grad_norm": 33.283611733855004,
|
||
|
|
"learning_rate": 4.561849753474951e-06,
|
||
|
|
"loss": 0.3831,
|
||
|
|
"mean_token_accuracy": 0.9348958372138441,
|
||
|
|
"num_tokens": 103679964.0,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.551483154296875,
|
||
|
|
"epoch": 1.4157303370786516,
|
||
|
|
"grad_norm": 30.450265998368877,
|
||
|
|
"learning_rate": 4.553049826551864e-06,
|
||
|
|
"loss": 0.3586,
|
||
|
|
"mean_token_accuracy": 0.9440104200039059,
|
||
|
|
"num_tokens": 104483969.0,
|
||
|
|
"step": 126
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.544921875,
|
||
|
|
"epoch": 1.4269662921348314,
|
||
|
|
"grad_norm": 33.037901010787365,
|
||
|
|
"learning_rate": 4.544171071371246e-06,
|
||
|
|
"loss": 0.3743,
|
||
|
|
"mean_token_accuracy": 0.9348958372138441,
|
||
|
|
"num_tokens": 105281682.0,
|
||
|
|
"step": 127
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5335235595703125,
|
||
|
|
"epoch": 1.4382022471910112,
|
||
|
|
"grad_norm": 29.309084873235943,
|
||
|
|
"learning_rate": 4.535213828838998e-06,
|
||
|
|
"loss": 0.3215,
|
||
|
|
"mean_token_accuracy": 0.9479166697710752,
|
||
|
|
"num_tokens": 106098113.0,
|
||
|
|
"step": 128
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.536895751953125,
|
||
|
|
"epoch": 1.449438202247191,
|
||
|
|
"grad_norm": 29.992570415099404,
|
||
|
|
"learning_rate": 4.526178442874596e-06,
|
||
|
|
"loss": 0.3275,
|
||
|
|
"mean_token_accuracy": 0.9479166697710752,
|
||
|
|
"num_tokens": 106932062.0,
|
||
|
|
"step": 129
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.53948974609375,
|
||
|
|
"epoch": 1.4606741573033708,
|
||
|
|
"grad_norm": 29.198467800844845,
|
||
|
|
"learning_rate": 4.517065260397887e-06,
|
||
|
|
"loss": 0.3151,
|
||
|
|
"mean_token_accuracy": 0.9270833376795053,
|
||
|
|
"num_tokens": 107772618.0,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5549697875976562,
|
||
|
|
"epoch": 1.4719101123595506,
|
||
|
|
"grad_norm": 26.144198778784126,
|
||
|
|
"learning_rate": 4.5078746313157684e-06,
|
||
|
|
"loss": 0.2783,
|
||
|
|
"mean_token_accuracy": 0.9518229195382446,
|
||
|
|
"num_tokens": 108550420.0,
|
||
|
|
"step": 131
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5575103759765625,
|
||
|
|
"epoch": 1.4831460674157304,
|
||
|
|
"grad_norm": 26.803101306468495,
|
||
|
|
"learning_rate": 4.498606908508754e-06,
|
||
|
|
"loss": 0.2814,
|
||
|
|
"mean_token_accuracy": 0.9492187530267984,
|
||
|
|
"num_tokens": 109318814.0,
|
||
|
|
"step": 132
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5332794189453125,
|
||
|
|
"epoch": 1.49438202247191,
|
||
|
|
"grad_norm": 23.65296150298368,
|
||
|
|
"learning_rate": 4.489262447817421e-06,
|
||
|
|
"loss": 0.2551,
|
||
|
|
"mean_token_accuracy": 0.9596354190725833,
|
||
|
|
"num_tokens": 110158987.0,
|
||
|
|
"step": 133
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.531585693359375,
|
||
|
|
"epoch": 1.50561797752809,
|
||
|
|
"grad_norm": 22.686143776318026,
|
||
|
|
"learning_rate": 4.479841608028756e-06,
|
||
|
|
"loss": 0.2753,
|
||
|
|
"mean_token_accuracy": 0.9375000037252903,
|
||
|
|
"num_tokens": 111014455.0,
|
||
|
|
"step": 134
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5310440063476562,
|
||
|
|
"epoch": 1.5168539325842696,
|
||
|
|
"grad_norm": 22.75785890829976,
|
||
|
|
"learning_rate": 4.470344750862369e-06,
|
||
|
|
"loss": 0.2744,
|
||
|
|
"mean_token_accuracy": 0.9361979204695672,
|
||
|
|
"num_tokens": 111844565.0,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5299530029296875,
|
||
|
|
"epoch": 1.5280898876404494,
|
||
|
|
"grad_norm": 19.646757022226197,
|
||
|
|
"learning_rate": 4.460772240956609e-06,
|
||
|
|
"loss": 0.236,
|
||
|
|
"mean_token_accuracy": 0.9505208362825215,
|
||
|
|
"num_tokens": 112693858.0,
|
||
|
|
"step": 136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.530914306640625,
|
||
|
|
"epoch": 1.5393258426966292,
|
||
|
|
"grad_norm": 18.581871649623455,
|
||
|
|
"learning_rate": 4.4511244458545666e-06,
|
||
|
|
"loss": 0.2337,
|
||
|
|
"mean_token_accuracy": 0.9479166697710752,
|
||
|
|
"num_tokens": 113552373.0,
|
||
|
|
"step": 137
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5373458862304688,
|
||
|
|
"epoch": 1.550561797752809,
|
||
|
|
"grad_norm": 18.480906621711842,
|
||
|
|
"learning_rate": 4.441401735989958e-06,
|
||
|
|
"loss": 0.224,
|
||
|
|
"mean_token_accuracy": 0.9466145865153521,
|
||
|
|
"num_tokens": 114386318.0,
|
||
|
|
"step": 138
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5466995239257812,
|
||
|
|
"epoch": 1.5617977528089888,
|
||
|
|
"grad_norm": 19.092891408382183,
|
||
|
|
"learning_rate": 4.431604484672905e-06,
|
||
|
|
"loss": 0.2181,
|
||
|
|
"mean_token_accuracy": 0.9440104200039059,
|
||
|
|
"num_tokens": 115190002.0,
|
||
|
|
"step": 139
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5388870239257812,
|
||
|
|
"epoch": 1.5730337078651684,
|
||
|
|
"grad_norm": 16.375029133835223,
|
||
|
|
"learning_rate": 4.421733068075596e-06,
|
||
|
|
"loss": 0.218,
|
||
|
|
"mean_token_accuracy": 0.9414062534924597,
|
||
|
|
"num_tokens": 116010674.0,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5213394165039062,
|
||
|
|
"epoch": 1.5842696629213484,
|
||
|
|
"grad_norm": 15.182778483601002,
|
||
|
|
"learning_rate": 4.411787865217847e-06,
|
||
|
|
"loss": 0.2018,
|
||
|
|
"mean_token_accuracy": 0.9479166697710752,
|
||
|
|
"num_tokens": 116866501.0,
|
||
|
|
"step": 141
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5382003784179688,
|
||
|
|
"epoch": 1.595505617977528,
|
||
|
|
"grad_norm": 16.027990698815895,
|
||
|
|
"learning_rate": 4.401769257952551e-06,
|
||
|
|
"loss": 0.1885,
|
||
|
|
"mean_token_accuracy": 0.9570312525611371,
|
||
|
|
"num_tokens": 117690211.0,
|
||
|
|
"step": 142
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.51544189453125,
|
||
|
|
"epoch": 1.606741573033708,
|
||
|
|
"grad_norm": 19.861091704846544,
|
||
|
|
"learning_rate": 4.3916776309510115e-06,
|
||
|
|
"loss": 0.1953,
|
||
|
|
"mean_token_accuracy": 0.9401041702367365,
|
||
|
|
"num_tokens": 118570203.0,
|
||
|
|
"step": 143
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5500030517578125,
|
||
|
|
"epoch": 1.6179775280898876,
|
||
|
|
"grad_norm": 20.061583831622784,
|
||
|
|
"learning_rate": 4.381513371688174e-06,
|
||
|
|
"loss": 0.184,
|
||
|
|
"mean_token_accuracy": 0.9492187530267984,
|
||
|
|
"num_tokens": 119362860.0,
|
||
|
|
"step": 144
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5243148803710938,
|
||
|
|
"epoch": 1.6292134831460674,
|
||
|
|
"grad_norm": 13.08141096028655,
|
||
|
|
"learning_rate": 4.3712768704277535e-06,
|
||
|
|
"loss": 0.1714,
|
||
|
|
"mean_token_accuracy": 0.9531250027939677,
|
||
|
|
"num_tokens": 120222412.0,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5271530151367188,
|
||
|
|
"epoch": 1.6404494382022472,
|
||
|
|
"grad_norm": 30.698080213743623,
|
||
|
|
"learning_rate": 4.360968520207241e-06,
|
||
|
|
"loss": 0.2563,
|
||
|
|
"mean_token_accuracy": 0.9036458390764892,
|
||
|
|
"num_tokens": 121059163.0,
|
||
|
|
"step": 146
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5256195068359375,
|
||
|
|
"epoch": 1.651685393258427,
|
||
|
|
"grad_norm": 11.372090831152036,
|
||
|
|
"learning_rate": 4.35058871682282e-06,
|
||
|
|
"loss": 0.1618,
|
||
|
|
"mean_token_accuracy": 0.955729169305414,
|
||
|
|
"num_tokens": 121918453.0,
|
||
|
|
"step": 147
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.537628173828125,
|
||
|
|
"epoch": 1.6629213483146068,
|
||
|
|
"grad_norm": 18.04796633205754,
|
||
|
|
"learning_rate": 4.340137858814168e-06,
|
||
|
|
"loss": 0.1884,
|
||
|
|
"mean_token_accuracy": 0.9348958372138441,
|
||
|
|
"num_tokens": 122723340.0,
|
||
|
|
"step": 148
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.536773681640625,
|
||
|
|
"epoch": 1.6741573033707864,
|
||
|
|
"grad_norm": 10.93001525080465,
|
||
|
|
"learning_rate": 4.329616347449154e-06,
|
||
|
|
"loss": 0.1552,
|
||
|
|
"mean_token_accuracy": 0.9596354190725833,
|
||
|
|
"num_tokens": 123541295.0,
|
||
|
|
"step": 149
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5089187622070312,
|
||
|
|
"epoch": 1.6853932584269664,
|
||
|
|
"grad_norm": 20.39620342458635,
|
||
|
|
"learning_rate": 4.3190245867084275e-06,
|
||
|
|
"loss": 0.1989,
|
||
|
|
"mean_token_accuracy": 0.9322916707023978,
|
||
|
|
"num_tokens": 124435851.0,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5318374633789062,
|
||
|
|
"epoch": 1.696629213483146,
|
||
|
|
"grad_norm": 13.331468443274261,
|
||
|
|
"learning_rate": 4.308362983269916e-06,
|
||
|
|
"loss": 0.159,
|
||
|
|
"mean_token_accuracy": 0.9531250027939677,
|
||
|
|
"num_tokens": 125265567.0,
|
||
|
|
"step": 151
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.53662109375,
|
||
|
|
"epoch": 1.7078651685393258,
|
||
|
|
"grad_norm": 23.59150612291166,
|
||
|
|
"learning_rate": 4.297631946493202e-06,
|
||
|
|
"loss": 0.2394,
|
||
|
|
"mean_token_accuracy": 0.9114583386108279,
|
||
|
|
"num_tokens": 126117998.0,
|
||
|
|
"step": 152
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5291519165039062,
|
||
|
|
"epoch": 1.7191011235955056,
|
||
|
|
"grad_norm": 16.96931279554441,
|
||
|
|
"learning_rate": 4.2868318884038075e-06,
|
||
|
|
"loss": 0.1939,
|
||
|
|
"mean_token_accuracy": 0.9257812544237822,
|
||
|
|
"num_tokens": 126968965.0,
|
||
|
|
"step": 153
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5551834106445312,
|
||
|
|
"epoch": 1.7303370786516854,
|
||
|
|
"grad_norm": 9.706337234757527,
|
||
|
|
"learning_rate": 4.275963223677379e-06,
|
||
|
|
"loss": 0.1495,
|
||
|
|
"mean_token_accuracy": 0.9661458353511989,
|
||
|
|
"num_tokens": 127735103.0,
|
||
|
|
"step": 154
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.55059814453125,
|
||
|
|
"epoch": 1.7415730337078652,
|
||
|
|
"grad_norm": 14.998195966114025,
|
||
|
|
"learning_rate": 4.265026369623761e-06,
|
||
|
|
"loss": 0.1691,
|
||
|
|
"mean_token_accuracy": 0.9466145865153521,
|
||
|
|
"num_tokens": 128567014.0,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5133056640625,
|
||
|
|
"epoch": 1.7528089887640448,
|
||
|
|
"grad_norm": 8.38785965664328,
|
||
|
|
"learning_rate": 4.254021746170972e-06,
|
||
|
|
"loss": 0.1674,
|
||
|
|
"mean_token_accuracy": 0.9505208362825215,
|
||
|
|
"num_tokens": 129446494.0,
|
||
|
|
"step": 156
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5388946533203125,
|
||
|
|
"epoch": 1.7640449438202248,
|
||
|
|
"grad_norm": 8.628009379073635,
|
||
|
|
"learning_rate": 4.242949775849083e-06,
|
||
|
|
"loss": 0.1342,
|
||
|
|
"mean_token_accuracy": 0.9583333358168602,
|
||
|
|
"num_tokens": 130282793.0,
|
||
|
|
"step": 157
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5218429565429688,
|
||
|
|
"epoch": 1.7752808988764044,
|
||
|
|
"grad_norm": 8.979344060078741,
|
||
|
|
"learning_rate": 4.231810883773999e-06,
|
||
|
|
"loss": 0.1498,
|
||
|
|
"mean_token_accuracy": 0.9492187530267984,
|
||
|
|
"num_tokens": 131144403.0,
|
||
|
|
"step": 158
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5305633544921875,
|
||
|
|
"epoch": 1.7865168539325844,
|
||
|
|
"grad_norm": 10.386798650341824,
|
||
|
|
"learning_rate": 4.220605497631125e-06,
|
||
|
|
"loss": 0.1208,
|
||
|
|
"mean_token_accuracy": 0.9648437520954758,
|
||
|
|
"num_tokens": 131969457.0,
|
||
|
|
"step": 159
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5267715454101562,
|
||
|
|
"epoch": 1.797752808988764,
|
||
|
|
"grad_norm": 7.845366778688198,
|
||
|
|
"learning_rate": 4.209334047658956e-06,
|
||
|
|
"loss": 0.1339,
|
||
|
|
"mean_token_accuracy": 0.9596354190725833,
|
||
|
|
"num_tokens": 132802344.0,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.51898193359375,
|
||
|
|
"epoch": 1.8089887640449438,
|
||
|
|
"grad_norm": 15.048259815135598,
|
||
|
|
"learning_rate": 4.197996966632551e-06,
|
||
|
|
"loss": 0.1477,
|
||
|
|
"mean_token_accuracy": 0.9531250027939677,
|
||
|
|
"num_tokens": 133686657.0,
|
||
|
|
"step": 161
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5142669677734375,
|
||
|
|
"epoch": 1.8202247191011236,
|
||
|
|
"grad_norm": 6.170398947994508,
|
||
|
|
"learning_rate": 4.186594689846919e-06,
|
||
|
|
"loss": 0.1196,
|
||
|
|
"mean_token_accuracy": 0.9544270860496908,
|
||
|
|
"num_tokens": 134569286.0,
|
||
|
|
"step": 162
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5417022705078125,
|
||
|
|
"epoch": 1.8314606741573034,
|
||
|
|
"grad_norm": 6.977797842018329,
|
||
|
|
"learning_rate": 4.175127655100306e-06,
|
||
|
|
"loss": 0.1176,
|
||
|
|
"mean_token_accuracy": 0.955729169305414,
|
||
|
|
"num_tokens": 135384479.0,
|
||
|
|
"step": 163
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5277252197265625,
|
||
|
|
"epoch": 1.8426966292134832,
|
||
|
|
"grad_norm": 12.599667992263445,
|
||
|
|
"learning_rate": 4.163596302677383e-06,
|
||
|
|
"loss": 0.1273,
|
||
|
|
"mean_token_accuracy": 0.9531250027939677,
|
||
|
|
"num_tokens": 136199109.0,
|
||
|
|
"step": 164
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.523162841796875,
|
||
|
|
"epoch": 1.8539325842696628,
|
||
|
|
"grad_norm": 6.79687806434144,
|
||
|
|
"learning_rate": 4.152001075332342e-06,
|
||
|
|
"loss": 0.1134,
|
||
|
|
"mean_token_accuracy": 0.9648437520954758,
|
||
|
|
"num_tokens": 137047704.0,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5387954711914062,
|
||
|
|
"epoch": 1.8651685393258428,
|
||
|
|
"grad_norm": 5.293968888823296,
|
||
|
|
"learning_rate": 4.140342418271897e-06,
|
||
|
|
"loss": 0.0948,
|
||
|
|
"mean_token_accuracy": 0.9700520851183683,
|
||
|
|
"num_tokens": 137837799.0,
|
||
|
|
"step": 166
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5280380249023438,
|
||
|
|
"epoch": 1.8764044943820224,
|
||
|
|
"grad_norm": 9.734221142762577,
|
||
|
|
"learning_rate": 4.128620779138191e-06,
|
||
|
|
"loss": 0.1198,
|
||
|
|
"mean_token_accuracy": 0.9609375023283064,
|
||
|
|
"num_tokens": 138667003.0,
|
||
|
|
"step": 167
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5301589965820312,
|
||
|
|
"epoch": 1.8876404494382022,
|
||
|
|
"grad_norm": 10.983716082693723,
|
||
|
|
"learning_rate": 4.116836607991603e-06,
|
||
|
|
"loss": 0.1103,
|
||
|
|
"mean_token_accuracy": 0.9661458353511989,
|
||
|
|
"num_tokens": 139486069.0,
|
||
|
|
"step": 168
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5339202880859375,
|
||
|
|
"epoch": 1.898876404494382,
|
||
|
|
"grad_norm": 4.754192213758688,
|
||
|
|
"learning_rate": 4.104990357293478e-06,
|
||
|
|
"loss": 0.0976,
|
||
|
|
"mean_token_accuracy": 0.9752604181412607,
|
||
|
|
"num_tokens": 140301431.0,
|
||
|
|
"step": 169
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5370941162109375,
|
||
|
|
"epoch": 1.9101123595505618,
|
||
|
|
"grad_norm": 7.392070355069441,
|
||
|
|
"learning_rate": 4.09308248188874e-06,
|
||
|
|
"loss": 0.0953,
|
||
|
|
"mean_token_accuracy": 0.9635416688397527,
|
||
|
|
"num_tokens": 141127372.0,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5226898193359375,
|
||
|
|
"epoch": 1.9213483146067416,
|
||
|
|
"grad_norm": 14.855299560449547,
|
||
|
|
"learning_rate": 4.081113438988443e-06,
|
||
|
|
"loss": 0.1426,
|
||
|
|
"mean_token_accuracy": 0.9375000037252903,
|
||
|
|
"num_tokens": 141960887.0,
|
||
|
|
"step": 171
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5215377807617188,
|
||
|
|
"epoch": 1.9325842696629212,
|
||
|
|
"grad_norm": 5.677690032968136,
|
||
|
|
"learning_rate": 4.069083688152206e-06,
|
||
|
|
"loss": 0.0999,
|
||
|
|
"mean_token_accuracy": 0.9700520851183683,
|
||
|
|
"num_tokens": 142790358.0,
|
||
|
|
"step": 172
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5326385498046875,
|
||
|
|
"epoch": 1.9438202247191012,
|
||
|
|
"grad_norm": 17.868174534877028,
|
||
|
|
"learning_rate": 4.056993691270569e-06,
|
||
|
|
"loss": 0.1516,
|
||
|
|
"mean_token_accuracy": 0.9414062534924597,
|
||
|
|
"num_tokens": 143606992.0,
|
||
|
|
"step": 173
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5331192016601562,
|
||
|
|
"epoch": 1.9550561797752808,
|
||
|
|
"grad_norm": 10.649577515960202,
|
||
|
|
"learning_rate": 4.044843912547262e-06,
|
||
|
|
"loss": 0.1173,
|
||
|
|
"mean_token_accuracy": 0.9609375023283064,
|
||
|
|
"num_tokens": 144422922.0,
|
||
|
|
"step": 174
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5289459228515625,
|
||
|
|
"epoch": 1.9662921348314608,
|
||
|
|
"grad_norm": 19.71433225206811,
|
||
|
|
"learning_rate": 4.032634818481382e-06,
|
||
|
|
"loss": 0.1615,
|
||
|
|
"mean_token_accuracy": 0.9205729214008898,
|
||
|
|
"num_tokens": 145257110.0,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5237350463867188,
|
||
|
|
"epoch": 1.9775280898876404,
|
||
|
|
"grad_norm": 23.109552808231093,
|
||
|
|
"learning_rate": 4.020366877849477e-06,
|
||
|
|
"loss": 0.1769,
|
||
|
|
"mean_token_accuracy": 0.9101562553551048,
|
||
|
|
"num_tokens": 146089047.0,
|
||
|
|
"step": 176
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5413665771484375,
|
||
|
|
"epoch": 1.9887640449438202,
|
||
|
|
"grad_norm": 11.897669907113016,
|
||
|
|
"learning_rate": 4.008040561687549e-06,
|
||
|
|
"loss": 0.1259,
|
||
|
|
"mean_token_accuracy": 0.9466145865153521,
|
||
|
|
"num_tokens": 146909622.0,
|
||
|
|
"step": 177
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5229339599609375,
|
||
|
|
"epoch": 2.0,
|
||
|
|
"grad_norm": 14.618156823772459,
|
||
|
|
"learning_rate": 3.995656343272969e-06,
|
||
|
|
"loss": 0.1309,
|
||
|
|
"mean_token_accuracy": 0.9492187530267984,
|
||
|
|
"num_tokens": 147777645.0,
|
||
|
|
"step": 178
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.522613525390625,
|
||
|
|
"epoch": 2.0112359550561796,
|
||
|
|
"grad_norm": 22.246161812882423,
|
||
|
|
"learning_rate": 3.983214698106305e-06,
|
||
|
|
"loss": 0.1777,
|
||
|
|
"mean_token_accuracy": 0.9179687548894435,
|
||
|
|
"num_tokens": 148648659.0,
|
||
|
|
"step": 179
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5502471923828125,
|
||
|
|
"epoch": 2.0224719101123596,
|
||
|
|
"grad_norm": 8.734834674837812,
|
||
|
|
"learning_rate": 3.970716103893065e-06,
|
||
|
|
"loss": 0.1015,
|
||
|
|
"mean_token_accuracy": 0.967447918606922,
|
||
|
|
"num_tokens": 149458601.0,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5521392822265625,
|
||
|
|
"epoch": 2.033707865168539,
|
||
|
|
"grad_norm": 5.921310602189352,
|
||
|
|
"learning_rate": 3.958161040525354e-06,
|
||
|
|
"loss": 0.1132,
|
||
|
|
"mean_token_accuracy": 0.9609375023283064,
|
||
|
|
"num_tokens": 150274927.0,
|
||
|
|
"step": 181
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.539154052734375,
|
||
|
|
"epoch": 2.044943820224719,
|
||
|
|
"grad_norm": 12.815169602354262,
|
||
|
|
"learning_rate": 3.94554999006345e-06,
|
||
|
|
"loss": 0.1233,
|
||
|
|
"mean_token_accuracy": 0.9466145865153521,
|
||
|
|
"num_tokens": 151122405.0,
|
||
|
|
"step": 182
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5272445678710938,
|
||
|
|
"epoch": 2.056179775280899,
|
||
|
|
"grad_norm": 8.808389478714032,
|
||
|
|
"learning_rate": 3.932883436717291e-06,
|
||
|
|
"loss": 0.1029,
|
||
|
|
"mean_token_accuracy": 0.9596354190725833,
|
||
|
|
"num_tokens": 151962476.0,
|
||
|
|
"step": 183
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5394363403320312,
|
||
|
|
"epoch": 2.067415730337079,
|
||
|
|
"grad_norm": 3.5484560249089445,
|
||
|
|
"learning_rate": 3.92016186682789e-06,
|
||
|
|
"loss": 0.0751,
|
||
|
|
"mean_token_accuracy": 0.9726562516298145,
|
||
|
|
"num_tokens": 152793064.0,
|
||
|
|
"step": 184
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.516876220703125,
|
||
|
|
"epoch": 2.0786516853932584,
|
||
|
|
"grad_norm": 8.176851356833051,
|
||
|
|
"learning_rate": 3.907385768848656e-06,
|
||
|
|
"loss": 0.1042,
|
||
|
|
"mean_token_accuracy": 0.9609375023283064,
|
||
|
|
"num_tokens": 153670203.0,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5299301147460938,
|
||
|
|
"epoch": 2.0898876404494384,
|
||
|
|
"grad_norm": 5.999812660978029,
|
||
|
|
"learning_rate": 3.894555633326642e-06,
|
||
|
|
"loss": 0.0825,
|
||
|
|
"mean_token_accuracy": 0.9726562516298145,
|
||
|
|
"num_tokens": 154512429.0,
|
||
|
|
"step": 186
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.522918701171875,
|
||
|
|
"epoch": 2.101123595505618,
|
||
|
|
"grad_norm": 10.700306121124873,
|
||
|
|
"learning_rate": 3.88167195288371e-06,
|
||
|
|
"loss": 0.0943,
|
||
|
|
"mean_token_accuracy": 0.967447918606922,
|
||
|
|
"num_tokens": 155359706.0,
|
||
|
|
"step": 187
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5169754028320312,
|
||
|
|
"epoch": 2.1123595505617976,
|
||
|
|
"grad_norm": 8.896828713381819,
|
||
|
|
"learning_rate": 3.868735222197614e-06,
|
||
|
|
"loss": 0.0897,
|
||
|
|
"mean_token_accuracy": 0.9739583348855376,
|
||
|
|
"num_tokens": 156201601.0,
|
||
|
|
"step": 188
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5218658447265625,
|
||
|
|
"epoch": 2.1235955056179776,
|
||
|
|
"grad_norm": 10.96828625347169,
|
||
|
|
"learning_rate": 3.85574593798301e-06,
|
||
|
|
"loss": 0.0896,
|
||
|
|
"mean_token_accuracy": 0.967447918606922,
|
||
|
|
"num_tokens": 157033011.0,
|
||
|
|
"step": 189
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5191421508789062,
|
||
|
|
"epoch": 2.134831460674157,
|
||
|
|
"grad_norm": 10.179919221478066,
|
||
|
|
"learning_rate": 3.842704598972384e-06,
|
||
|
|
"loss": 0.0886,
|
||
|
|
"mean_token_accuracy": 0.9661458353511989,
|
||
|
|
"num_tokens": 157878092.0,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.524810791015625,
|
||
|
|
"epoch": 2.146067415730337,
|
||
|
|
"grad_norm": 5.580920039451122,
|
||
|
|
"learning_rate": 3.8296117058969e-06,
|
||
|
|
"loss": 0.1033,
|
||
|
|
"mean_token_accuracy": 0.9635416688397527,
|
||
|
|
"num_tokens": 158716668.0,
|
||
|
|
"step": 191
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.51739501953125,
|
||
|
|
"epoch": 2.157303370786517,
|
||
|
|
"grad_norm": 8.577423884096872,
|
||
|
|
"learning_rate": 3.816467761467175e-06,
|
||
|
|
"loss": 0.0731,
|
||
|
|
"mean_token_accuracy": 0.977864584652707,
|
||
|
|
"num_tokens": 159542526.0,
|
||
|
|
"step": 192
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.51837158203125,
|
||
|
|
"epoch": 2.168539325842697,
|
||
|
|
"grad_norm": 6.116622062216189,
|
||
|
|
"learning_rate": 3.80327327035398e-06,
|
||
|
|
"loss": 0.0775,
|
||
|
|
"mean_token_accuracy": 0.9687500018626451,
|
||
|
|
"num_tokens": 160368637.0,
|
||
|
|
"step": 193
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5116424560546875,
|
||
|
|
"epoch": 2.1797752808988764,
|
||
|
|
"grad_norm": 4.322788821439791,
|
||
|
|
"learning_rate": 3.7900287391688584e-06,
|
||
|
|
"loss": 0.0765,
|
||
|
|
"mean_token_accuracy": 0.9713541683740914,
|
||
|
|
"num_tokens": 161223870.0,
|
||
|
|
"step": 194
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.506072998046875,
|
||
|
|
"epoch": 2.191011235955056,
|
||
|
|
"grad_norm": 5.948522126434665,
|
||
|
|
"learning_rate": 3.776734676444678e-06,
|
||
|
|
"loss": 0.0751,
|
||
|
|
"mean_token_accuracy": 0.977864584652707,
|
||
|
|
"num_tokens": 162059887.0,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.53485107421875,
|
||
|
|
"epoch": 2.202247191011236,
|
||
|
|
"grad_norm": 6.939964205005145,
|
||
|
|
"learning_rate": 3.763391592616104e-06,
|
||
|
|
"loss": 0.0912,
|
||
|
|
"mean_token_accuracy": 0.967447918606922,
|
||
|
|
"num_tokens": 162850931.0,
|
||
|
|
"step": 196
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5248184204101562,
|
||
|
|
"epoch": 2.2134831460674156,
|
||
|
|
"grad_norm": 6.111124160198039,
|
||
|
|
"learning_rate": 3.7500000000000005e-06,
|
||
|
|
"loss": 0.0627,
|
||
|
|
"mean_token_accuracy": 0.9791666679084301,
|
||
|
|
"num_tokens": 163662524.0,
|
||
|
|
"step": 197
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.513763427734375,
|
||
|
|
"epoch": 2.2247191011235956,
|
||
|
|
"grad_norm": 4.072432706569934,
|
||
|
|
"learning_rate": 3.7365604127757584e-06,
|
||
|
|
"loss": 0.0562,
|
||
|
|
"mean_token_accuracy": 0.9791666679084301,
|
||
|
|
"num_tokens": 164489538.0,
|
||
|
|
"step": 198
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5129623413085938,
|
||
|
|
"epoch": 2.235955056179775,
|
||
|
|
"grad_norm": 9.616365421634299,
|
||
|
|
"learning_rate": 3.7230733469655554e-06,
|
||
|
|
"loss": 0.0767,
|
||
|
|
"mean_token_accuracy": 0.977864584652707,
|
||
|
|
"num_tokens": 165320322.0,
|
||
|
|
"step": 199
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.51519775390625,
|
||
|
|
"epoch": 2.247191011235955,
|
||
|
|
"grad_norm": 4.926175655349057,
|
||
|
|
"learning_rate": 3.709539320414544e-06,
|
||
|
|
"loss": 0.0689,
|
||
|
|
"mean_token_accuracy": 0.977864584652707,
|
||
|
|
"num_tokens": 166151934.0,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5228424072265625,
|
||
|
|
"epoch": 2.258426966292135,
|
||
|
|
"grad_norm": 7.13563776574761,
|
||
|
|
"learning_rate": 3.6959588527709635e-06,
|
||
|
|
"loss": 0.0801,
|
||
|
|
"mean_token_accuracy": 0.9648437520954758,
|
||
|
|
"num_tokens": 166958027.0,
|
||
|
|
"step": 201
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.515106201171875,
|
||
|
|
"epoch": 2.2696629213483144,
|
||
|
|
"grad_norm": 8.484197218001166,
|
||
|
|
"learning_rate": 3.6823324654661923e-06,
|
||
|
|
"loss": 0.0756,
|
||
|
|
"mean_token_accuracy": 0.9752604181412607,
|
||
|
|
"num_tokens": 167797700.0,
|
||
|
|
"step": 202
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5139694213867188,
|
||
|
|
"epoch": 2.2808988764044944,
|
||
|
|
"grad_norm": 4.551393455292689,
|
||
|
|
"learning_rate": 3.6686606816947264e-06,
|
||
|
|
"loss": 0.0668,
|
||
|
|
"mean_token_accuracy": 0.9791666679084301,
|
||
|
|
"num_tokens": 168633047.0,
|
||
|
|
"step": 203
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.511627197265625,
|
||
|
|
"epoch": 2.292134831460674,
|
||
|
|
"grad_norm": 13.590007002387853,
|
||
|
|
"learning_rate": 3.6549440263940878e-06,
|
||
|
|
"loss": 0.1093,
|
||
|
|
"mean_token_accuracy": 0.955729169305414,
|
||
|
|
"num_tokens": 169477710.0,
|
||
|
|
"step": 204
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5248260498046875,
|
||
|
|
"epoch": 2.303370786516854,
|
||
|
|
"grad_norm": 8.339146711131134,
|
||
|
|
"learning_rate": 3.6411830262246755e-06,
|
||
|
|
"loss": 0.0797,
|
||
|
|
"mean_token_accuracy": 0.9713541683740914,
|
||
|
|
"num_tokens": 170293561.0,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.52813720703125,
|
||
|
|
"epoch": 2.3146067415730336,
|
||
|
|
"grad_norm": 6.075928027622734,
|
||
|
|
"learning_rate": 3.627378209549537e-06,
|
||
|
|
"loss": 0.0611,
|
||
|
|
"mean_token_accuracy": 0.9817708344198763,
|
||
|
|
"num_tokens": 171094703.0,
|
||
|
|
"step": 206
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5260696411132812,
|
||
|
|
"epoch": 2.3258426966292136,
|
||
|
|
"grad_norm": 8.027691228204391,
|
||
|
|
"learning_rate": 3.6135301064140856e-06,
|
||
|
|
"loss": 0.0788,
|
||
|
|
"mean_token_accuracy": 0.967447918606922,
|
||
|
|
"num_tokens": 171940750.0,
|
||
|
|
"step": 207
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5362701416015625,
|
||
|
|
"epoch": 2.337078651685393,
|
||
|
|
"grad_norm": 4.7127759183440405,
|
||
|
|
"learning_rate": 3.599639248525749e-06,
|
||
|
|
"loss": 0.0656,
|
||
|
|
"mean_token_accuracy": 0.9791666679084301,
|
||
|
|
"num_tokens": 172746388.0,
|
||
|
|
"step": 208
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5174713134765625,
|
||
|
|
"epoch": 2.348314606741573,
|
||
|
|
"grad_norm": 12.123204299345215,
|
||
|
|
"learning_rate": 3.5857061692335503e-06,
|
||
|
|
"loss": 0.1167,
|
||
|
|
"mean_token_accuracy": 0.9531250027939677,
|
||
|
|
"num_tokens": 173596511.0,
|
||
|
|
"step": 209
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5310440063476562,
|
||
|
|
"epoch": 2.359550561797753,
|
||
|
|
"grad_norm": 3.6583692027389065,
|
||
|
|
"learning_rate": 3.5717314035076355e-06,
|
||
|
|
"loss": 0.0552,
|
||
|
|
"mean_token_accuracy": 0.9830729176755995,
|
||
|
|
"num_tokens": 174399527.0,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5257034301757812,
|
||
|
|
"epoch": 2.370786516853933,
|
||
|
|
"grad_norm": 10.990027992636044,
|
||
|
|
"learning_rate": 3.5577154879187286e-06,
|
||
|
|
"loss": 0.1029,
|
||
|
|
"mean_token_accuracy": 0.9648437520954758,
|
||
|
|
"num_tokens": 175213185.0,
|
||
|
|
"step": 211
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5306854248046875,
|
||
|
|
"epoch": 2.3820224719101124,
|
||
|
|
"grad_norm": 9.765128326680557,
|
||
|
|
"learning_rate": 3.5436589606175296e-06,
|
||
|
|
"loss": 0.0794,
|
||
|
|
"mean_token_accuracy": 0.9700520851183683,
|
||
|
|
"num_tokens": 176038856.0,
|
||
|
|
"step": 212
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5189208984375,
|
||
|
|
"epoch": 2.393258426966292,
|
||
|
|
"grad_norm": 8.116475495273617,
|
||
|
|
"learning_rate": 3.5295623613140563e-06,
|
||
|
|
"loss": 0.0727,
|
||
|
|
"mean_token_accuracy": 0.9726562516298145,
|
||
|
|
"num_tokens": 176888989.0,
|
||
|
|
"step": 213
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.529022216796875,
|
||
|
|
"epoch": 2.404494382022472,
|
||
|
|
"grad_norm": 7.3861491529542045,
|
||
|
|
"learning_rate": 3.5154262312569134e-06,
|
||
|
|
"loss": 0.0751,
|
||
|
|
"mean_token_accuracy": 0.967447918606922,
|
||
|
|
"num_tokens": 177734742.0,
|
||
|
|
"step": 214
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5261993408203125,
|
||
|
|
"epoch": 2.4157303370786516,
|
||
|
|
"grad_norm": 3.2893739082717977,
|
||
|
|
"learning_rate": 3.501251113212521e-06,
|
||
|
|
"loss": 0.0606,
|
||
|
|
"mean_token_accuracy": 0.977864584652707,
|
||
|
|
"num_tokens": 178565645.0,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5221099853515625,
|
||
|
|
"epoch": 2.4269662921348316,
|
||
|
|
"grad_norm": 4.62174053824965,
|
||
|
|
"learning_rate": 3.4870375514442677e-06,
|
||
|
|
"loss": 0.0474,
|
||
|
|
"mean_token_accuracy": 0.9869791674427688,
|
||
|
|
"num_tokens": 179403274.0,
|
||
|
|
"step": 216
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5177154541015625,
|
||
|
|
"epoch": 2.438202247191011,
|
||
|
|
"grad_norm": 4.936685319427876,
|
||
|
|
"learning_rate": 3.4727860916916143e-06,
|
||
|
|
"loss": 0.0742,
|
||
|
|
"mean_token_accuracy": 0.9713541683740914,
|
||
|
|
"num_tokens": 180259819.0,
|
||
|
|
"step": 217
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5179824829101562,
|
||
|
|
"epoch": 2.449438202247191,
|
||
|
|
"grad_norm": 5.420191287061157,
|
||
|
|
"learning_rate": 3.458497281149143e-06,
|
||
|
|
"loss": 0.0727,
|
||
|
|
"mean_token_accuracy": 0.977864584652707,
|
||
|
|
"num_tokens": 181106828.0,
|
||
|
|
"step": 218
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5365829467773438,
|
||
|
|
"epoch": 2.460674157303371,
|
||
|
|
"grad_norm": 5.80819935292701,
|
||
|
|
"learning_rate": 3.444171668445544e-06,
|
||
|
|
"loss": 0.0576,
|
||
|
|
"mean_token_accuracy": 0.9765625013969839,
|
||
|
|
"num_tokens": 181903812.0,
|
||
|
|
"step": 219
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5154647827148438,
|
||
|
|
"epoch": 2.4719101123595504,
|
||
|
|
"grad_norm": 4.646194834229386,
|
||
|
|
"learning_rate": 3.429809803622551e-06,
|
||
|
|
"loss": 0.0652,
|
||
|
|
"mean_token_accuracy": 0.9830729176755995,
|
||
|
|
"num_tokens": 182736365.0,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5201644897460938,
|
||
|
|
"epoch": 2.4831460674157304,
|
||
|
|
"grad_norm": 4.3253828750360075,
|
||
|
|
"learning_rate": 3.415412238113823e-06,
|
||
|
|
"loss": 0.0592,
|
||
|
|
"mean_token_accuracy": 0.9843750009313226,
|
||
|
|
"num_tokens": 183557658.0,
|
||
|
|
"step": 221
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5149002075195312,
|
||
|
|
"epoch": 2.49438202247191,
|
||
|
|
"grad_norm": 3.335909978206754,
|
||
|
|
"learning_rate": 3.400979524723773e-06,
|
||
|
|
"loss": 0.0446,
|
||
|
|
"mean_token_accuracy": 0.989583333954215,
|
||
|
|
"num_tokens": 184415937.0,
|
||
|
|
"step": 222
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5210342407226562,
|
||
|
|
"epoch": 2.50561797752809,
|
||
|
|
"grad_norm": 4.304889690569454,
|
||
|
|
"learning_rate": 3.386512217606339e-06,
|
||
|
|
"loss": 0.0582,
|
||
|
|
"mean_token_accuracy": 0.9830729176755995,
|
||
|
|
"num_tokens": 185226530.0,
|
||
|
|
"step": 223
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5176010131835938,
|
||
|
|
"epoch": 2.5168539325842696,
|
||
|
|
"grad_norm": 8.647373614702433,
|
||
|
|
"learning_rate": 3.372010872243711e-06,
|
||
|
|
"loss": 0.0624,
|
||
|
|
"mean_token_accuracy": 0.9791666679084301,
|
||
|
|
"num_tokens": 186062362.0,
|
||
|
|
"step": 224
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5189895629882812,
|
||
|
|
"epoch": 2.5280898876404496,
|
||
|
|
"grad_norm": 5.434934187796716,
|
||
|
|
"learning_rate": 3.357476045424998e-06,
|
||
|
|
"loss": 0.0524,
|
||
|
|
"mean_token_accuracy": 0.9830729176755995,
|
||
|
|
"num_tokens": 186907709.0,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.51287841796875,
|
||
|
|
"epoch": 2.539325842696629,
|
||
|
|
"grad_norm": 4.204918090662119,
|
||
|
|
"learning_rate": 3.342908295224854e-06,
|
||
|
|
"loss": 0.048,
|
||
|
|
"mean_token_accuracy": 0.9843750009313226,
|
||
|
|
"num_tokens": 187754524.0,
|
||
|
|
"step": 226
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5017776489257812,
|
||
|
|
"epoch": 2.550561797752809,
|
||
|
|
"grad_norm": 7.2126840274010755,
|
||
|
|
"learning_rate": 3.32830818098205e-06,
|
||
|
|
"loss": 0.0712,
|
||
|
|
"mean_token_accuracy": 0.9765625013969839,
|
||
|
|
"num_tokens": 188612569.0,
|
||
|
|
"step": 227
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5189361572265625,
|
||
|
|
"epoch": 2.561797752808989,
|
||
|
|
"grad_norm": 4.49147927637142,
|
||
|
|
"learning_rate": 3.313676263277995e-06,
|
||
|
|
"loss": 0.0506,
|
||
|
|
"mean_token_accuracy": 0.9817708344198763,
|
||
|
|
"num_tokens": 189431556.0,
|
||
|
|
"step": 228
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5114974975585938,
|
||
|
|
"epoch": 2.5730337078651684,
|
||
|
|
"grad_norm": 8.23007248713336,
|
||
|
|
"learning_rate": 3.299013103915214e-06,
|
||
|
|
"loss": 0.069,
|
||
|
|
"mean_token_accuracy": 0.9726562516298145,
|
||
|
|
"num_tokens": 190280003.0,
|
||
|
|
"step": 229
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5097122192382812,
|
||
|
|
"epoch": 2.5842696629213484,
|
||
|
|
"grad_norm": 9.609630695704968,
|
||
|
|
"learning_rate": 3.2843192658957775e-06,
|
||
|
|
"loss": 0.0753,
|
||
|
|
"mean_token_accuracy": 0.9726562516298145,
|
||
|
|
"num_tokens": 191150162.0,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5175933837890625,
|
||
|
|
"epoch": 2.595505617977528,
|
||
|
|
"grad_norm": 4.841549926893903,
|
||
|
|
"learning_rate": 3.269595313399683e-06,
|
||
|
|
"loss": 0.0595,
|
||
|
|
"mean_token_accuracy": 0.9804687511641532,
|
||
|
|
"num_tokens": 191982499.0,
|
||
|
|
"step": 231
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.53167724609375,
|
||
|
|
"epoch": 2.606741573033708,
|
||
|
|
"grad_norm": 5.9233921841701065,
|
||
|
|
"learning_rate": 3.2548418117631952e-06,
|
||
|
|
"loss": 0.0464,
|
||
|
|
"mean_token_accuracy": 0.9830729176755995,
|
||
|
|
"num_tokens": 192798994.0,
|
||
|
|
"step": 232
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5178909301757812,
|
||
|
|
"epoch": 2.6179775280898876,
|
||
|
|
"grad_norm": 5.634708544843851,
|
||
|
|
"learning_rate": 3.240059327457138e-06,
|
||
|
|
"loss": 0.0541,
|
||
|
|
"mean_token_accuracy": 0.9804687511641532,
|
||
|
|
"num_tokens": 193626603.0,
|
||
|
|
"step": 233
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5294189453125,
|
||
|
|
"epoch": 2.629213483146067,
|
||
|
|
"grad_norm": 4.84001706365338,
|
||
|
|
"learning_rate": 3.2252484280651453e-06,
|
||
|
|
"loss": 0.0558,
|
||
|
|
"mean_token_accuracy": 0.9843750009313226,
|
||
|
|
"num_tokens": 194426242.0,
|
||
|
|
"step": 234
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5013351440429688,
|
||
|
|
"epoch": 2.640449438202247,
|
||
|
|
"grad_norm": 3.350014032586222,
|
||
|
|
"learning_rate": 3.2104096822618657e-06,
|
||
|
|
"loss": 0.0402,
|
||
|
|
"mean_token_accuracy": 0.9908854172099382,
|
||
|
|
"num_tokens": 195272929.0,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5089569091796875,
|
||
|
|
"epoch": 2.6516853932584272,
|
||
|
|
"grad_norm": 4.702507794220196,
|
||
|
|
"learning_rate": 3.195543659791132e-06,
|
||
|
|
"loss": 0.0414,
|
||
|
|
"mean_token_accuracy": 0.9869791674427688,
|
||
|
|
"num_tokens": 196112635.0,
|
||
|
|
"step": 236
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5014801025390625,
|
||
|
|
"epoch": 2.662921348314607,
|
||
|
|
"grad_norm": 5.663346108909122,
|
||
|
|
"learning_rate": 3.1806509314440827e-06,
|
||
|
|
"loss": 0.0399,
|
||
|
|
"mean_token_accuracy": 0.9856770841870457,
|
||
|
|
"num_tokens": 196967166.0,
|
||
|
|
"step": 237
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5024566650390625,
|
||
|
|
"epoch": 2.6741573033707864,
|
||
|
|
"grad_norm": 7.232339951811718,
|
||
|
|
"learning_rate": 3.1657320690372464e-06,
|
||
|
|
"loss": 0.055,
|
||
|
|
"mean_token_accuracy": 0.9830729176755995,
|
||
|
|
"num_tokens": 197827153.0,
|
||
|
|
"step": 238
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.51092529296875,
|
||
|
|
"epoch": 2.6853932584269664,
|
||
|
|
"grad_norm": 6.540851314197654,
|
||
|
|
"learning_rate": 3.150787645390587e-06,
|
||
|
|
"loss": 0.0593,
|
||
|
|
"mean_token_accuracy": 0.9791666679084301,
|
||
|
|
"num_tokens": 198648707.0,
|
||
|
|
"step": 239
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5111083984375,
|
||
|
|
"epoch": 2.696629213483146,
|
||
|
|
"grad_norm": 5.662581940141526,
|
||
|
|
"learning_rate": 3.135818234305511e-06,
|
||
|
|
"loss": 0.0396,
|
||
|
|
"mean_token_accuracy": 0.9869791674427688,
|
||
|
|
"num_tokens": 199450306.0,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5034255981445312,
|
||
|
|
"epoch": 2.7078651685393256,
|
||
|
|
"grad_norm": 5.8373831882494684,
|
||
|
|
"learning_rate": 3.120824410542833e-06,
|
||
|
|
"loss": 0.0321,
|
||
|
|
"mean_token_accuracy": 0.9856770841870457,
|
||
|
|
"num_tokens": 200298341.0,
|
||
|
|
"step": 241
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.50079345703125,
|
||
|
|
"epoch": 2.7191011235955056,
|
||
|
|
"grad_norm": 5.910065701044768,
|
||
|
|
"learning_rate": 3.1058067498007094e-06,
|
||
|
|
"loss": 0.0422,
|
||
|
|
"mean_token_accuracy": 0.9830729176755995,
|
||
|
|
"num_tokens": 201149610.0,
|
||
|
|
"step": 242
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5079193115234375,
|
||
|
|
"epoch": 2.7303370786516856,
|
||
|
|
"grad_norm": 4.478641894279129,
|
||
|
|
"learning_rate": 3.090765828692534e-06,
|
||
|
|
"loss": 0.0379,
|
||
|
|
"mean_token_accuracy": 0.9869791674427688,
|
||
|
|
"num_tokens": 202005054.0,
|
||
|
|
"step": 243
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.515045166015625,
|
||
|
|
"epoch": 2.741573033707865,
|
||
|
|
"grad_norm": 5.6413331362338806,
|
||
|
|
"learning_rate": 3.0757022247248e-06,
|
||
|
|
"loss": 0.0439,
|
||
|
|
"mean_token_accuracy": 0.9817708344198763,
|
||
|
|
"num_tokens": 202822761.0,
|
||
|
|
"step": 244
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5003509521484375,
|
||
|
|
"epoch": 2.752808988764045,
|
||
|
|
"grad_norm": 4.675637549204775,
|
||
|
|
"learning_rate": 3.0606165162749212e-06,
|
||
|
|
"loss": 0.0304,
|
||
|
|
"mean_token_accuracy": 0.9869791674427688,
|
||
|
|
"num_tokens": 203654329.0,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5175704956054688,
|
||
|
|
"epoch": 2.764044943820225,
|
||
|
|
"grad_norm": 4.266714217449698,
|
||
|
|
"learning_rate": 3.045509282569031e-06,
|
||
|
|
"loss": 0.0355,
|
||
|
|
"mean_token_accuracy": 0.9908854172099382,
|
||
|
|
"num_tokens": 204452028.0,
|
||
|
|
"step": 246
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.510162353515625,
|
||
|
|
"epoch": 2.7752808988764044,
|
||
|
|
"grad_norm": 3.472266417067048,
|
||
|
|
"learning_rate": 3.0303811036597395e-06,
|
||
|
|
"loss": 0.0275,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 205254485.0,
|
||
|
|
"step": 247
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5191574096679688,
|
||
|
|
"epoch": 2.7865168539325844,
|
||
|
|
"grad_norm": 2.8786418244959275,
|
||
|
|
"learning_rate": 3.01523256040386e-06,
|
||
|
|
"loss": 0.0212,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 206055392.0,
|
||
|
|
"step": 248
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5145721435546875,
|
||
|
|
"epoch": 2.797752808988764,
|
||
|
|
"grad_norm": 4.496243851753138,
|
||
|
|
"learning_rate": 3.0000642344401115e-06,
|
||
|
|
"loss": 0.0372,
|
||
|
|
"mean_token_accuracy": 0.9856770841870457,
|
||
|
|
"num_tokens": 206869050.0,
|
||
|
|
"step": 249
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49834442138671875,
|
||
|
|
"epoch": 2.808988764044944,
|
||
|
|
"grad_norm": 5.002231345777496,
|
||
|
|
"learning_rate": 2.9848767081667823e-06,
|
||
|
|
"loss": 0.028,
|
||
|
|
"mean_token_accuracy": 0.9908854172099382,
|
||
|
|
"num_tokens": 207712295.0,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49251556396484375,
|
||
|
|
"epoch": 2.8202247191011236,
|
||
|
|
"grad_norm": 4.859586789233871,
|
||
|
|
"learning_rate": 2.9696705647193695e-06,
|
||
|
|
"loss": 0.0445,
|
||
|
|
"mean_token_accuracy": 0.9869791674427688,
|
||
|
|
"num_tokens": 208580496.0,
|
||
|
|
"step": 251
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5194778442382812,
|
||
|
|
"epoch": 2.831460674157303,
|
||
|
|
"grad_norm": 3.9520698327060177,
|
||
|
|
"learning_rate": 2.9544463879481914e-06,
|
||
|
|
"loss": 0.0315,
|
||
|
|
"mean_token_accuracy": 0.989583333954215,
|
||
|
|
"num_tokens": 209388304.0,
|
||
|
|
"step": 252
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5085906982421875,
|
||
|
|
"epoch": 2.842696629213483,
|
||
|
|
"grad_norm": 4.449886428662148,
|
||
|
|
"learning_rate": 2.9392047623959653e-06,
|
||
|
|
"loss": 0.0307,
|
||
|
|
"mean_token_accuracy": 0.9908854172099382,
|
||
|
|
"num_tokens": 210216106.0,
|
||
|
|
"step": 253
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4924468994140625,
|
||
|
|
"epoch": 2.853932584269663,
|
||
|
|
"grad_norm": 4.102595454995453,
|
||
|
|
"learning_rate": 2.923946273275369e-06,
|
||
|
|
"loss": 0.0377,
|
||
|
|
"mean_token_accuracy": 0.9882812506984919,
|
||
|
|
"num_tokens": 211096912.0,
|
||
|
|
"step": 254
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.501220703125,
|
||
|
|
"epoch": 2.865168539325843,
|
||
|
|
"grad_norm": 3.5065827045256714,
|
||
|
|
"learning_rate": 2.908671506446566e-06,
|
||
|
|
"loss": 0.0402,
|
||
|
|
"mean_token_accuracy": 0.9856770841870457,
|
||
|
|
"num_tokens": 211917717.0,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5018081665039062,
|
||
|
|
"epoch": 2.8764044943820224,
|
||
|
|
"grad_norm": 4.772724836206971,
|
||
|
|
"learning_rate": 2.8933810483947156e-06,
|
||
|
|
"loss": 0.0387,
|
||
|
|
"mean_token_accuracy": 0.9882812506984919,
|
||
|
|
"num_tokens": 212760942.0,
|
||
|
|
"step": 256
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5093460083007812,
|
||
|
|
"epoch": 2.8876404494382024,
|
||
|
|
"grad_norm": 5.280967737032615,
|
||
|
|
"learning_rate": 2.878075486207452e-06,
|
||
|
|
"loss": 0.042,
|
||
|
|
"mean_token_accuracy": 0.9882812506984919,
|
||
|
|
"num_tokens": 213577211.0,
|
||
|
|
"step": 257
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5251312255859375,
|
||
|
|
"epoch": 2.898876404494382,
|
||
|
|
"grad_norm": 3.0721132607558403,
|
||
|
|
"learning_rate": 2.8627554075523426e-06,
|
||
|
|
"loss": 0.0276,
|
||
|
|
"mean_token_accuracy": 0.9921875004656613,
|
||
|
|
"num_tokens": 214374237.0,
|
||
|
|
"step": 258
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.509552001953125,
|
||
|
|
"epoch": 2.9101123595505616,
|
||
|
|
"grad_norm": 3.418094859784156,
|
||
|
|
"learning_rate": 2.8474214006543255e-06,
|
||
|
|
"loss": 0.0304,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 215204658.0,
|
||
|
|
"step": 259
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49858856201171875,
|
||
|
|
"epoch": 2.9213483146067416,
|
||
|
|
"grad_norm": 3.4215791011808947,
|
||
|
|
"learning_rate": 2.832074054273121e-06,
|
||
|
|
"loss": 0.0337,
|
||
|
|
"mean_token_accuracy": 0.9882812506984919,
|
||
|
|
"num_tokens": 216059885.0,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5129241943359375,
|
||
|
|
"epoch": 2.932584269662921,
|
||
|
|
"grad_norm": 3.013036330316608,
|
||
|
|
"learning_rate": 2.8167139576806306e-06,
|
||
|
|
"loss": 0.028,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 216892678.0,
|
||
|
|
"step": 261
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49480438232421875,
|
||
|
|
"epoch": 2.943820224719101,
|
||
|
|
"grad_norm": 3.191221312139142,
|
||
|
|
"learning_rate": 2.8013417006383078e-06,
|
||
|
|
"loss": 0.0291,
|
||
|
|
"mean_token_accuracy": 0.9908854172099382,
|
||
|
|
"num_tokens": 217739914.0,
|
||
|
|
"step": 262
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4976348876953125,
|
||
|
|
"epoch": 2.955056179775281,
|
||
|
|
"grad_norm": 2.906401361021095,
|
||
|
|
"learning_rate": 2.7859578733745153e-06,
|
||
|
|
"loss": 0.0178,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 218586521.0,
|
||
|
|
"step": 263
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48983001708984375,
|
||
|
|
"epoch": 2.966292134831461,
|
||
|
|
"grad_norm": 4.7556975923651335,
|
||
|
|
"learning_rate": 2.7705630665618605e-06,
|
||
|
|
"loss": 0.0232,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 219419106.0,
|
||
|
|
"step": 264
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4918670654296875,
|
||
|
|
"epoch": 2.9775280898876404,
|
||
|
|
"grad_norm": 5.022257244997175,
|
||
|
|
"learning_rate": 2.755157871294521e-06,
|
||
|
|
"loss": 0.0283,
|
||
|
|
"mean_token_accuracy": 0.9908854172099382,
|
||
|
|
"num_tokens": 220252072.0,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.487274169921875,
|
||
|
|
"epoch": 2.98876404494382,
|
||
|
|
"grad_norm": 8.564888081434068,
|
||
|
|
"learning_rate": 2.7397428790655447e-06,
|
||
|
|
"loss": 0.0497,
|
||
|
|
"mean_token_accuracy": 0.9856770841870457,
|
||
|
|
"num_tokens": 221111708.0,
|
||
|
|
"step": 266
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5102157592773438,
|
||
|
|
"epoch": 3.0,
|
||
|
|
"grad_norm": 4.460036763086264,
|
||
|
|
"learning_rate": 2.7243186817441403e-06,
|
||
|
|
"loss": 0.0385,
|
||
|
|
"mean_token_accuracy": 0.9921875004656613,
|
||
|
|
"num_tokens": 221886114.0,
|
||
|
|
"step": 267
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48343658447265625,
|
||
|
|
"epoch": 3.0112359550561796,
|
||
|
|
"grad_norm": 9.20662230947583,
|
||
|
|
"learning_rate": 2.708885871552954e-06,
|
||
|
|
"loss": 0.021,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 222747070.0,
|
||
|
|
"step": 268
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48952484130859375,
|
||
|
|
"epoch": 3.0224719101123596,
|
||
|
|
"grad_norm": 5.187135606996883,
|
||
|
|
"learning_rate": 2.693445041045326e-06,
|
||
|
|
"loss": 0.0245,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 223568902.0,
|
||
|
|
"step": 269
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47576141357421875,
|
||
|
|
"epoch": 3.033707865168539,
|
||
|
|
"grad_norm": 3.251501070378064,
|
||
|
|
"learning_rate": 2.6779967830825454e-06,
|
||
|
|
"loss": 0.0214,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 224413441.0,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48839569091796875,
|
||
|
|
"epoch": 3.044943820224719,
|
||
|
|
"grad_norm": 3.429178666958642,
|
||
|
|
"learning_rate": 2.6625416908110825e-06,
|
||
|
|
"loss": 0.0204,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 225228850.0,
|
||
|
|
"step": 271
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4837799072265625,
|
||
|
|
"epoch": 3.056179775280899,
|
||
|
|
"grad_norm": 5.639294809675952,
|
||
|
|
"learning_rate": 2.647080357639813e-06,
|
||
|
|
"loss": 0.0403,
|
||
|
|
"mean_token_accuracy": 0.9882812506984919,
|
||
|
|
"num_tokens": 226050198.0,
|
||
|
|
"step": 272
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48087310791015625,
|
||
|
|
"epoch": 3.067415730337079,
|
||
|
|
"grad_norm": 4.675070193494617,
|
||
|
|
"learning_rate": 2.6316133772172403e-06,
|
||
|
|
"loss": 0.0288,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 226889816.0,
|
||
|
|
"step": 273
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49853515625,
|
||
|
|
"epoch": 3.0786516853932584,
|
||
|
|
"grad_norm": 4.8721179152633445,
|
||
|
|
"learning_rate": 2.616141343408696e-06,
|
||
|
|
"loss": 0.033,
|
||
|
|
"mean_token_accuracy": 0.989583333954215,
|
||
|
|
"num_tokens": 227673295.0,
|
||
|
|
"step": 274
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49887847900390625,
|
||
|
|
"epoch": 3.0898876404494384,
|
||
|
|
"grad_norm": 3.780990923057449,
|
||
|
|
"learning_rate": 2.6006648502735384e-06,
|
||
|
|
"loss": 0.0237,
|
||
|
|
"mean_token_accuracy": 0.989583333954215,
|
||
|
|
"num_tokens": 228493992.0,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48746490478515625,
|
||
|
|
"epoch": 3.101123595505618,
|
||
|
|
"grad_norm": 3.463349045434377,
|
||
|
|
"learning_rate": 2.5851844920423473e-06,
|
||
|
|
"loss": 0.0172,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 229329744.0,
|
||
|
|
"step": 276
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5026702880859375,
|
||
|
|
"epoch": 3.1123595505617976,
|
||
|
|
"grad_norm": 3.277726971307229,
|
||
|
|
"learning_rate": 2.569700863094104e-06,
|
||
|
|
"loss": 0.0256,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 230150690.0,
|
||
|
|
"step": 277
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.489990234375,
|
||
|
|
"epoch": 3.1235955056179776,
|
||
|
|
"grad_norm": 6.501478175738681,
|
||
|
|
"learning_rate": 2.554214557933372e-06,
|
||
|
|
"loss": 0.0306,
|
||
|
|
"mean_token_accuracy": 0.9882812506984919,
|
||
|
|
"num_tokens": 230974007.0,
|
||
|
|
"step": 278
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5017242431640625,
|
||
|
|
"epoch": 3.134831460674157,
|
||
|
|
"grad_norm": 6.205617015245652,
|
||
|
|
"learning_rate": 2.5387261711674695e-06,
|
||
|
|
"loss": 0.0327,
|
||
|
|
"mean_token_accuracy": 0.9856770841870457,
|
||
|
|
"num_tokens": 231788328.0,
|
||
|
|
"step": 279
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49090576171875,
|
||
|
|
"epoch": 3.146067415730337,
|
||
|
|
"grad_norm": 3.537442099096884,
|
||
|
|
"learning_rate": 2.5232362974836394e-06,
|
||
|
|
"loss": 0.0204,
|
||
|
|
"mean_token_accuracy": 0.9921875004656613,
|
||
|
|
"num_tokens": 232603310.0,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48221588134765625,
|
||
|
|
"epoch": 3.157303370786517,
|
||
|
|
"grad_norm": 5.701029872012899,
|
||
|
|
"learning_rate": 2.507745531626215e-06,
|
||
|
|
"loss": 0.0432,
|
||
|
|
"mean_token_accuracy": 0.9908854172099382,
|
||
|
|
"num_tokens": 233460022.0,
|
||
|
|
"step": 281
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4851226806640625,
|
||
|
|
"epoch": 3.168539325842697,
|
||
|
|
"grad_norm": 6.175135064457726,
|
||
|
|
"learning_rate": 2.4922544683737857e-06,
|
||
|
|
"loss": 0.027,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 234261620.0,
|
||
|
|
"step": 282
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4852294921875,
|
||
|
|
"epoch": 3.1797752808988764,
|
||
|
|
"grad_norm": 3.6828139032465312,
|
||
|
|
"learning_rate": 2.4767637025163614e-06,
|
||
|
|
"loss": 0.0199,
|
||
|
|
"mean_token_accuracy": 0.9908854172099382,
|
||
|
|
"num_tokens": 235088969.0,
|
||
|
|
"step": 283
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48902130126953125,
|
||
|
|
"epoch": 3.191011235955056,
|
||
|
|
"grad_norm": 3.512834879515185,
|
||
|
|
"learning_rate": 2.461273828832531e-06,
|
||
|
|
"loss": 0.0258,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 235913452.0,
|
||
|
|
"step": 284
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4867401123046875,
|
||
|
|
"epoch": 3.202247191011236,
|
||
|
|
"grad_norm": 4.189500977619063,
|
||
|
|
"learning_rate": 2.445785442066628e-06,
|
||
|
|
"loss": 0.0245,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 236724207.0,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4837646484375,
|
||
|
|
"epoch": 3.2134831460674156,
|
||
|
|
"grad_norm": 4.540256503622475,
|
||
|
|
"learning_rate": 2.4302991369058963e-06,
|
||
|
|
"loss": 0.0245,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 237532582.0,
|
||
|
|
"step": 286
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48180389404296875,
|
||
|
|
"epoch": 3.2247191011235956,
|
||
|
|
"grad_norm": 3.461981571769003,
|
||
|
|
"learning_rate": 2.414815507957653e-06,
|
||
|
|
"loss": 0.0316,
|
||
|
|
"mean_token_accuracy": 0.9921875004656613,
|
||
|
|
"num_tokens": 238364201.0,
|
||
|
|
"step": 287
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4748687744140625,
|
||
|
|
"epoch": 3.235955056179775,
|
||
|
|
"grad_norm": 3.5186529222593323,
|
||
|
|
"learning_rate": 2.399335149726463e-06,
|
||
|
|
"loss": 0.0254,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 239227490.0,
|
||
|
|
"step": 288
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48622894287109375,
|
||
|
|
"epoch": 3.247191011235955,
|
||
|
|
"grad_norm": 2.4622140466695583,
|
||
|
|
"learning_rate": 2.3838586565913053e-06,
|
||
|
|
"loss": 0.0219,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 240071974.0,
|
||
|
|
"step": 289
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49408721923828125,
|
||
|
|
"epoch": 3.258426966292135,
|
||
|
|
"grad_norm": 2.7289333058731686,
|
||
|
|
"learning_rate": 2.3683866227827605e-06,
|
||
|
|
"loss": 0.0139,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 240885558.0,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4902801513671875,
|
||
|
|
"epoch": 3.2696629213483144,
|
||
|
|
"grad_norm": 3.099007161228062,
|
||
|
|
"learning_rate": 2.352919642360188e-06,
|
||
|
|
"loss": 0.0209,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 241726366.0,
|
||
|
|
"step": 291
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48973846435546875,
|
||
|
|
"epoch": 3.2808988764044944,
|
||
|
|
"grad_norm": 3.073212878993699,
|
||
|
|
"learning_rate": 2.3374583091889188e-06,
|
||
|
|
"loss": 0.0205,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 242540731.0,
|
||
|
|
"step": 292
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49147796630859375,
|
||
|
|
"epoch": 3.292134831460674,
|
||
|
|
"grad_norm": 3.4111012195131845,
|
||
|
|
"learning_rate": 2.322003216917455e-06,
|
||
|
|
"loss": 0.025,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 243359456.0,
|
||
|
|
"step": 293
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48925018310546875,
|
||
|
|
"epoch": 3.303370786516854,
|
||
|
|
"grad_norm": 3.398747668599176,
|
||
|
|
"learning_rate": 2.3065549589546747e-06,
|
||
|
|
"loss": 0.0174,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 244207041.0,
|
||
|
|
"step": 294
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48650360107421875,
|
||
|
|
"epoch": 3.3146067415730336,
|
||
|
|
"grad_norm": 3.7147965496555604,
|
||
|
|
"learning_rate": 2.2911141284470466e-06,
|
||
|
|
"loss": 0.0226,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 245038695.0,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48868560791015625,
|
||
|
|
"epoch": 3.3258426966292136,
|
||
|
|
"grad_norm": 3.6350776720835167,
|
||
|
|
"learning_rate": 2.27568131825586e-06,
|
||
|
|
"loss": 0.0241,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 245862029.0,
|
||
|
|
"step": 296
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5014572143554688,
|
||
|
|
"epoch": 3.337078651685393,
|
||
|
|
"grad_norm": 1.9496739139045483,
|
||
|
|
"learning_rate": 2.260257120934456e-06,
|
||
|
|
"loss": 0.008,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 246649993.0,
|
||
|
|
"step": 297
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.486724853515625,
|
||
|
|
"epoch": 3.348314606741573,
|
||
|
|
"grad_norm": 3.179926005223232,
|
||
|
|
"learning_rate": 2.2448421287054794e-06,
|
||
|
|
"loss": 0.014,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 247496847.0,
|
||
|
|
"step": 298
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49832916259765625,
|
||
|
|
"epoch": 3.359550561797753,
|
||
|
|
"grad_norm": 2.4689200808317735,
|
||
|
|
"learning_rate": 2.229436933438141e-06,
|
||
|
|
"loss": 0.0206,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 248289120.0,
|
||
|
|
"step": 299
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.486846923828125,
|
||
|
|
"epoch": 3.370786516853933,
|
||
|
|
"grad_norm": 2.8695936224449703,
|
||
|
|
"learning_rate": 2.214042126625486e-06,
|
||
|
|
"loss": 0.0152,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 249116159.0,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4945526123046875,
|
||
|
|
"epoch": 3.3820224719101124,
|
||
|
|
"grad_norm": 2.8107533462269965,
|
||
|
|
"learning_rate": 2.1986582993616926e-06,
|
||
|
|
"loss": 0.0239,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 249937868.0,
|
||
|
|
"step": 301
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47344970703125,
|
||
|
|
"epoch": 3.393258426966292,
|
||
|
|
"grad_norm": 3.2328511428876383,
|
||
|
|
"learning_rate": 2.1832860423193703e-06,
|
||
|
|
"loss": 0.0137,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 250814526.0,
|
||
|
|
"step": 302
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4880523681640625,
|
||
|
|
"epoch": 3.404494382022472,
|
||
|
|
"grad_norm": 4.262015191629379,
|
||
|
|
"learning_rate": 2.1679259457268796e-06,
|
||
|
|
"loss": 0.0106,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 251627640.0,
|
||
|
|
"step": 303
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48815155029296875,
|
||
|
|
"epoch": 3.4157303370786516,
|
||
|
|
"grad_norm": 2.8300412513355524,
|
||
|
|
"learning_rate": 2.1525785993456753e-06,
|
||
|
|
"loss": 0.0142,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 252464366.0,
|
||
|
|
"step": 304
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5023727416992188,
|
||
|
|
"epoch": 3.4269662921348316,
|
||
|
|
"grad_norm": 4.510851185477219,
|
||
|
|
"learning_rate": 2.1372445924476578e-06,
|
||
|
|
"loss": 0.0172,
|
||
|
|
"mean_token_accuracy": 0.9921875004656613,
|
||
|
|
"num_tokens": 253246625.0,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47849273681640625,
|
||
|
|
"epoch": 3.438202247191011,
|
||
|
|
"grad_norm": 8.834249485370332,
|
||
|
|
"learning_rate": 2.1219245137925482e-06,
|
||
|
|
"loss": 0.0192,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 254096661.0,
|
||
|
|
"step": 306
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4962005615234375,
|
||
|
|
"epoch": 3.449438202247191,
|
||
|
|
"grad_norm": 3.6441410619934462,
|
||
|
|
"learning_rate": 2.1066189516052848e-06,
|
||
|
|
"loss": 0.0286,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 254907096.0,
|
||
|
|
"step": 307
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.477813720703125,
|
||
|
|
"epoch": 3.460674157303371,
|
||
|
|
"grad_norm": 6.015307012511412,
|
||
|
|
"learning_rate": 2.0913284935534345e-06,
|
||
|
|
"loss": 0.014,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 255741853.0,
|
||
|
|
"step": 308
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48021697998046875,
|
||
|
|
"epoch": 3.4719101123595504,
|
||
|
|
"grad_norm": 6.152174736765347,
|
||
|
|
"learning_rate": 2.0760537267246316e-06,
|
||
|
|
"loss": 0.0229,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 256570850.0,
|
||
|
|
"step": 309
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48302459716796875,
|
||
|
|
"epoch": 3.4831460674157304,
|
||
|
|
"grad_norm": 2.359802121670126,
|
||
|
|
"learning_rate": 2.0607952376040355e-06,
|
||
|
|
"loss": 0.0095,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 257402947.0,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47766876220703125,
|
||
|
|
"epoch": 3.49438202247191,
|
||
|
|
"grad_norm": 3.3257588322733906,
|
||
|
|
"learning_rate": 2.0455536120518094e-06,
|
||
|
|
"loss": 0.0104,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 258264842.0,
|
||
|
|
"step": 311
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4766845703125,
|
||
|
|
"epoch": 3.50561797752809,
|
||
|
|
"grad_norm": 1.603503598370096,
|
||
|
|
"learning_rate": 2.0303294352806313e-06,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 259122986.0,
|
||
|
|
"step": 312
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47867584228515625,
|
||
|
|
"epoch": 3.5168539325842696,
|
||
|
|
"grad_norm": 4.431892256593624,
|
||
|
|
"learning_rate": 2.0151232918332186e-06,
|
||
|
|
"loss": 0.0131,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 259973474.0,
|
||
|
|
"step": 313
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48300933837890625,
|
||
|
|
"epoch": 3.5280898876404496,
|
||
|
|
"grad_norm": 4.326635672396205,
|
||
|
|
"learning_rate": 1.9999357655598894e-06,
|
||
|
|
"loss": 0.0242,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 260798300.0,
|
||
|
|
"step": 314
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48386383056640625,
|
||
|
|
"epoch": 3.539325842696629,
|
||
|
|
"grad_norm": 4.607397469663412,
|
||
|
|
"learning_rate": 1.9847674395961407e-06,
|
||
|
|
"loss": 0.0193,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 261627223.0,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4709930419921875,
|
||
|
|
"epoch": 3.550561797752809,
|
||
|
|
"grad_norm": 2.6921209318158956,
|
||
|
|
"learning_rate": 1.9696188963402613e-06,
|
||
|
|
"loss": 0.0131,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 262492309.0,
|
||
|
|
"step": 316
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49495697021484375,
|
||
|
|
"epoch": 3.561797752808989,
|
||
|
|
"grad_norm": 4.038453476304001,
|
||
|
|
"learning_rate": 1.9544907174309693e-06,
|
||
|
|
"loss": 0.0124,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 263296839.0,
|
||
|
|
"step": 317
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47100830078125,
|
||
|
|
"epoch": 3.5730337078651684,
|
||
|
|
"grad_norm": 3.907050390180474,
|
||
|
|
"learning_rate": 1.939383483725079e-06,
|
||
|
|
"loss": 0.011,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 264166709.0,
|
||
|
|
"step": 318
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48455047607421875,
|
||
|
|
"epoch": 3.5842696629213484,
|
||
|
|
"grad_norm": 4.629582756662647,
|
||
|
|
"learning_rate": 1.9242977752752006e-06,
|
||
|
|
"loss": 0.0209,
|
||
|
|
"mean_token_accuracy": 0.9934895837213844,
|
||
|
|
"num_tokens": 265009474.0,
|
||
|
|
"step": 319
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.492584228515625,
|
||
|
|
"epoch": 3.595505617977528,
|
||
|
|
"grad_norm": 3.206501722582584,
|
||
|
|
"learning_rate": 1.909234171307466e-06,
|
||
|
|
"loss": 0.0106,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 265828804.0,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48418426513671875,
|
||
|
|
"epoch": 3.606741573033708,
|
||
|
|
"grad_norm": 2.9833193666984594,
|
||
|
|
"learning_rate": 1.8941932501992915e-06,
|
||
|
|
"loss": 0.0133,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 266691166.0,
|
||
|
|
"step": 321
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48262786865234375,
|
||
|
|
"epoch": 3.6179775280898876,
|
||
|
|
"grad_norm": 2.9165593805140437,
|
||
|
|
"learning_rate": 1.879175589457168e-06,
|
||
|
|
"loss": 0.0154,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 267543284.0,
|
||
|
|
"step": 322
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4853363037109375,
|
||
|
|
"epoch": 3.629213483146067,
|
||
|
|
"grad_norm": 3.4411795063924315,
|
||
|
|
"learning_rate": 1.8641817656944894e-06,
|
||
|
|
"loss": 0.009,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 268391851.0,
|
||
|
|
"step": 323
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5037002563476562,
|
||
|
|
"epoch": 3.640449438202247,
|
||
|
|
"grad_norm": 7.219711770249014,
|
||
|
|
"learning_rate": 1.8492123546094132e-06,
|
||
|
|
"loss": 0.0161,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 269181570.0,
|
||
|
|
"step": 324
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48888397216796875,
|
||
|
|
"epoch": 3.6516853932584272,
|
||
|
|
"grad_norm": 2.760692104580548,
|
||
|
|
"learning_rate": 1.8342679309627545e-06,
|
||
|
|
"loss": 0.0196,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 269996062.0,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48822021484375,
|
||
|
|
"epoch": 3.662921348314607,
|
||
|
|
"grad_norm": 1.9046282601388171,
|
||
|
|
"learning_rate": 1.8193490685559179e-06,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 270800147.0,
|
||
|
|
"step": 326
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49137115478515625,
|
||
|
|
"epoch": 3.6741573033707864,
|
||
|
|
"grad_norm": 3.419975494500157,
|
||
|
|
"learning_rate": 1.8044563402088686e-06,
|
||
|
|
"loss": 0.0194,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 271644718.0,
|
||
|
|
"step": 327
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.488311767578125,
|
||
|
|
"epoch": 3.6853932584269664,
|
||
|
|
"grad_norm": 2.7970940768512844,
|
||
|
|
"learning_rate": 1.7895903177381351e-06,
|
||
|
|
"loss": 0.0063,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 272494584.0,
|
||
|
|
"step": 328
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46905517578125,
|
||
|
|
"epoch": 3.696629213483146,
|
||
|
|
"grad_norm": 3.2974393189689017,
|
||
|
|
"learning_rate": 1.7747515719348551e-06,
|
||
|
|
"loss": 0.03,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 273379667.0,
|
||
|
|
"step": 329
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5005645751953125,
|
||
|
|
"epoch": 3.7078651685393256,
|
||
|
|
"grad_norm": 3.6205966064162824,
|
||
|
|
"learning_rate": 1.759940672542862e-06,
|
||
|
|
"loss": 0.016,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 274191441.0,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.479522705078125,
|
||
|
|
"epoch": 3.7191011235955056,
|
||
|
|
"grad_norm": 3.725884043346435,
|
||
|
|
"learning_rate": 1.7451581882368052e-06,
|
||
|
|
"loss": 0.0128,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 275059931.0,
|
||
|
|
"step": 331
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49169158935546875,
|
||
|
|
"epoch": 3.7303370786516856,
|
||
|
|
"grad_norm": 2.6597945294485483,
|
||
|
|
"learning_rate": 1.7304046866003183e-06,
|
||
|
|
"loss": 0.0067,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 275888389.0,
|
||
|
|
"step": 332
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49629974365234375,
|
||
|
|
"epoch": 3.741573033707865,
|
||
|
|
"grad_norm": 2.433832686288709,
|
||
|
|
"learning_rate": 1.7156807341042242e-06,
|
||
|
|
"loss": 0.006,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 276689357.0,
|
||
|
|
"step": 333
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.494659423828125,
|
||
|
|
"epoch": 3.752808988764045,
|
||
|
|
"grad_norm": 3.7349731267491526,
|
||
|
|
"learning_rate": 1.700986896084787e-06,
|
||
|
|
"loss": 0.0164,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 277503649.0,
|
||
|
|
"step": 334
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4885711669921875,
|
||
|
|
"epoch": 3.764044943820225,
|
||
|
|
"grad_norm": 3.3812622930224125,
|
||
|
|
"learning_rate": 1.686323736722006e-06,
|
||
|
|
"loss": 0.0149,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 278326436.0,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5007553100585938,
|
||
|
|
"epoch": 3.7752808988764044,
|
||
|
|
"grad_norm": 1.6952250172415233,
|
||
|
|
"learning_rate": 1.671691819017951e-06,
|
||
|
|
"loss": 0.0071,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 279115112.0,
|
||
|
|
"step": 336
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5091018676757812,
|
||
|
|
"epoch": 3.7865168539325844,
|
||
|
|
"grad_norm": 2.403559137434532,
|
||
|
|
"learning_rate": 1.6570917047751465e-06,
|
||
|
|
"loss": 0.0153,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 279883193.0,
|
||
|
|
"step": 337
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4808349609375,
|
||
|
|
"epoch": 3.797752808988764,
|
||
|
|
"grad_norm": 5.835484541379819,
|
||
|
|
"learning_rate": 1.642523954575003e-06,
|
||
|
|
"loss": 0.0224,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 280724938.0,
|
||
|
|
"step": 338
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48146820068359375,
|
||
|
|
"epoch": 3.808988764044944,
|
||
|
|
"grad_norm": 2.8914536871971994,
|
||
|
|
"learning_rate": 1.6279891277562896e-06,
|
||
|
|
"loss": 0.0105,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 281570918.0,
|
||
|
|
"step": 339
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49124908447265625,
|
||
|
|
"epoch": 3.8202247191011236,
|
||
|
|
"grad_norm": 2.7829856607409718,
|
||
|
|
"learning_rate": 1.613487782393661e-06,
|
||
|
|
"loss": 0.0149,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 282400350.0,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46373748779296875,
|
||
|
|
"epoch": 3.831460674157303,
|
||
|
|
"grad_norm": 2.70570741985553,
|
||
|
|
"learning_rate": 1.5990204752762273e-06,
|
||
|
|
"loss": 0.0102,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 283283614.0,
|
||
|
|
"step": 341
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48815155029296875,
|
||
|
|
"epoch": 3.842696629213483,
|
||
|
|
"grad_norm": 3.3600186528383973,
|
||
|
|
"learning_rate": 1.5845877618861769e-06,
|
||
|
|
"loss": 0.007,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 284125946.0,
|
||
|
|
"step": 342
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48841094970703125,
|
||
|
|
"epoch": 3.853932584269663,
|
||
|
|
"grad_norm": 2.7396108465149895,
|
||
|
|
"learning_rate": 1.5701901963774504e-06,
|
||
|
|
"loss": 0.0041,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 284927663.0,
|
||
|
|
"step": 343
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.502349853515625,
|
||
|
|
"epoch": 3.865168539325843,
|
||
|
|
"grad_norm": 2.5440235934327706,
|
||
|
|
"learning_rate": 1.555828331554457e-06,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 285727563.0,
|
||
|
|
"step": 344
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47910308837890625,
|
||
|
|
"epoch": 3.8764044943820224,
|
||
|
|
"grad_norm": 2.1966498948802724,
|
||
|
|
"learning_rate": 1.5415027188508574e-06,
|
||
|
|
"loss": 0.01,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 286574749.0,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49324798583984375,
|
||
|
|
"epoch": 3.8876404494382024,
|
||
|
|
"grad_norm": 2.8523138768326963,
|
||
|
|
"learning_rate": 1.5272139083083865e-06,
|
||
|
|
"loss": 0.0042,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 287366871.0,
|
||
|
|
"step": 346
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48845672607421875,
|
||
|
|
"epoch": 3.898876404494382,
|
||
|
|
"grad_norm": 2.874883145772125,
|
||
|
|
"learning_rate": 1.5129624485557331e-06,
|
||
|
|
"loss": 0.0126,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 288166095.0,
|
||
|
|
"step": 347
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.471282958984375,
|
||
|
|
"epoch": 3.9101123595505616,
|
||
|
|
"grad_norm": 2.916158906263163,
|
||
|
|
"learning_rate": 1.4987488867874798e-06,
|
||
|
|
"loss": 0.007,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 289017040.0,
|
||
|
|
"step": 348
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.467559814453125,
|
||
|
|
"epoch": 3.9213483146067416,
|
||
|
|
"grad_norm": 8.16739036316256,
|
||
|
|
"learning_rate": 1.4845737687430875e-06,
|
||
|
|
"loss": 0.0189,
|
||
|
|
"mean_token_accuracy": 0.9921875004656613,
|
||
|
|
"num_tokens": 289874408.0,
|
||
|
|
"step": 349
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4731292724609375,
|
||
|
|
"epoch": 3.932584269662921,
|
||
|
|
"grad_norm": 5.468947396548159,
|
||
|
|
"learning_rate": 1.4704376386859447e-06,
|
||
|
|
"loss": 0.0146,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 290697280.0,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4630584716796875,
|
||
|
|
"epoch": 3.943820224719101,
|
||
|
|
"grad_norm": 3.8244914261266194,
|
||
|
|
"learning_rate": 1.4563410393824701e-06,
|
||
|
|
"loss": 0.0131,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 291557021.0,
|
||
|
|
"step": 351
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4810333251953125,
|
||
|
|
"epoch": 3.955056179775281,
|
||
|
|
"grad_norm": 4.933507820150018,
|
||
|
|
"learning_rate": 1.4422845120812718e-06,
|
||
|
|
"loss": 0.0068,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 292378445.0,
|
||
|
|
"step": 352
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46092987060546875,
|
||
|
|
"epoch": 3.966292134831461,
|
||
|
|
"grad_norm": 4.751144080267217,
|
||
|
|
"learning_rate": 1.4282685964923643e-06,
|
||
|
|
"loss": 0.0249,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 293233946.0,
|
||
|
|
"step": 353
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.478057861328125,
|
||
|
|
"epoch": 3.9775280898876404,
|
||
|
|
"grad_norm": 3.6775002976703437,
|
||
|
|
"learning_rate": 1.4142938307664505e-06,
|
||
|
|
"loss": 0.0102,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 294052224.0,
|
||
|
|
"step": 354
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46146392822265625,
|
||
|
|
"epoch": 3.98876404494382,
|
||
|
|
"grad_norm": 2.7844187721067266,
|
||
|
|
"learning_rate": 1.400360751474253e-06,
|
||
|
|
"loss": 0.0116,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 294920656.0,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47667694091796875,
|
||
|
|
"epoch": 4.0,
|
||
|
|
"grad_norm": 0.627168832321652,
|
||
|
|
"learning_rate": 1.3864698935859153e-06,
|
||
|
|
"loss": 0.0024,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 295750761.0,
|
||
|
|
"step": 356
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47129058837890625,
|
||
|
|
"epoch": 4.01123595505618,
|
||
|
|
"grad_norm": 3.237981990670737,
|
||
|
|
"learning_rate": 1.3726217904504636e-06,
|
||
|
|
"loss": 0.0105,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 296580033.0,
|
||
|
|
"step": 357
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47356414794921875,
|
||
|
|
"epoch": 4.022471910112359,
|
||
|
|
"grad_norm": 0.6977481548583607,
|
||
|
|
"learning_rate": 1.3588169737753258e-06,
|
||
|
|
"loss": 0.0027,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 297422608.0,
|
||
|
|
"step": 358
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46782684326171875,
|
||
|
|
"epoch": 4.033707865168539,
|
||
|
|
"grad_norm": 3.064591085863403,
|
||
|
|
"learning_rate": 1.3450559736059126e-06,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 298269567.0,
|
||
|
|
"step": 359
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47429656982421875,
|
||
|
|
"epoch": 4.044943820224719,
|
||
|
|
"grad_norm": 2.069688101699077,
|
||
|
|
"learning_rate": 1.3313393183052747e-06,
|
||
|
|
"loss": 0.0057,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 299104214.0,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4663238525390625,
|
||
|
|
"epoch": 4.056179775280899,
|
||
|
|
"grad_norm": 3.907869758579254,
|
||
|
|
"learning_rate": 1.3176675345338085e-06,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 299969102.0,
|
||
|
|
"step": 361
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4740753173828125,
|
||
|
|
"epoch": 4.067415730337078,
|
||
|
|
"grad_norm": 0.6875226849852165,
|
||
|
|
"learning_rate": 1.304041147229037e-06,
|
||
|
|
"loss": 0.0023,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 300808720.0,
|
||
|
|
"step": 362
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4696197509765625,
|
||
|
|
"epoch": 4.078651685393258,
|
||
|
|
"grad_norm": 1.2333744557990824,
|
||
|
|
"learning_rate": 1.2904606795854562e-06,
|
||
|
|
"loss": 0.0026,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 301660462.0,
|
||
|
|
"step": 363
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4776153564453125,
|
||
|
|
"epoch": 4.089887640449438,
|
||
|
|
"grad_norm": 3.4213409887705275,
|
||
|
|
"learning_rate": 1.276926653034444e-06,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 302476904.0,
|
||
|
|
"step": 364
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.486846923828125,
|
||
|
|
"epoch": 4.101123595505618,
|
||
|
|
"grad_norm": 1.6568151958809272,
|
||
|
|
"learning_rate": 1.2634395872242433e-06,
|
||
|
|
"loss": 0.003,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 303281350.0,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4847259521484375,
|
||
|
|
"epoch": 4.112359550561798,
|
||
|
|
"grad_norm": 1.43202146868413,
|
||
|
|
"learning_rate": 1.2500000000000007e-06,
|
||
|
|
"loss": 0.0089,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 304106181.0,
|
||
|
|
"step": 366
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4637298583984375,
|
||
|
|
"epoch": 4.123595505617978,
|
||
|
|
"grad_norm": 4.19515608091639,
|
||
|
|
"learning_rate": 1.2366084073838963e-06,
|
||
|
|
"loss": 0.0122,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 304980353.0,
|
||
|
|
"step": 367
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46537017822265625,
|
||
|
|
"epoch": 4.134831460674158,
|
||
|
|
"grad_norm": 10.286022538713485,
|
||
|
|
"learning_rate": 1.223265323555323e-06,
|
||
|
|
"loss": 0.0153,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 305825415.0,
|
||
|
|
"step": 368
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.486083984375,
|
||
|
|
"epoch": 4.146067415730337,
|
||
|
|
"grad_norm": 4.05182835847461,
|
||
|
|
"learning_rate": 1.2099712608311426e-06,
|
||
|
|
"loss": 0.0068,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 306605981.0,
|
||
|
|
"step": 369
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47647857666015625,
|
||
|
|
"epoch": 4.157303370786517,
|
||
|
|
"grad_norm": 7.930574622117007,
|
||
|
|
"learning_rate": 1.1967267296460208e-06,
|
||
|
|
"loss": 0.0138,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 307430230.0,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.471038818359375,
|
||
|
|
"epoch": 4.168539325842697,
|
||
|
|
"grad_norm": 4.274711351283673,
|
||
|
|
"learning_rate": 1.183532238532826e-06,
|
||
|
|
"loss": 0.0044,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 308274797.0,
|
||
|
|
"step": 371
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45969390869140625,
|
||
|
|
"epoch": 4.179775280898877,
|
||
|
|
"grad_norm": 4.565232175830148,
|
||
|
|
"learning_rate": 1.1703882941031012e-06,
|
||
|
|
"loss": 0.0055,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 309133614.0,
|
||
|
|
"step": 372
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47374725341796875,
|
||
|
|
"epoch": 4.191011235955056,
|
||
|
|
"grad_norm": 3.485577354310045,
|
||
|
|
"learning_rate": 1.157295401027616e-06,
|
||
|
|
"loss": 0.0064,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 309964446.0,
|
||
|
|
"step": 373
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4734649658203125,
|
||
|
|
"epoch": 4.202247191011236,
|
||
|
|
"grad_norm": 4.503566249088593,
|
||
|
|
"learning_rate": 1.1442540620169906e-06,
|
||
|
|
"loss": 0.008,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 310780776.0,
|
||
|
|
"step": 374
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47723388671875,
|
||
|
|
"epoch": 4.213483146067416,
|
||
|
|
"grad_norm": 3.683632589736363,
|
||
|
|
"learning_rate": 1.131264777802387e-06,
|
||
|
|
"loss": 0.0101,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 311606897.0,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46724700927734375,
|
||
|
|
"epoch": 4.224719101123595,
|
||
|
|
"grad_norm": 7.188150042856905,
|
||
|
|
"learning_rate": 1.1183280471162916e-06,
|
||
|
|
"loss": 0.0165,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 312474482.0,
|
||
|
|
"step": 376
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4607391357421875,
|
||
|
|
"epoch": 4.235955056179775,
|
||
|
|
"grad_norm": 1.8555417077203353,
|
||
|
|
"learning_rate": 1.1054443666733586e-06,
|
||
|
|
"loss": 0.0036,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 313320360.0,
|
||
|
|
"step": 377
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4653167724609375,
|
||
|
|
"epoch": 4.247191011235955,
|
||
|
|
"grad_norm": 3.089259401627071,
|
||
|
|
"learning_rate": 1.0926142311513453e-06,
|
||
|
|
"loss": 0.0061,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 314179757.0,
|
||
|
|
"step": 378
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48670196533203125,
|
||
|
|
"epoch": 4.258426966292135,
|
||
|
|
"grad_norm": 0.8449253758558445,
|
||
|
|
"learning_rate": 1.079838133172111e-06,
|
||
|
|
"loss": 0.0029,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 314983724.0,
|
||
|
|
"step": 379
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46709442138671875,
|
||
|
|
"epoch": 4.269662921348314,
|
||
|
|
"grad_norm": 2.5990482901945238,
|
||
|
|
"learning_rate": 1.0671165632827097e-06,
|
||
|
|
"loss": 0.0051,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 315817708.0,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45947265625,
|
||
|
|
"epoch": 4.280898876404494,
|
||
|
|
"grad_norm": 4.100355461394568,
|
||
|
|
"learning_rate": 1.0544500099365515e-06,
|
||
|
|
"loss": 0.0055,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 316672629.0,
|
||
|
|
"step": 381
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4636688232421875,
|
||
|
|
"epoch": 4.292134831460674,
|
||
|
|
"grad_norm": 1.3481744831139768,
|
||
|
|
"learning_rate": 1.0418389594746462e-06,
|
||
|
|
"loss": 0.0027,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 317519153.0,
|
||
|
|
"step": 382
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46529388427734375,
|
||
|
|
"epoch": 4.303370786516854,
|
||
|
|
"grad_norm": 0.45901882778115105,
|
||
|
|
"learning_rate": 1.0292838961069348e-06,
|
||
|
|
"loss": 0.0019,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 318332221.0,
|
||
|
|
"step": 383
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4586639404296875,
|
||
|
|
"epoch": 4.314606741573034,
|
||
|
|
"grad_norm": 2.721713545239488,
|
||
|
|
"learning_rate": 1.0167853018936955e-06,
|
||
|
|
"loss": 0.0083,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 319192190.0,
|
||
|
|
"step": 384
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4707489013671875,
|
||
|
|
"epoch": 4.325842696629214,
|
||
|
|
"grad_norm": 0.43133743566324173,
|
||
|
|
"learning_rate": 1.0043436567270313e-06,
|
||
|
|
"loss": 0.0017,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 320016084.0,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4701690673828125,
|
||
|
|
"epoch": 4.337078651685394,
|
||
|
|
"grad_norm": 1.8268064344668493,
|
||
|
|
"learning_rate": 9.919594383124512e-07,
|
||
|
|
"loss": 0.003,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 320849099.0,
|
||
|
|
"step": 386
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4613800048828125,
|
||
|
|
"epoch": 4.348314606741573,
|
||
|
|
"grad_norm": 6.220960050767719,
|
||
|
|
"learning_rate": 9.796331221505235e-07,
|
||
|
|
"loss": 0.0249,
|
||
|
|
"mean_token_accuracy": 0.9947916669771075,
|
||
|
|
"num_tokens": 321673985.0,
|
||
|
|
"step": 387
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4572601318359375,
|
||
|
|
"epoch": 4.359550561797753,
|
||
|
|
"grad_norm": 2.9553010407276807,
|
||
|
|
"learning_rate": 9.673651815186186e-07,
|
||
|
|
"loss": 0.0058,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 322512898.0,
|
||
|
|
"step": 388
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4861297607421875,
|
||
|
|
"epoch": 4.370786516853933,
|
||
|
|
"grad_norm": 0.5559132664588449,
|
||
|
|
"learning_rate": 9.551560874527385e-07,
|
||
|
|
"loss": 0.0018,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 323304527.0,
|
||
|
|
"step": 389
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47039794921875,
|
||
|
|
"epoch": 4.382022471910112,
|
||
|
|
"grad_norm": 2.5011975751042828,
|
||
|
|
"learning_rate": 9.43006308729432e-07,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 324144587.0,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.464385986328125,
|
||
|
|
"epoch": 4.393258426966292,
|
||
|
|
"grad_norm": 3.587804289407284,
|
||
|
|
"learning_rate": 9.309163118477954e-07,
|
||
|
|
"loss": 0.007,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 324983109.0,
|
||
|
|
"step": 391
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4694671630859375,
|
||
|
|
"epoch": 4.404494382022472,
|
||
|
|
"grad_norm": 2.6638824884363426,
|
||
|
|
"learning_rate": 9.188865610115572e-07,
|
||
|
|
"loss": 0.0041,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 325801705.0,
|
||
|
|
"step": 392
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47499847412109375,
|
||
|
|
"epoch": 4.415730337078652,
|
||
|
|
"grad_norm": 6.179339696907779,
|
||
|
|
"learning_rate": 9.069175181112597e-07,
|
||
|
|
"loss": 0.0059,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 326592032.0,
|
||
|
|
"step": 393
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4689788818359375,
|
||
|
|
"epoch": 4.426966292134831,
|
||
|
|
"grad_norm": 6.644329248384315,
|
||
|
|
"learning_rate": 8.950096427065232e-07,
|
||
|
|
"loss": 0.0106,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 327393253.0,
|
||
|
|
"step": 394
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47740936279296875,
|
||
|
|
"epoch": 4.438202247191011,
|
||
|
|
"grad_norm": 1.8175645575817623,
|
||
|
|
"learning_rate": 8.831633920083968e-07,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 328202359.0,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4592132568359375,
|
||
|
|
"epoch": 4.449438202247191,
|
||
|
|
"grad_norm": 1.984527870774781,
|
||
|
|
"learning_rate": 8.713792208618097e-07,
|
||
|
|
"loss": 0.0154,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 329038112.0,
|
||
|
|
"step": 396
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47894287109375,
|
||
|
|
"epoch": 4.460674157303371,
|
||
|
|
"grad_norm": 0.2554716930753162,
|
||
|
|
"learning_rate": 8.596575817281036e-07,
|
||
|
|
"loss": 0.0015,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 329827400.0,
|
||
|
|
"step": 397
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47702789306640625,
|
||
|
|
"epoch": 4.47191011235955,
|
||
|
|
"grad_norm": 1.639558137815307,
|
||
|
|
"learning_rate": 8.479989246676595e-07,
|
||
|
|
"loss": 0.0022,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 330624750.0,
|
||
|
|
"step": 398
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4718017578125,
|
||
|
|
"epoch": 4.48314606741573,
|
||
|
|
"grad_norm": 1.3769833756904124,
|
||
|
|
"learning_rate": 8.36403697322618e-07,
|
||
|
|
"loss": 0.0089,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 331441801.0,
|
||
|
|
"step": 399
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47125244140625,
|
||
|
|
"epoch": 4.49438202247191,
|
||
|
|
"grad_norm": 1.3987859409727157,
|
||
|
|
"learning_rate": 8.248723448996942e-07,
|
||
|
|
"loss": 0.0023,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 332277359.0,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48626708984375,
|
||
|
|
"epoch": 4.50561797752809,
|
||
|
|
"grad_norm": 2.9221300133808876,
|
||
|
|
"learning_rate": 8.134053101530814e-07,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 333072813.0,
|
||
|
|
"step": 401
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.469268798828125,
|
||
|
|
"epoch": 4.51685393258427,
|
||
|
|
"grad_norm": 1.4654016785797133,
|
||
|
|
"learning_rate": 8.020030333674498e-07,
|
||
|
|
"loss": 0.0023,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 333897777.0,
|
||
|
|
"step": 402
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4660797119140625,
|
||
|
|
"epoch": 4.52808988764045,
|
||
|
|
"grad_norm": 4.200397575849315,
|
||
|
|
"learning_rate": 7.906659523410445e-07,
|
||
|
|
"loss": 0.0085,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 334695942.0,
|
||
|
|
"step": 403
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46422576904296875,
|
||
|
|
"epoch": 4.539325842696629,
|
||
|
|
"grad_norm": 0.2658219325430272,
|
||
|
|
"learning_rate": 7.793945023688756e-07,
|
||
|
|
"loss": 0.0015,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 335532907.0,
|
||
|
|
"step": 404
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47092437744140625,
|
||
|
|
"epoch": 4.550561797752809,
|
||
|
|
"grad_norm": 3.1678718102940757,
|
||
|
|
"learning_rate": 7.681891162260016e-07,
|
||
|
|
"loss": 0.0059,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 336377217.0,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47515869140625,
|
||
|
|
"epoch": 4.561797752808989,
|
||
|
|
"grad_norm": 3.518189377752025,
|
||
|
|
"learning_rate": 7.570502241509162e-07,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 337185785.0,
|
||
|
|
"step": 406
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46327972412109375,
|
||
|
|
"epoch": 4.573033707865169,
|
||
|
|
"grad_norm": 0.9320317151845362,
|
||
|
|
"learning_rate": 7.459782538290289e-07,
|
||
|
|
"loss": 0.0023,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 338041393.0,
|
||
|
|
"step": 407
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47505950927734375,
|
||
|
|
"epoch": 4.584269662921348,
|
||
|
|
"grad_norm": 3.001322377230109,
|
||
|
|
"learning_rate": 7.349736303762392e-07,
|
||
|
|
"loss": 0.0056,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 338852941.0,
|
||
|
|
"step": 408
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.470977783203125,
|
||
|
|
"epoch": 4.595505617977528,
|
||
|
|
"grad_norm": 1.8828580251184197,
|
||
|
|
"learning_rate": 7.240367763226214e-07,
|
||
|
|
"loss": 0.0037,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 339676128.0,
|
||
|
|
"step": 409
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47037506103515625,
|
||
|
|
"epoch": 4.606741573033708,
|
||
|
|
"grad_norm": 1.6546966909357694,
|
||
|
|
"learning_rate": 7.13168111596193e-07,
|
||
|
|
"loss": 0.0049,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 340476500.0,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47141265869140625,
|
||
|
|
"epoch": 4.617977528089888,
|
||
|
|
"grad_norm": 3.08125141351816,
|
||
|
|
"learning_rate": 7.023680535067998e-07,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 341300800.0,
|
||
|
|
"step": 411
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46193695068359375,
|
||
|
|
"epoch": 4.629213483146067,
|
||
|
|
"grad_norm": 1.4950626456597516,
|
||
|
|
"learning_rate": 6.916370167300846e-07,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 342142892.0,
|
||
|
|
"step": 412
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.463287353515625,
|
||
|
|
"epoch": 4.640449438202247,
|
||
|
|
"grad_norm": 0.9730184386637767,
|
||
|
|
"learning_rate": 6.809754132915722e-07,
|
||
|
|
"loss": 0.0023,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 342972325.0,
|
||
|
|
"step": 413
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45442962646484375,
|
||
|
|
"epoch": 4.651685393258427,
|
||
|
|
"grad_norm": 2.602421273978658,
|
||
|
|
"learning_rate": 6.70383652550847e-07,
|
||
|
|
"loss": 0.0042,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 343795139.0,
|
||
|
|
"step": 414
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4595947265625,
|
||
|
|
"epoch": 4.662921348314606,
|
||
|
|
"grad_norm": 2.3960654933272565,
|
||
|
|
"learning_rate": 6.59862141185832e-07,
|
||
|
|
"loss": 0.0041,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 344644354.0,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45831298828125,
|
||
|
|
"epoch": 4.674157303370786,
|
||
|
|
"grad_norm": 2.8939251687083365,
|
||
|
|
"learning_rate": 6.494112831771801e-07,
|
||
|
|
"loss": 0.0063,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 345494609.0,
|
||
|
|
"step": 416
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45064544677734375,
|
||
|
|
"epoch": 4.685393258426966,
|
||
|
|
"grad_norm": 3.5912419311907327,
|
||
|
|
"learning_rate": 6.390314797927601e-07,
|
||
|
|
"loss": 0.003,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 346343237.0,
|
||
|
|
"step": 417
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4564056396484375,
|
||
|
|
"epoch": 4.696629213483146,
|
||
|
|
"grad_norm": 0.40858073845148135,
|
||
|
|
"learning_rate": 6.28723129572247e-07,
|
||
|
|
"loss": 0.0016,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 347197500.0,
|
||
|
|
"step": 418
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46680450439453125,
|
||
|
|
"epoch": 4.707865168539326,
|
||
|
|
"grad_norm": 0.251709990148425,
|
||
|
|
"learning_rate": 6.184866283118254e-07,
|
||
|
|
"loss": 0.0015,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 348020958.0,
|
||
|
|
"step": 419
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4546051025390625,
|
||
|
|
"epoch": 4.719101123595506,
|
||
|
|
"grad_norm": 0.25235852831165106,
|
||
|
|
"learning_rate": 6.083223690489901e-07,
|
||
|
|
"loss": 0.0015,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 348861858.0,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46105194091796875,
|
||
|
|
"epoch": 4.730337078651686,
|
||
|
|
"grad_norm": 3.858488259482445,
|
||
|
|
"learning_rate": 5.982307420474501e-07,
|
||
|
|
"loss": 0.0048,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 349689679.0,
|
||
|
|
"step": 421
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46041107177734375,
|
||
|
|
"epoch": 4.741573033707866,
|
||
|
|
"grad_norm": 1.7123619920632656,
|
||
|
|
"learning_rate": 5.882121347821537e-07,
|
||
|
|
"loss": 0.002,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 350521593.0,
|
||
|
|
"step": 422
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.464263916015625,
|
||
|
|
"epoch": 4.752808988764045,
|
||
|
|
"grad_norm": 1.354957155560715,
|
||
|
|
"learning_rate": 5.782669319244058e-07,
|
||
|
|
"loss": 0.0019,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 351359395.0,
|
||
|
|
"step": 423
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45088958740234375,
|
||
|
|
"epoch": 4.764044943820225,
|
||
|
|
"grad_norm": 2.9102479875837206,
|
||
|
|
"learning_rate": 5.683955153270959e-07,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 352212880.0,
|
||
|
|
"step": 424
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.460906982421875,
|
||
|
|
"epoch": 4.775280898876405,
|
||
|
|
"grad_norm": 2.001881536246789,
|
||
|
|
"learning_rate": 5.585982640100416e-07,
|
||
|
|
"loss": 0.0049,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 353039297.0,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45557403564453125,
|
||
|
|
"epoch": 4.786516853932584,
|
||
|
|
"grad_norm": 0.27617622178466006,
|
||
|
|
"learning_rate": 5.488755541454335e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 353892046.0,
|
||
|
|
"step": 426
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.462371826171875,
|
||
|
|
"epoch": 4.797752808988764,
|
||
|
|
"grad_norm": 0.2591913786994451,
|
||
|
|
"learning_rate": 5.39227759043392e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 354714658.0,
|
||
|
|
"step": 427
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4490509033203125,
|
||
|
|
"epoch": 4.808988764044944,
|
||
|
|
"grad_norm": 2.4389875882288288,
|
||
|
|
"learning_rate": 5.296552491376322e-07,
|
||
|
|
"loss": 0.0085,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 355576030.0,
|
||
|
|
"step": 428
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45798492431640625,
|
||
|
|
"epoch": 4.820224719101123,
|
||
|
|
"grad_norm": 0.6433650960240046,
|
||
|
|
"learning_rate": 5.201583919712441e-07,
|
||
|
|
"loss": 0.0017,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 356392627.0,
|
||
|
|
"step": 429
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4479827880859375,
|
||
|
|
"epoch": 4.831460674157303,
|
||
|
|
"grad_norm": 0.44501458316967063,
|
||
|
|
"learning_rate": 5.107375521825791e-07,
|
||
|
|
"loss": 0.0017,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 357241360.0,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44903564453125,
|
||
|
|
"epoch": 4.842696629213483,
|
||
|
|
"grad_norm": 5.289790657216879,
|
||
|
|
"learning_rate": 5.013930914912477e-07,
|
||
|
|
"loss": 0.0152,
|
||
|
|
"mean_token_accuracy": 0.9960937502328306,
|
||
|
|
"num_tokens": 358079861.0,
|
||
|
|
"step": 431
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44850921630859375,
|
||
|
|
"epoch": 4.853932584269663,
|
||
|
|
"grad_norm": 0.485665337525698,
|
||
|
|
"learning_rate": 4.921253686842323e-07,
|
||
|
|
"loss": 0.0016,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 358944222.0,
|
||
|
|
"step": 432
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4571380615234375,
|
||
|
|
"epoch": 4.865168539325842,
|
||
|
|
"grad_norm": 2.5252971133780107,
|
||
|
|
"learning_rate": 4.829347396021142e-07,
|
||
|
|
"loss": 0.0141,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 359793305.0,
|
||
|
|
"step": 433
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48027801513671875,
|
||
|
|
"epoch": 4.876404494382022,
|
||
|
|
"grad_norm": 0.6691097376008813,
|
||
|
|
"learning_rate": 4.7382155712540484e-07,
|
||
|
|
"loss": 0.0101,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 360562830.0,
|
||
|
|
"step": 434
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46465301513671875,
|
||
|
|
"epoch": 4.887640449438202,
|
||
|
|
"grad_norm": 0.43370214444423943,
|
||
|
|
"learning_rate": 4.6478617116100244e-07,
|
||
|
|
"loss": 0.0015,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 361364772.0,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46259307861328125,
|
||
|
|
"epoch": 4.898876404494382,
|
||
|
|
"grad_norm": 2.123319638794875,
|
||
|
|
"learning_rate": 4.5582892862875457e-07,
|
||
|
|
"loss": 0.0027,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 362205959.0,
|
||
|
|
"step": 436
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46250152587890625,
|
||
|
|
"epoch": 4.910112359550562,
|
||
|
|
"grad_norm": 0.2966276056762264,
|
||
|
|
"learning_rate": 4.469501734481363e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 363013439.0,
|
||
|
|
"step": 437
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46131134033203125,
|
||
|
|
"epoch": 4.921348314606742,
|
||
|
|
"grad_norm": 0.2595312919643192,
|
||
|
|
"learning_rate": 4.3815024652504897e-07,
|
||
|
|
"loss": 0.0015,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 363838845.0,
|
||
|
|
"step": 438
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4611930847167969,
|
||
|
|
"epoch": 4.932584269662922,
|
||
|
|
"grad_norm": 0.27109994066163073,
|
||
|
|
"learning_rate": 4.294294857387285e-07,
|
||
|
|
"loss": 0.0015,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 364673110.0,
|
||
|
|
"step": 439
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45296478271484375,
|
||
|
|
"epoch": 4.943820224719101,
|
||
|
|
"grad_norm": 0.24175150457269287,
|
||
|
|
"learning_rate": 4.2078822592877074e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 365535079.0,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47109222412109375,
|
||
|
|
"epoch": 4.955056179775281,
|
||
|
|
"grad_norm": 4.033092061276572,
|
||
|
|
"learning_rate": 4.122267988822792e-07,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 366360305.0,
|
||
|
|
"step": 441
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.455108642578125,
|
||
|
|
"epoch": 4.966292134831461,
|
||
|
|
"grad_norm": 3.0118455984225885,
|
||
|
|
"learning_rate": 4.0374553332112374e-07,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 367193457.0,
|
||
|
|
"step": 442
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46689605712890625,
|
||
|
|
"epoch": 4.97752808988764,
|
||
|
|
"grad_norm": 0.281003717289293,
|
||
|
|
"learning_rate": 3.953447548893169e-07,
|
||
|
|
"loss": 0.0015,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 367997504.0,
|
||
|
|
"step": 443
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4514923095703125,
|
||
|
|
"epoch": 4.98876404494382,
|
||
|
|
"grad_norm": 0.6207453388018358,
|
||
|
|
"learning_rate": 3.8702478614051353e-07,
|
||
|
|
"loss": 0.0016,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 368860805.0,
|
||
|
|
"step": 444
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4544677734375,
|
||
|
|
"epoch": 5.0,
|
||
|
|
"grad_norm": 0.5793207742642461,
|
||
|
|
"learning_rate": 3.787859465256258e-07,
|
||
|
|
"loss": 0.0015,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 369686022.0,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4565582275390625,
|
||
|
|
"epoch": 5.01123595505618,
|
||
|
|
"grad_norm": 3.2248965312750055,
|
||
|
|
"learning_rate": 3.706285523805578e-07,
|
||
|
|
"loss": 0.0044,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 370534769.0,
|
||
|
|
"step": 446
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47351837158203125,
|
||
|
|
"epoch": 5.022471910112359,
|
||
|
|
"grad_norm": 0.23588422102883916,
|
||
|
|
"learning_rate": 3.625529169140565e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 371333765.0,
|
||
|
|
"step": 447
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46183013916015625,
|
||
|
|
"epoch": 5.033707865168539,
|
||
|
|
"grad_norm": 0.26051034061662975,
|
||
|
|
"learning_rate": 3.545593501956901e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 372158519.0,
|
||
|
|
"step": 448
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46891021728515625,
|
||
|
|
"epoch": 5.044943820224719,
|
||
|
|
"grad_norm": 0.7897977948934947,
|
||
|
|
"learning_rate": 3.4664815914394106e-07,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 372972872.0,
|
||
|
|
"step": 449
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4658966064453125,
|
||
|
|
"epoch": 5.056179775280899,
|
||
|
|
"grad_norm": 0.23854614330513269,
|
||
|
|
"learning_rate": 3.3881964751441984e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 373764531.0,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4539337158203125,
|
||
|
|
"epoch": 5.067415730337078,
|
||
|
|
"grad_norm": 0.2582411348351849,
|
||
|
|
"learning_rate": 3.3107411588820527e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 374610508.0,
|
||
|
|
"step": 451
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44156646728515625,
|
||
|
|
"epoch": 5.078651685393258,
|
||
|
|
"grad_norm": 0.23327005612977159,
|
||
|
|
"learning_rate": 3.2341186166030214e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 375494315.0,
|
||
|
|
"step": 452
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46539306640625,
|
||
|
|
"epoch": 5.089887640449438,
|
||
|
|
"grad_norm": 0.30673979741348245,
|
||
|
|
"learning_rate": 3.1583317902822127e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 376304953.0,
|
||
|
|
"step": 453
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4680938720703125,
|
||
|
|
"epoch": 5.101123595505618,
|
||
|
|
"grad_norm": 0.23035448377867607,
|
||
|
|
"learning_rate": 3.083383589806846e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 377109236.0,
|
||
|
|
"step": 454
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46471405029296875,
|
||
|
|
"epoch": 5.112359550561798,
|
||
|
|
"grad_norm": 0.22962165278983354,
|
||
|
|
"learning_rate": 3.0092768928645375e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 377915350.0,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4534149169921875,
|
||
|
|
"epoch": 5.123595505617978,
|
||
|
|
"grad_norm": 0.25075614257575074,
|
||
|
|
"learning_rate": 2.936014544832794e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 378732812.0,
|
||
|
|
"step": 456
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45656585693359375,
|
||
|
|
"epoch": 5.134831460674158,
|
||
|
|
"grad_norm": 2.0839651517450366,
|
||
|
|
"learning_rate": 2.8635993586697555e-07,
|
||
|
|
"loss": 0.003,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 379535203.0,
|
||
|
|
"step": 457
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4586334228515625,
|
||
|
|
"epoch": 5.146067415730337,
|
||
|
|
"grad_norm": 0.22756596903598503,
|
||
|
|
"learning_rate": 2.792034114806211e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 380351317.0,
|
||
|
|
"step": 458
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45229339599609375,
|
||
|
|
"epoch": 5.157303370786517,
|
||
|
|
"grad_norm": 0.22757916112457438,
|
||
|
|
"learning_rate": 2.7213215610388364e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 381191082.0,
|
||
|
|
"step": 459
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44696807861328125,
|
||
|
|
"epoch": 5.168539325842697,
|
||
|
|
"grad_norm": 0.22565516577479386,
|
||
|
|
"learning_rate": 2.6514644124246675e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 382014981.0,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46833038330078125,
|
||
|
|
"epoch": 5.179775280898877,
|
||
|
|
"grad_norm": 0.38263012185270356,
|
||
|
|
"learning_rate": 2.582465351176891e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 382843849.0,
|
||
|
|
"step": 461
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4453125,
|
||
|
|
"epoch": 5.191011235955056,
|
||
|
|
"grad_norm": 1.941394944381824,
|
||
|
|
"learning_rate": 2.514327026561833e-07,
|
||
|
|
"loss": 0.0114,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 383684445.0,
|
||
|
|
"step": 462
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.456756591796875,
|
||
|
|
"epoch": 5.202247191011236,
|
||
|
|
"grad_norm": 0.3217712106473879,
|
||
|
|
"learning_rate": 2.447052054797233e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 384501685.0,
|
||
|
|
"step": 463
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.437744140625,
|
||
|
|
"epoch": 5.213483146067416,
|
||
|
|
"grad_norm": 1.3017516617976264,
|
||
|
|
"learning_rate": 2.3806430189518337e-07,
|
||
|
|
"loss": 0.0099,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 385365419.0,
|
||
|
|
"step": 464
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45667266845703125,
|
||
|
|
"epoch": 5.224719101123595,
|
||
|
|
"grad_norm": 0.220187005935604,
|
||
|
|
"learning_rate": 2.3151024688461422e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 386192220.0,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46067047119140625,
|
||
|
|
"epoch": 5.235955056179775,
|
||
|
|
"grad_norm": 0.22892437261891452,
|
||
|
|
"learning_rate": 2.2504329209545846e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 387013925.0,
|
||
|
|
"step": 466
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4477996826171875,
|
||
|
|
"epoch": 5.247191011235955,
|
||
|
|
"grad_norm": 0.22510109028620062,
|
||
|
|
"learning_rate": 2.186636858308841e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 387859694.0,
|
||
|
|
"step": 467
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4438323974609375,
|
||
|
|
"epoch": 5.258426966292135,
|
||
|
|
"grad_norm": 0.22217029391538748,
|
||
|
|
"learning_rate": 2.1237167304025336e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 388717543.0,
|
||
|
|
"step": 468
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43897247314453125,
|
||
|
|
"epoch": 5.269662921348314,
|
||
|
|
"grad_norm": 1.3611717994727468,
|
||
|
|
"learning_rate": 2.0616749530971785e-07,
|
||
|
|
"loss": 0.0018,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 389594969.0,
|
||
|
|
"step": 469
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.438812255859375,
|
||
|
|
"epoch": 5.280898876404494,
|
||
|
|
"grad_norm": 4.6338923879333365,
|
||
|
|
"learning_rate": 2.0005139085293945e-07,
|
||
|
|
"loss": 0.0027,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 390462095.0,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45113372802734375,
|
||
|
|
"epoch": 5.292134831460674,
|
||
|
|
"grad_norm": 3.1222985309280444,
|
||
|
|
"learning_rate": 1.9402359450194836e-07,
|
||
|
|
"loss": 0.0024,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 391317304.0,
|
||
|
|
"step": 471
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45957183837890625,
|
||
|
|
"epoch": 5.303370786516854,
|
||
|
|
"grad_norm": 0.26983711489676676,
|
||
|
|
"learning_rate": 1.8808433769812367e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 392117648.0,
|
||
|
|
"step": 472
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.452423095703125,
|
||
|
|
"epoch": 5.314606741573034,
|
||
|
|
"grad_norm": 0.29167546299948666,
|
||
|
|
"learning_rate": 1.8223384848330723e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 392958679.0,
|
||
|
|
"step": 473
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44556427001953125,
|
||
|
|
"epoch": 5.325842696629214,
|
||
|
|
"grad_norm": 3.9531950686501087,
|
||
|
|
"learning_rate": 1.7647235149104908e-07,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 393804414.0,
|
||
|
|
"step": 474
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.465728759765625,
|
||
|
|
"epoch": 5.337078651685394,
|
||
|
|
"grad_norm": 0.2611165051574614,
|
||
|
|
"learning_rate": 1.7080006793798176e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 394606982.0,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4496002197265625,
|
||
|
|
"epoch": 5.348314606741573,
|
||
|
|
"grad_norm": 0.28831706738624335,
|
||
|
|
"learning_rate": 1.6521721561532645e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 395464681.0,
|
||
|
|
"step": 476
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.454864501953125,
|
||
|
|
"epoch": 5.359550561797753,
|
||
|
|
"grad_norm": 0.27801351147825365,
|
||
|
|
"learning_rate": 1.597240088805302e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 396272787.0,
|
||
|
|
"step": 477
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4578704833984375,
|
||
|
|
"epoch": 5.370786516853933,
|
||
|
|
"grad_norm": 3.7652732902520767,
|
||
|
|
"learning_rate": 1.54320658649037e-07,
|
||
|
|
"loss": 0.003,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 397106952.0,
|
||
|
|
"step": 478
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45423126220703125,
|
||
|
|
"epoch": 5.382022471910112,
|
||
|
|
"grad_norm": 1.0222681737176318,
|
||
|
|
"learning_rate": 1.4900737238618874e-07,
|
||
|
|
"loss": 0.0017,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 397922607.0,
|
||
|
|
"step": 479
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4500579833984375,
|
||
|
|
"epoch": 5.393258426966292,
|
||
|
|
"grad_norm": 3.2055117041350987,
|
||
|
|
"learning_rate": 1.4378435409925868e-07,
|
||
|
|
"loss": 0.0064,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 398762139.0,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44394683837890625,
|
||
|
|
"epoch": 5.404494382022472,
|
||
|
|
"grad_norm": 0.27203919336113547,
|
||
|
|
"learning_rate": 1.3865180432961977e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 399606807.0,
|
||
|
|
"step": 481
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4582977294921875,
|
||
|
|
"epoch": 5.415730337078652,
|
||
|
|
"grad_norm": 0.2626129462808237,
|
||
|
|
"learning_rate": 1.3360992014504414e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 400433406.0,
|
||
|
|
"step": 482
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4539947509765625,
|
||
|
|
"epoch": 5.426966292134831,
|
||
|
|
"grad_norm": 1.3110655834081466,
|
||
|
|
"learning_rate": 1.286588951321363e-07,
|
||
|
|
"loss": 0.0018,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 401267844.0,
|
||
|
|
"step": 483
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45575714111328125,
|
||
|
|
"epoch": 5.438202247191011,
|
||
|
|
"grad_norm": 0.30139852738632256,
|
||
|
|
"learning_rate": 1.237989193889e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 402087652.0,
|
||
|
|
"step": 484
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4398040771484375,
|
||
|
|
"epoch": 5.449438202247191,
|
||
|
|
"grad_norm": 0.2772895954120305,
|
||
|
|
"learning_rate": 1.1903017951744144e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 402960724.0,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45752716064453125,
|
||
|
|
"epoch": 5.460674157303371,
|
||
|
|
"grad_norm": 0.28043078838229096,
|
||
|
|
"learning_rate": 1.1435285861680106e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 403770083.0,
|
||
|
|
"step": 486
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4523468017578125,
|
||
|
|
"epoch": 5.47191011235955,
|
||
|
|
"grad_norm": 0.290486600050334,
|
||
|
|
"learning_rate": 1.0976713627592561e-07,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 404586068.0,
|
||
|
|
"step": 487
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.448974609375,
|
||
|
|
"epoch": 5.48314606741573,
|
||
|
|
"grad_norm": 0.2602876207673978,
|
||
|
|
"learning_rate": 1.0527318856677293e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 405433958.0,
|
||
|
|
"step": 488
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4537506103515625,
|
||
|
|
"epoch": 5.49438202247191,
|
||
|
|
"grad_norm": 0.2255688115426861,
|
||
|
|
"learning_rate": 1.0087118803755069e-07,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 406236735.0,
|
||
|
|
"step": 489
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44225311279296875,
|
||
|
|
"epoch": 5.50561797752809,
|
||
|
|
"grad_norm": 0.23839279319796375,
|
||
|
|
"learning_rate": 9.656130370609057e-08,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 407098652.0,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44016265869140625,
|
||
|
|
"epoch": 5.51685393258427,
|
||
|
|
"grad_norm": 0.4044689455010622,
|
||
|
|
"learning_rate": 9.234370105336039e-08,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 407954406.0,
|
||
|
|
"step": 491
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4402008056640625,
|
||
|
|
"epoch": 5.52808988764045,
|
||
|
|
"grad_norm": 0.2430544169170237,
|
||
|
|
"learning_rate": 8.821854201711027e-08,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 408805475.0,
|
||
|
|
"step": 492
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45285797119140625,
|
||
|
|
"epoch": 5.539325842696629,
|
||
|
|
"grad_norm": 1.8753482122817826,
|
||
|
|
"learning_rate": 8.418598498565217e-08,
|
||
|
|
"loss": 0.0077,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 409625911.0,
|
||
|
|
"step": 493
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44989013671875,
|
||
|
|
"epoch": 5.550561797752809,
|
||
|
|
"grad_norm": 0.22453644772138204,
|
||
|
|
"learning_rate": 8.024618479178237e-08,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 410448377.0,
|
||
|
|
"step": 494
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45960235595703125,
|
||
|
|
"epoch": 5.561797752808989,
|
||
|
|
"grad_norm": 0.22608759145228952,
|
||
|
|
"learning_rate": 7.639929270683438e-08,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 411254496.0,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.447357177734375,
|
||
|
|
"epoch": 5.573033707865169,
|
||
|
|
"grad_norm": 0.22231664608313986,
|
||
|
|
"learning_rate": 7.264545643486997e-08,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 412093632.0,
|
||
|
|
"step": 496
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44928741455078125,
|
||
|
|
"epoch": 5.584269662921348,
|
||
|
|
"grad_norm": 0.7770114677040165,
|
||
|
|
"learning_rate": 6.898482010701036e-08,
|
||
|
|
"loss": 0.0014,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 412912725.0,
|
||
|
|
"step": 497
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45952606201171875,
|
||
|
|
"epoch": 5.595505617977528,
|
||
|
|
"grad_norm": 0.22205123157021062,
|
||
|
|
"learning_rate": 6.541752427590004e-08,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 413704471.0,
|
||
|
|
"step": 498
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45113372802734375,
|
||
|
|
"epoch": 5.606741573033708,
|
||
|
|
"grad_norm": 0.21940329918848647,
|
||
|
|
"learning_rate": 6.194370591031174e-08,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 414537095.0,
|
||
|
|
"step": 499
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44696807861328125,
|
||
|
|
"epoch": 5.617977528089888,
|
||
|
|
"grad_norm": 0.2289707035324365,
|
||
|
|
"learning_rate": 5.856349838988612e-08,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 415392981.0,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45949554443359375,
|
||
|
|
"epoch": 5.629213483146067,
|
||
|
|
"grad_norm": 0.2211713582495607,
|
||
|
|
"learning_rate": 5.5277031500011734e-08,
|
||
|
|
"loss": 0.0013,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 416223543.0,
|
||
|
|
"step": 501
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4473876953125,
|
||
|
|
"epoch": 5.640449438202247,
|
||
|
|
"grad_norm": 1.182133604490211,
|
||
|
|
"learning_rate": 5.208443142684094e-08,
|
||
|
|
"loss": 0.0104,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 417060727.0,
|
||
|
|
"step": 502
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44683074951171875,
|
||
|
|
"epoch": 5.651685393258427,
|
||
|
|
"grad_norm": 0.21884403629787713,
|
||
|
|
"learning_rate": 4.8985820752445177e-08,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 417890059.0,
|
||
|
|
"step": 503
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44339752197265625,
|
||
|
|
"epoch": 5.662921348314606,
|
||
|
|
"grad_norm": 0.21637706969512882,
|
||
|
|
"learning_rate": 4.5981318450109e-08,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 418737697.0,
|
||
|
|
"step": 504
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45195770263671875,
|
||
|
|
"epoch": 5.674157303370786,
|
||
|
|
"grad_norm": 0.2173727523833432,
|
||
|
|
"learning_rate": 4.307103987976041e-08,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 419554252.0,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45410919189453125,
|
||
|
|
"epoch": 5.685393258426966,
|
||
|
|
"grad_norm": 0.21430018532606634,
|
||
|
|
"learning_rate": 4.0255096783543e-08,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 420379301.0,
|
||
|
|
"step": 506
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44255828857421875,
|
||
|
|
"epoch": 5.696629213483146,
|
||
|
|
"grad_norm": 0.21992408445816547,
|
||
|
|
"learning_rate": 3.75335972815255e-08,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 421251190.0,
|
||
|
|
"step": 507
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45482635498046875,
|
||
|
|
"epoch": 5.707865168539326,
|
||
|
|
"grad_norm": 0.21487068594235248,
|
||
|
|
"learning_rate": 3.4906645867549547e-08,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 422076483.0,
|
||
|
|
"step": 508
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45025634765625,
|
||
|
|
"epoch": 5.719101123595506,
|
||
|
|
"grad_norm": 0.22095064700201622,
|
||
|
|
"learning_rate": 3.237434340521789e-08,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 422892953.0,
|
||
|
|
"step": 509
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4595794677734375,
|
||
|
|
"epoch": 5.730337078651686,
|
||
|
|
"grad_norm": 0.21347314128040096,
|
||
|
|
"learning_rate": 2.993678712402221e-08,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 423725323.0,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44622802734375,
|
||
|
|
"epoch": 5.741573033707866,
|
||
|
|
"grad_norm": 0.21762320050905556,
|
||
|
|
"learning_rate": 2.7594070615609426e-08,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 424568879.0,
|
||
|
|
"step": 511
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4506378173828125,
|
||
|
|
"epoch": 5.752808988764045,
|
||
|
|
"grad_norm": 0.2399851374545116,
|
||
|
|
"learning_rate": 2.5346283830187667e-08,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 425422220.0,
|
||
|
|
"step": 512
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45760345458984375,
|
||
|
|
"epoch": 5.764044943820225,
|
||
|
|
"grad_norm": 3.618323663153434,
|
||
|
|
"learning_rate": 2.319351307307427e-08,
|
||
|
|
"loss": 0.0064,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 426238802.0,
|
||
|
|
"step": 513
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.463531494140625,
|
||
|
|
"epoch": 5.775280898876405,
|
||
|
|
"grad_norm": 3.020973889001048,
|
||
|
|
"learning_rate": 2.1135841001380386e-08,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 427029167.0,
|
||
|
|
"step": 514
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.449920654296875,
|
||
|
|
"epoch": 5.786516853932584,
|
||
|
|
"grad_norm": 0.21466505136379285,
|
||
|
|
"learning_rate": 1.917334662083714e-08,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 427868982.0,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4319610595703125,
|
||
|
|
"epoch": 5.797752808988764,
|
||
|
|
"grad_norm": 0.21193776091797573,
|
||
|
|
"learning_rate": 1.7306105282764162e-08,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 428747997.0,
|
||
|
|
"step": 516
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4481201171875,
|
||
|
|
"epoch": 5.808988764044944,
|
||
|
|
"grad_norm": 3.581869315768432,
|
||
|
|
"learning_rate": 1.55341886811744e-08,
|
||
|
|
"loss": 0.0027,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 429565485.0,
|
||
|
|
"step": 517
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.450286865234375,
|
||
|
|
"epoch": 5.820224719101123,
|
||
|
|
"grad_norm": 0.21526921468453963,
|
||
|
|
"learning_rate": 1.3857664850022157e-08,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 430396686.0,
|
||
|
|
"step": 518
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.455841064453125,
|
||
|
|
"epoch": 5.831460674157303,
|
||
|
|
"grad_norm": 0.21786690914163995,
|
||
|
|
"learning_rate": 1.2276598160590736e-08,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 431218771.0,
|
||
|
|
"step": 519
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44765472412109375,
|
||
|
|
"epoch": 5.842696629213483,
|
||
|
|
"grad_norm": 0.22217126637320583,
|
||
|
|
"learning_rate": 1.0791049319021086e-08,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 432065865.0,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45168304443359375,
|
||
|
|
"epoch": 5.853932584269663,
|
||
|
|
"grad_norm": 0.2152958362352817,
|
||
|
|
"learning_rate": 9.401075363981438e-09,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 432899235.0,
|
||
|
|
"step": 521
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44405364990234375,
|
||
|
|
"epoch": 5.865168539325842,
|
||
|
|
"grad_norm": 0.2167849625384711,
|
||
|
|
"learning_rate": 8.106729664475178e-09,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 433737697.0,
|
||
|
|
"step": 522
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.445068359375,
|
||
|
|
"epoch": 5.876404494382022,
|
||
|
|
"grad_norm": 0.22185722429726154,
|
||
|
|
"learning_rate": 6.908061917794417e-09,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 434580120.0,
|
||
|
|
"step": 523
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43703460693359375,
|
||
|
|
"epoch": 5.887640449438202,
|
||
|
|
"grad_norm": 0.2189020524852438,
|
||
|
|
"learning_rate": 5.805118147610145e-09,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 435459272.0,
|
||
|
|
"step": 524
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.456207275390625,
|
||
|
|
"epoch": 5.898876404494382,
|
||
|
|
"grad_norm": 0.22301334936598696,
|
||
|
|
"learning_rate": 4.797940702205572e-09,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 436268641.0,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4462432861328125,
|
||
|
|
"epoch": 5.910112359550562,
|
||
|
|
"grad_norm": 0.2160379000791954,
|
||
|
|
"learning_rate": 3.8865682528504975e-09,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 437126939.0,
|
||
|
|
"step": 526
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44385528564453125,
|
||
|
|
"epoch": 5.921348314606742,
|
||
|
|
"grad_norm": 4.116469262583432,
|
||
|
|
"learning_rate": 3.071035792315269e-09,
|
||
|
|
"loss": 0.0178,
|
||
|
|
"mean_token_accuracy": 0.9973958334885538,
|
||
|
|
"num_tokens": 437951169.0,
|
||
|
|
"step": 527
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45404815673828125,
|
||
|
|
"epoch": 5.932584269662922,
|
||
|
|
"grad_norm": 4.6861858731438435,
|
||
|
|
"learning_rate": 2.351374633528802e-09,
|
||
|
|
"loss": 0.0046,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 438768686.0,
|
||
|
|
"step": 528
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.472015380859375,
|
||
|
|
"epoch": 5.943820224719101,
|
||
|
|
"grad_norm": 0.21812174178615043,
|
||
|
|
"learning_rate": 1.7276124083753788e-09,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 439558604.0,
|
||
|
|
"step": 529
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44561004638671875,
|
||
|
|
"epoch": 5.955056179775281,
|
||
|
|
"grad_norm": 0.22063825113739127,
|
||
|
|
"learning_rate": 1.1997730666338248e-09,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 440395957.0,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4581298828125,
|
||
|
|
"epoch": 5.966292134831461,
|
||
|
|
"grad_norm": 1.4192369358126131,
|
||
|
|
"learning_rate": 7.678768750579713e-10,
|
||
|
|
"loss": 0.0111,
|
||
|
|
"mean_token_accuracy": 0.9986979167442769,
|
||
|
|
"num_tokens": 441212037.0,
|
||
|
|
"step": 531
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45439910888671875,
|
||
|
|
"epoch": 5.97752808988764,
|
||
|
|
"grad_norm": 0.2134790958132504,
|
||
|
|
"learning_rate": 4.3194041659866405e-10,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 442053098.0,
|
||
|
|
"step": 532
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43907928466796875,
|
||
|
|
"epoch": 5.98876404494382,
|
||
|
|
"grad_norm": 0.21385742865545188,
|
||
|
|
"learning_rate": 1.9197658976677358e-10,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 442888433.0,
|
||
|
|
"step": 533
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44598388671875,
|
||
|
|
"epoch": 6.0,
|
||
|
|
"grad_norm": 0.2177543367348665,
|
||
|
|
"learning_rate": 4.799460813803558e-11,
|
||
|
|
"loss": 0.0012,
|
||
|
|
"mean_token_accuracy": 1.0,
|
||
|
|
"num_tokens": 443721575.0,
|
||
|
|
"step": 534
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.0,
|
||
|
|
"step": 534,
|
||
|
|
"total_flos": 522066352668672.0,
|
||
|
|
"train_loss": 0.5458187263237473,
|
||
|
|
"train_runtime": 71514.3468,
|
||
|
|
"train_samples_per_second": 3.454,
|
||
|
|
"train_steps_per_second": 0.007
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 1,
|
||
|
|
"max_steps": 534,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 6,
|
||
|
|
"save_steps": 45,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 522066352668672.0,
|
||
|
|
"train_batch_size": 1,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|