1427 lines
39 KiB
JSON
1427 lines
39 KiB
JSON
[
|
|
{
|
|
"loss": 5.295790405273437,
|
|
"grad_norm": 17.75,
|
|
"learning_rate": 1.6937553464499572e-06,
|
|
"entropy": 1.3026689371466638,
|
|
"num_tokens": 788631.0,
|
|
"mean_token_accuracy": 0.45803309440612794,
|
|
"epoch": 0.02567723712928489,
|
|
"step": 100
|
|
},
|
|
{
|
|
"loss": 1.9093310546875,
|
|
"grad_norm": 5.78125,
|
|
"learning_rate": 3.4046193327630456e-06,
|
|
"entropy": 1.1995259793102742,
|
|
"num_tokens": 1575343.0,
|
|
"mean_token_accuracy": 0.7073830207437277,
|
|
"epoch": 0.05135447425856978,
|
|
"step": 200
|
|
},
|
|
{
|
|
"loss": 0.7623916625976562,
|
|
"grad_norm": 0.93359375,
|
|
"learning_rate": 5.1154833190761344e-06,
|
|
"entropy": 0.7093755914270878,
|
|
"num_tokens": 2362291.0,
|
|
"mean_token_accuracy": 0.8655054874718189,
|
|
"epoch": 0.07703171138785467,
|
|
"step": 300
|
|
},
|
|
{
|
|
"loss": 0.6790202331542968,
|
|
"grad_norm": 0.75,
|
|
"learning_rate": 6.826347305389223e-06,
|
|
"entropy": 0.6651146249473094,
|
|
"num_tokens": 3150439.0,
|
|
"mean_token_accuracy": 0.8720192441344261,
|
|
"epoch": 0.10270894851713956,
|
|
"step": 400
|
|
},
|
|
{
|
|
"loss": 0.661846923828125,
|
|
"grad_norm": 0.82421875,
|
|
"learning_rate": 8.537211291702311e-06,
|
|
"entropy": 0.6505232656002045,
|
|
"num_tokens": 3939189.0,
|
|
"mean_token_accuracy": 0.8728892022371292,
|
|
"epoch": 0.12838618564642446,
|
|
"step": 500
|
|
},
|
|
{
|
|
"eval_loss": 0.6569298505783081,
|
|
"eval_runtime": 66.9323,
|
|
"eval_samples_per_second": 201.905,
|
|
"eval_steps_per_second": 25.249,
|
|
"eval_entropy": 0.6593257126723521,
|
|
"eval_num_tokens": 3939189.0,
|
|
"eval_mean_token_accuracy": 0.8742751985964691,
|
|
"epoch": 0.12838618564642446,
|
|
"step": 500
|
|
},
|
|
{
|
|
"loss": 0.6417784881591797,
|
|
"grad_norm": 0.65625,
|
|
"learning_rate": 1.02480752780154e-05,
|
|
"entropy": 0.6309234929084778,
|
|
"num_tokens": 4726987.0,
|
|
"mean_token_accuracy": 0.8763968905806542,
|
|
"epoch": 0.15406342277570934,
|
|
"step": 600
|
|
},
|
|
{
|
|
"loss": 0.6324351119995117,
|
|
"grad_norm": 0.58984375,
|
|
"learning_rate": 1.1958939264328486e-05,
|
|
"entropy": 0.6225168199837208,
|
|
"num_tokens": 5514520.0,
|
|
"mean_token_accuracy": 0.8769305641949177,
|
|
"epoch": 0.17974065990499422,
|
|
"step": 700
|
|
},
|
|
{
|
|
"loss": 0.6248664474487304,
|
|
"grad_norm": 0.5703125,
|
|
"learning_rate": 1.3669803250641576e-05,
|
|
"entropy": 0.6152851846814156,
|
|
"num_tokens": 6303997.0,
|
|
"mean_token_accuracy": 0.8783293107151985,
|
|
"epoch": 0.20541789703427912,
|
|
"step": 800
|
|
},
|
|
{
|
|
"loss": 0.6181363677978515,
|
|
"grad_norm": 0.4453125,
|
|
"learning_rate": 1.538066723695466e-05,
|
|
"entropy": 0.607969797924161,
|
|
"num_tokens": 7092051.0,
|
|
"mean_token_accuracy": 0.8795923219621181,
|
|
"epoch": 0.231095134163564,
|
|
"step": 900
|
|
},
|
|
{
|
|
"loss": 0.6079271697998047,
|
|
"grad_norm": 0.63671875,
|
|
"learning_rate": 1.7091531223267753e-05,
|
|
"entropy": 0.5982871637493372,
|
|
"num_tokens": 7879814.0,
|
|
"mean_token_accuracy": 0.8805274599790573,
|
|
"epoch": 0.2567723712928489,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"eval_loss": 0.6138819456100464,
|
|
"eval_runtime": 66.9074,
|
|
"eval_samples_per_second": 201.981,
|
|
"eval_steps_per_second": 25.259,
|
|
"eval_entropy": 0.5835022156584192,
|
|
"eval_num_tokens": 7879814.0,
|
|
"eval_mean_token_accuracy": 0.8795767288941604,
|
|
"epoch": 0.2567723712928489,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"loss": 0.6031342315673828,
|
|
"grad_norm": 0.4765625,
|
|
"learning_rate": 1.8802395209580838e-05,
|
|
"entropy": 0.5937141847610473,
|
|
"num_tokens": 8669131.0,
|
|
"mean_token_accuracy": 0.8811740911006928,
|
|
"epoch": 0.2824496084221338,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"loss": 0.6006821060180664,
|
|
"grad_norm": 0.416015625,
|
|
"learning_rate": 1.999959838659769e-05,
|
|
"entropy": 0.5919705433398486,
|
|
"num_tokens": 9457145.0,
|
|
"mean_token_accuracy": 0.8807326038181782,
|
|
"epoch": 0.30812684555141867,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"loss": 0.5951248550415039,
|
|
"grad_norm": 0.408203125,
|
|
"learning_rate": 1.9992459490144817e-05,
|
|
"entropy": 0.5857468252629041,
|
|
"num_tokens": 10246533.0,
|
|
"mean_token_accuracy": 0.8820273293554783,
|
|
"epoch": 0.33380408268070355,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"loss": 0.5904627990722656,
|
|
"grad_norm": 0.380859375,
|
|
"learning_rate": 1.9976403184682326e-05,
|
|
"entropy": 0.5813354634493589,
|
|
"num_tokens": 11033964.0,
|
|
"mean_token_accuracy": 0.8821511951088905,
|
|
"epoch": 0.35948131980998843,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"loss": 0.5805374526977539,
|
|
"grad_norm": 0.48828125,
|
|
"learning_rate": 1.9951443799079215e-05,
|
|
"entropy": 0.5730234136432409,
|
|
"num_tokens": 11822337.0,
|
|
"mean_token_accuracy": 0.8843117669224739,
|
|
"epoch": 0.3851585569392733,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"eval_loss": 0.587958812713623,
|
|
"eval_runtime": 66.7304,
|
|
"eval_samples_per_second": 202.516,
|
|
"eval_steps_per_second": 25.326,
|
|
"eval_entropy": 0.5761568091501146,
|
|
"eval_num_tokens": 11822337.0,
|
|
"eval_mean_token_accuracy": 0.8826649507827308,
|
|
"epoch": 0.3851585569392733,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"loss": 0.5845616149902344,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 1.99176036074363e-05,
|
|
"entropy": 0.5758610642701387,
|
|
"num_tokens": 12609317.0,
|
|
"mean_token_accuracy": 0.8831216642260551,
|
|
"epoch": 0.41083579406855825,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"loss": 0.579046745300293,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 1.9874912809208492e-05,
|
|
"entropy": 0.570131861642003,
|
|
"num_tokens": 13396496.0,
|
|
"mean_token_accuracy": 0.8837999847531318,
|
|
"epoch": 0.4365130311978431,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"loss": 0.5719428634643555,
|
|
"grad_norm": 0.388671875,
|
|
"learning_rate": 1.9823409502254395e-05,
|
|
"entropy": 0.5643735866248608,
|
|
"num_tokens": 14184641.0,
|
|
"mean_token_accuracy": 0.8844822055101395,
|
|
"epoch": 0.462190268327128,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"loss": 0.566561393737793,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 1.976313964883724e-05,
|
|
"entropy": 0.5589940486103296,
|
|
"num_tokens": 14972736.0,
|
|
"mean_token_accuracy": 0.8854228469729424,
|
|
"epoch": 0.4878675054564129,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"loss": 0.5729906463623047,
|
|
"grad_norm": 0.400390625,
|
|
"learning_rate": 1.969415703460754e-05,
|
|
"entropy": 0.5654201730340719,
|
|
"num_tokens": 15760765.0,
|
|
"mean_token_accuracy": 0.884299693107605,
|
|
"epoch": 0.5135447425856978,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"eval_loss": 0.574901282787323,
|
|
"eval_runtime": 66.9041,
|
|
"eval_samples_per_second": 201.991,
|
|
"eval_steps_per_second": 25.26,
|
|
"eval_entropy": 0.5581040822895321,
|
|
"eval_num_tokens": 15760765.0,
|
|
"eval_mean_token_accuracy": 0.8844290988685112,
|
|
"epoch": 0.5135447425856978,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"loss": 0.5724992752075195,
|
|
"grad_norm": 0.470703125,
|
|
"learning_rate": 1.9616523220604026e-05,
|
|
"entropy": 0.5640381355583668,
|
|
"num_tokens": 16548995.0,
|
|
"mean_token_accuracy": 0.8841193398833275,
|
|
"epoch": 0.5392219797149826,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"loss": 0.565244369506836,
|
|
"grad_norm": 0.404296875,
|
|
"learning_rate": 1.9530307488315705e-05,
|
|
"entropy": 0.5574718941748142,
|
|
"num_tokens": 17338195.0,
|
|
"mean_token_accuracy": 0.8858554971218109,
|
|
"epoch": 0.5648992168442676,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"loss": 0.5641956329345703,
|
|
"grad_norm": 0.375,
|
|
"learning_rate": 1.943558677785414e-05,
|
|
"entropy": 0.556980236619711,
|
|
"num_tokens": 18126378.0,
|
|
"mean_token_accuracy": 0.8855182483792305,
|
|
"epoch": 0.5905764539735524,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"loss": 0.5643896865844726,
|
|
"grad_norm": 0.447265625,
|
|
"learning_rate": 1.9332445619291003e-05,
|
|
"entropy": 0.5556083285063506,
|
|
"num_tokens": 18915415.0,
|
|
"mean_token_accuracy": 0.8858178888261318,
|
|
"epoch": 0.6162536911028373,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"loss": 0.5537351608276367,
|
|
"grad_norm": 0.361328125,
|
|
"learning_rate": 1.9220976057222272e-05,
|
|
"entropy": 0.5470526535063982,
|
|
"num_tokens": 19703215.0,
|
|
"mean_token_accuracy": 0.8877282282710075,
|
|
"epoch": 0.6419309282321223,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"eval_loss": 0.566738486289978,
|
|
"eval_runtime": 66.8788,
|
|
"eval_samples_per_second": 202.067,
|
|
"eval_steps_per_second": 25.27,
|
|
"eval_entropy": 0.5507401840750282,
|
|
"eval_num_tokens": 19703215.0,
|
|
"eval_mean_token_accuracy": 0.8854550624740194,
|
|
"epoch": 0.6419309282321223,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"loss": 0.5606909561157226,
|
|
"grad_norm": 0.4296875,
|
|
"learning_rate": 1.9101277568626374e-05,
|
|
"entropy": 0.5524309245496988,
|
|
"num_tokens": 20491378.0,
|
|
"mean_token_accuracy": 0.886026524156332,
|
|
"epoch": 0.6676081653614071,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"loss": 0.5588908386230469,
|
|
"grad_norm": 0.41015625,
|
|
"learning_rate": 1.8973456974089533e-05,
|
|
"entropy": 0.5509618154168129,
|
|
"num_tokens": 21281188.0,
|
|
"mean_token_accuracy": 0.8863780727982521,
|
|
"epoch": 0.693285402490692,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"loss": 0.5587222290039062,
|
|
"grad_norm": 0.439453125,
|
|
"learning_rate": 1.883762834247763e-05,
|
|
"entropy": 0.5505382239073515,
|
|
"num_tokens": 22068585.0,
|
|
"mean_token_accuracy": 0.8866781835258007,
|
|
"epoch": 0.7189626396199769,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"loss": 0.5535868453979492,
|
|
"grad_norm": 0.359375,
|
|
"learning_rate": 1.8693912889139548e-05,
|
|
"entropy": 0.5467930260300636,
|
|
"num_tokens": 22857792.0,
|
|
"mean_token_accuracy": 0.8876498517394066,
|
|
"epoch": 0.7446398767492618,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"loss": 0.5560418701171875,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 1.8542438867732926e-05,
|
|
"entropy": 0.5481385685503483,
|
|
"num_tokens": 23646674.0,
|
|
"mean_token_accuracy": 0.8865408559143543,
|
|
"epoch": 0.7703171138785466,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"eval_loss": 0.5608235001564026,
|
|
"eval_runtime": 66.9694,
|
|
"eval_samples_per_second": 201.794,
|
|
"eval_steps_per_second": 25.235,
|
|
"eval_entropy": 0.5475869985727163,
|
|
"eval_num_tokens": 23646674.0,
|
|
"eval_mean_token_accuracy": 0.8862869346988271,
|
|
"epoch": 0.7703171138785466,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"loss": 0.5510826873779296,
|
|
"grad_norm": 0.392578125,
|
|
"learning_rate": 1.8383341455768818e-05,
|
|
"entropy": 0.5442652675509453,
|
|
"num_tokens": 24435112.0,
|
|
"mean_token_accuracy": 0.8880468539893627,
|
|
"epoch": 0.7959943510078316,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"loss": 0.5556667709350586,
|
|
"grad_norm": 0.46484375,
|
|
"learning_rate": 1.821676263397742e-05,
|
|
"entropy": 0.5477713013440371,
|
|
"num_tokens": 25222534.0,
|
|
"mean_token_accuracy": 0.8866990077495575,
|
|
"epoch": 0.8216715881371165,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"loss": 0.5512593460083007,
|
|
"grad_norm": 0.36328125,
|
|
"learning_rate": 1.80428510596025e-05,
|
|
"entropy": 0.5437313695996999,
|
|
"num_tokens": 26010061.0,
|
|
"mean_token_accuracy": 0.8885632981359959,
|
|
"epoch": 0.8473488252664013,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"loss": 0.5498195266723633,
|
|
"grad_norm": 0.39453125,
|
|
"learning_rate": 1.7861761933737617e-05,
|
|
"entropy": 0.5414738351106644,
|
|
"num_tokens": 26798369.0,
|
|
"mean_token_accuracy": 0.8879594984650612,
|
|
"epoch": 0.8730260623956863,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"loss": 0.5532369232177734,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 1.7673656862822515e-05,
|
|
"entropy": 0.5464992509037256,
|
|
"num_tokens": 27588005.0,
|
|
"mean_token_accuracy": 0.8868030488491059,
|
|
"epoch": 0.8987032995249711,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"eval_loss": 0.5567756295204163,
|
|
"eval_runtime": 66.7863,
|
|
"eval_samples_per_second": 202.347,
|
|
"eval_steps_per_second": 25.305,
|
|
"eval_entropy": 0.5490564776595528,
|
|
"eval_num_tokens": 27588005.0,
|
|
"eval_mean_token_accuracy": 0.8867901984051134,
|
|
"epoch": 0.8987032995249711,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"loss": 0.5537870025634766,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 1.7478703714423316e-05,
|
|
"entropy": 0.54683793194592,
|
|
"num_tokens": 28377750.0,
|
|
"mean_token_accuracy": 0.8869239047169686,
|
|
"epoch": 0.924380536654256,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"loss": 0.5514320755004882,
|
|
"grad_norm": 0.435546875,
|
|
"learning_rate": 1.7277076467425163e-05,
|
|
"entropy": 0.5437256157398224,
|
|
"num_tokens": 29166016.0,
|
|
"mean_token_accuracy": 0.8873179040849208,
|
|
"epoch": 0.9500577737835408,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"loss": 0.5542150115966797,
|
|
"grad_norm": 0.37109375,
|
|
"learning_rate": 1.706895505677108e-05,
|
|
"entropy": 0.5465144612640143,
|
|
"num_tokens": 29953000.0,
|
|
"mean_token_accuracy": 0.887007016390562,
|
|
"epoch": 0.9757350109128258,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"loss": 0.5507764053344727,
|
|
"grad_norm": 0.443359375,
|
|
"learning_rate": 1.6854525212885517e-05,
|
|
"entropy": 0.5444752973827285,
|
|
"num_tokens": 30735372.0,
|
|
"mean_token_accuracy": 0.887075167355226,
|
|
"epoch": 1.0012838618564643,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"loss": 0.5405293273925781,
|
|
"grad_norm": 0.404296875,
|
|
"learning_rate": 1.6633978295925973e-05,
|
|
"entropy": 0.5336830996721983,
|
|
"num_tokens": 31523776.0,
|
|
"mean_token_accuracy": 0.8888550646603107,
|
|
"epoch": 1.0269610989857492,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"eval_loss": 0.5542467832565308,
|
|
"eval_runtime": 66.8867,
|
|
"eval_samples_per_second": 202.043,
|
|
"eval_steps_per_second": 25.267,
|
|
"eval_entropy": 0.5430249788175673,
|
|
"eval_num_tokens": 31523776.0,
|
|
"eval_mean_token_accuracy": 0.8871892735450225,
|
|
"epoch": 1.0269610989857492,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"loss": 0.5439878845214844,
|
|
"grad_norm": 0.408203125,
|
|
"learning_rate": 1.6407511125010535e-05,
|
|
"entropy": 0.5366661065071822,
|
|
"num_tokens": 32312483.0,
|
|
"mean_token_accuracy": 0.8886200107634068,
|
|
"epoch": 1.052638336115034,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"loss": 0.538640251159668,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 1.6175325802573762e-05,
|
|
"entropy": 0.5315170773863792,
|
|
"num_tokens": 33100106.0,
|
|
"mean_token_accuracy": 0.8894328561425209,
|
|
"epoch": 1.0783155732443188,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"loss": 0.5469233703613281,
|
|
"grad_norm": 0.392578125,
|
|
"learning_rate": 1.593762953400771e-05,
|
|
"entropy": 0.5396519158780575,
|
|
"num_tokens": 33887447.0,
|
|
"mean_token_accuracy": 0.8873359954357147,
|
|
"epoch": 1.1039928103736039,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"loss": 0.5387564086914063,
|
|
"grad_norm": 0.375,
|
|
"learning_rate": 1.569463444274896e-05,
|
|
"entropy": 0.5312051647901535,
|
|
"num_tokens": 34676497.0,
|
|
"mean_token_accuracy": 0.8895892894268036,
|
|
"epoch": 1.1296700475028887,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"loss": 0.5374182510375977,
|
|
"grad_norm": 0.359375,
|
|
"learning_rate": 1.5446557380976705e-05,
|
|
"entropy": 0.5307135570794344,
|
|
"num_tokens": 35464531.0,
|
|
"mean_token_accuracy": 0.8893489798903466,
|
|
"epoch": 1.1553472846321735,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"eval_loss": 0.5518077611923218,
|
|
"eval_runtime": 66.9817,
|
|
"eval_samples_per_second": 201.756,
|
|
"eval_steps_per_second": 25.231,
|
|
"eval_entropy": 0.538807505527897,
|
|
"eval_num_tokens": 35464531.0,
|
|
"eval_mean_token_accuracy": 0.8874176931804454,
|
|
"epoch": 1.1553472846321735,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"loss": 0.5373505783081055,
|
|
"grad_norm": 0.34375,
|
|
"learning_rate": 1.5193619736090915e-05,
|
|
"entropy": 0.5305563137680293,
|
|
"num_tokens": 36252194.0,
|
|
"mean_token_accuracy": 0.8893280589580536,
|
|
"epoch": 1.1810245217614586,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"loss": 0.5385945892333984,
|
|
"grad_norm": 0.3984375,
|
|
"learning_rate": 1.4936047233143121e-05,
|
|
"entropy": 0.5316550326347351,
|
|
"num_tokens": 37040269.0,
|
|
"mean_token_accuracy": 0.8889729425311088,
|
|
"epoch": 1.2067017588907434,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"loss": 0.5383005142211914,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 1.4674069733396277e-05,
|
|
"entropy": 0.5314889302104712,
|
|
"num_tokens": 37828541.0,
|
|
"mean_token_accuracy": 0.8891593493521214,
|
|
"epoch": 1.2323789960200282,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"loss": 0.5413074493408203,
|
|
"grad_norm": 0.41015625,
|
|
"learning_rate": 1.4407921029193386e-05,
|
|
"entropy": 0.5336076222360134,
|
|
"num_tokens": 38617419.0,
|
|
"mean_token_accuracy": 0.8887772466242313,
|
|
"epoch": 1.258056233149313,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"loss": 0.5325155639648438,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 1.4137838635317981e-05,
|
|
"entropy": 0.5255533574521541,
|
|
"num_tokens": 39405754.0,
|
|
"mean_token_accuracy": 0.8911273476481437,
|
|
"epoch": 1.283733470278598,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"eval_loss": 0.5503326654434204,
|
|
"eval_runtime": 66.9521,
|
|
"eval_samples_per_second": 201.846,
|
|
"eval_steps_per_second": 25.242,
|
|
"eval_entropy": 0.5397688833037777,
|
|
"eval_num_tokens": 39405754.0,
|
|
"eval_mean_token_accuracy": 0.8876568412639686,
|
|
"epoch": 1.283733470278598,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"loss": 0.5346319198608398,
|
|
"grad_norm": 0.37890625,
|
|
"learning_rate": 1.3864063577032644e-05,
|
|
"entropy": 0.5269782991707325,
|
|
"num_tokens": 40194187.0,
|
|
"mean_token_accuracy": 0.8896128107607365,
|
|
"epoch": 1.309410707407883,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"loss": 0.5403598022460937,
|
|
"grad_norm": 0.40625,
|
|
"learning_rate": 1.3586840174984741e-05,
|
|
"entropy": 0.5327016961574554,
|
|
"num_tokens": 40982840.0,
|
|
"mean_token_accuracy": 0.8892854882776737,
|
|
"epoch": 1.3350879445371677,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"loss": 0.5382755661010742,
|
|
"grad_norm": 0.421875,
|
|
"learning_rate": 1.3306415827171285e-05,
|
|
"entropy": 0.5321682307869195,
|
|
"num_tokens": 41771135.0,
|
|
"mean_token_accuracy": 0.8891067253053189,
|
|
"epoch": 1.3607651816664528,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"loss": 0.5331235885620117,
|
|
"grad_norm": 0.380859375,
|
|
"learning_rate": 1.3023040788157542e-05,
|
|
"entropy": 0.5263735573738814,
|
|
"num_tokens": 42560001.0,
|
|
"mean_token_accuracy": 0.8908107495307922,
|
|
"epoch": 1.3864424187957376,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"loss": 0.5401896667480469,
|
|
"grad_norm": 0.416015625,
|
|
"learning_rate": 1.2736967945746414e-05,
|
|
"entropy": 0.5332227103412152,
|
|
"num_tokens": 43348221.0,
|
|
"mean_token_accuracy": 0.8889072981476783,
|
|
"epoch": 1.4121196559250224,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"eval_loss": 0.5492498278617859,
|
|
"eval_runtime": 67.3776,
|
|
"eval_samples_per_second": 200.571,
|
|
"eval_steps_per_second": 25.083,
|
|
"eval_entropy": 0.537609067100745,
|
|
"eval_num_tokens": 43348221.0,
|
|
"eval_mean_token_accuracy": 0.8878510412379835,
|
|
"epoch": 1.4121196559250224,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"loss": 0.5376063156127929,
|
|
"grad_norm": 0.404296875,
|
|
"learning_rate": 1.244845259529785e-05,
|
|
"entropy": 0.530102282166481,
|
|
"num_tokens": 44137063.0,
|
|
"mean_token_accuracy": 0.8888533094525337,
|
|
"epoch": 1.4377968930543075,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"loss": 0.5347390747070313,
|
|
"grad_norm": 0.3828125,
|
|
"learning_rate": 1.2157752211899743e-05,
|
|
"entropy": 0.5278809563070536,
|
|
"num_tokens": 44925913.0,
|
|
"mean_token_accuracy": 0.890110841691494,
|
|
"epoch": 1.4634741301835923,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"loss": 0.5396208572387695,
|
|
"grad_norm": 0.396484375,
|
|
"learning_rate": 1.1865126220593606e-05,
|
|
"entropy": 0.5324571677297354,
|
|
"num_tokens": 45713041.0,
|
|
"mean_token_accuracy": 0.8887992192804813,
|
|
"epoch": 1.4891513673128771,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"loss": 0.5366056060791016,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 1.157083576486007e-05,
|
|
"entropy": 0.5297523141652346,
|
|
"num_tokens": 46501563.0,
|
|
"mean_token_accuracy": 0.8896933840215207,
|
|
"epoch": 1.5148286044421622,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"loss": 0.5389834213256836,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 1.127514347357083e-05,
|
|
"entropy": 0.5312013550847768,
|
|
"num_tokens": 47289330.0,
|
|
"mean_token_accuracy": 0.889183943271637,
|
|
"epoch": 1.5405058415714468,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"eval_loss": 0.5483260750770569,
|
|
"eval_runtime": 67.2323,
|
|
"eval_samples_per_second": 201.005,
|
|
"eval_steps_per_second": 25.137,
|
|
"eval_entropy": 0.5363286497148536,
|
|
"eval_num_tokens": 47289330.0,
|
|
"eval_mean_token_accuracy": 0.8879236852628944,
|
|
"epoch": 1.5405058415714468,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"loss": 0.5336640548706054,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 1.097831322661502e-05,
|
|
"entropy": 0.5288673885911703,
|
|
"num_tokens": 48077668.0,
|
|
"mean_token_accuracy": 0.8898833976686,
|
|
"epoch": 1.5661830787007318,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"loss": 0.5375140762329101,
|
|
"grad_norm": 0.3671875,
|
|
"learning_rate": 1.0680609919409147e-05,
|
|
"entropy": 0.5301950394362211,
|
|
"num_tokens": 48866262.0,
|
|
"mean_token_accuracy": 0.8895807403326035,
|
|
"epoch": 1.5918603158300166,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"loss": 0.5402913665771485,
|
|
"grad_norm": 0.431640625,
|
|
"learning_rate": 1.0382299226500746e-05,
|
|
"entropy": 0.5333884459733963,
|
|
"num_tokens": 49654335.0,
|
|
"mean_token_accuracy": 0.888499419093132,
|
|
"epoch": 1.6175375529593015,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"loss": 0.5385772323608399,
|
|
"grad_norm": 0.431640625,
|
|
"learning_rate": 1.0083647364476762e-05,
|
|
"entropy": 0.531462483778596,
|
|
"num_tokens": 50442084.0,
|
|
"mean_token_accuracy": 0.8892641182243824,
|
|
"epoch": 1.6432147900885865,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"loss": 0.5353133010864258,
|
|
"grad_norm": 0.38671875,
|
|
"learning_rate": 9.784920854388168e-06,
|
|
"entropy": 0.527387068271637,
|
|
"num_tokens": 51229616.0,
|
|
"mean_token_accuracy": 0.8898960523307323,
|
|
"epoch": 1.6688920272178713,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"eval_loss": 0.5475577116012573,
|
|
"eval_runtime": 66.8604,
|
|
"eval_samples_per_second": 202.122,
|
|
"eval_steps_per_second": 25.277,
|
|
"eval_entropy": 0.5349308549123403,
|
|
"eval_num_tokens": 51229616.0,
|
|
"eval_mean_token_accuracy": 0.8880981687610672,
|
|
"epoch": 1.6688920272178713,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"loss": 0.5429517364501953,
|
|
"grad_norm": 0.380859375,
|
|
"learning_rate": 9.486386283902909e-06,
|
|
"entropy": 0.537000098451972,
|
|
"num_tokens": 52018364.0,
|
|
"mean_token_accuracy": 0.8879952408373356,
|
|
"epoch": 1.6945692643471562,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"loss": 0.53443115234375,
|
|
"grad_norm": 0.447265625,
|
|
"learning_rate": 9.188310069399368e-06,
|
|
"entropy": 0.5280120493471623,
|
|
"num_tokens": 52807100.0,
|
|
"mean_token_accuracy": 0.8897963106632233,
|
|
"epoch": 1.7202465014764412,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"loss": 0.5356635665893554,
|
|
"grad_norm": 0.40625,
|
|
"learning_rate": 8.890958218212716e-06,
|
|
"entropy": 0.5287372674047947,
|
|
"num_tokens": 53594818.0,
|
|
"mean_token_accuracy": 0.8895081703364849,
|
|
"epoch": 1.745923738605726,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"loss": 0.5367589569091797,
|
|
"grad_norm": 0.369140625,
|
|
"learning_rate": 8.594596091246282e-06,
|
|
"entropy": 0.5299282168596983,
|
|
"num_tokens": 54382275.0,
|
|
"mean_token_accuracy": 0.8896372695267201,
|
|
"epoch": 1.7716009757350109,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"loss": 0.5325223159790039,
|
|
"grad_norm": 0.3984375,
|
|
"learning_rate": 8.299488166159817e-06,
|
|
"entropy": 0.5243838762491941,
|
|
"num_tokens": 55171058.0,
|
|
"mean_token_accuracy": 0.890430568009615,
|
|
"epoch": 1.797278212864296,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"eval_loss": 0.5471183061599731,
|
|
"eval_runtime": 67.121,
|
|
"eval_samples_per_second": 201.338,
|
|
"eval_steps_per_second": 25.178,
|
|
"eval_entropy": 0.5362202901106614,
|
|
"eval_num_tokens": 55171058.0,
|
|
"eval_mean_token_accuracy": 0.8881139952402849,
|
|
"epoch": 1.797278212864296,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"loss": 0.5369550704956054,
|
|
"grad_norm": 0.41015625,
|
|
"learning_rate": 8.005897801345976e-06,
|
|
"entropy": 0.5303717163205147,
|
|
"num_tokens": 55960002.0,
|
|
"mean_token_accuracy": 0.8894575189054013,
|
|
"epoch": 1.8229554499935807,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"loss": 0.5351543045043945,
|
|
"grad_norm": 0.3671875,
|
|
"learning_rate": 7.714087000905643e-06,
|
|
"entropy": 0.5274955333024264,
|
|
"num_tokens": 56748093.0,
|
|
"mean_token_accuracy": 0.8897032625973225,
|
|
"epoch": 1.8486326871228655,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"loss": 0.5317428207397461,
|
|
"grad_norm": 0.396484375,
|
|
"learning_rate": 7.4243161808318465e-06,
|
|
"entropy": 0.5234256482124329,
|
|
"num_tokens": 57536251.0,
|
|
"mean_token_accuracy": 0.8904528062045575,
|
|
"epoch": 1.8743099242521506,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"loss": 0.5373667526245117,
|
|
"grad_norm": 0.462890625,
|
|
"learning_rate": 7.136843936610935e-06,
|
|
"entropy": 0.5299951387941837,
|
|
"num_tokens": 58324650.0,
|
|
"mean_token_accuracy": 0.8894152472913265,
|
|
"epoch": 1.8999871613814352,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"loss": 0.5356111907958985,
|
|
"grad_norm": 0.4375,
|
|
"learning_rate": 6.851926812448384e-06,
|
|
"entropy": 0.5281449986249208,
|
|
"num_tokens": 59112829.0,
|
|
"mean_token_accuracy": 0.8894895881414413,
|
|
"epoch": 1.9256643985107202,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"eval_loss": 0.5469039082527161,
|
|
"eval_runtime": 67.2936,
|
|
"eval_samples_per_second": 200.821,
|
|
"eval_steps_per_second": 25.114,
|
|
"eval_entropy": 0.5319856126809261,
|
|
"eval_num_tokens": 59112829.0,
|
|
"eval_mean_token_accuracy": 0.8881882847060819,
|
|
"epoch": 1.9256643985107202,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"loss": 0.5324957275390625,
|
|
"grad_norm": 0.373046875,
|
|
"learning_rate": 6.569819072325195e-06,
|
|
"entropy": 0.5259347888082266,
|
|
"num_tokens": 59899148.0,
|
|
"mean_token_accuracy": 0.8905223569273949,
|
|
"epoch": 1.9513416356400053,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"loss": 0.5345915985107422,
|
|
"grad_norm": 0.396484375,
|
|
"learning_rate": 6.290772473089214e-06,
|
|
"entropy": 0.5266730510443449,
|
|
"num_tokens": 60687114.0,
|
|
"mean_token_accuracy": 0.8896890124678611,
|
|
"epoch": 1.97701887276929,
|
|
"step": 7700
|
|
},
|
|
{
|
|
"loss": 0.5284463882446289,
|
|
"grad_norm": 0.37109375,
|
|
"learning_rate": 6.015036039783836e-06,
|
|
"entropy": 0.5242100227717779,
|
|
"num_tokens": 61470852.0,
|
|
"mean_token_accuracy": 0.8908302205890867,
|
|
"epoch": 2.0025677237129287,
|
|
"step": 7800
|
|
},
|
|
{
|
|
"loss": 0.5349670028686524,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 5.74285584341461e-06,
|
|
"entropy": 0.5276301963627339,
|
|
"num_tokens": 62258538.0,
|
|
"mean_token_accuracy": 0.8893012750148773,
|
|
"epoch": 2.0282449608422133,
|
|
"step": 7900
|
|
},
|
|
{
|
|
"loss": 0.5321082305908204,
|
|
"grad_norm": 0.39453125,
|
|
"learning_rate": 5.474474781352066e-06,
|
|
"entropy": 0.5241756404936314,
|
|
"num_tokens": 63047491.0,
|
|
"mean_token_accuracy": 0.8900664694607258,
|
|
"epoch": 2.0539221979714983,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"eval_loss": 0.5467016696929932,
|
|
"eval_runtime": 67.0874,
|
|
"eval_samples_per_second": 201.439,
|
|
"eval_steps_per_second": 25.191,
|
|
"eval_entropy": 0.5306698962957901,
|
|
"eval_num_tokens": 63047491.0,
|
|
"eval_mean_token_accuracy": 0.8882128992024259,
|
|
"epoch": 2.0539221979714983,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"loss": 0.5294083023071289,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 5.210132360566756e-06,
|
|
"entropy": 0.5223005886375904,
|
|
"num_tokens": 63835062.0,
|
|
"mean_token_accuracy": 0.8909649957716465,
|
|
"epoch": 2.079599435100783,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"loss": 0.5341888427734375,
|
|
"grad_norm": 0.396484375,
|
|
"learning_rate": 4.95006448388992e-06,
|
|
"entropy": 0.5271259383112192,
|
|
"num_tokens": 64622452.0,
|
|
"mean_token_accuracy": 0.8900617562234402,
|
|
"epoch": 2.105276672230068,
|
|
"step": 8200
|
|
},
|
|
{
|
|
"loss": 0.5306560897827148,
|
|
"grad_norm": 0.455078125,
|
|
"learning_rate": 4.69450323949053e-06,
|
|
"entropy": 0.5243064795434474,
|
|
"num_tokens": 65410929.0,
|
|
"mean_token_accuracy": 0.890057615339756,
|
|
"epoch": 2.130953909359353,
|
|
"step": 8300
|
|
},
|
|
{
|
|
"loss": 0.5324376678466797,
|
|
"grad_norm": 0.3984375,
|
|
"learning_rate": 4.443676693756599e-06,
|
|
"entropy": 0.5259603321552276,
|
|
"num_tokens": 66199581.0,
|
|
"mean_token_accuracy": 0.8899568720161914,
|
|
"epoch": 2.1566311464886376,
|
|
"step": 8400
|
|
},
|
|
{
|
|
"loss": 0.5324293899536133,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 4.197808687765592e-06,
|
|
"entropy": 0.5257175669819116,
|
|
"num_tokens": 66988012.0,
|
|
"mean_token_accuracy": 0.8905250385403634,
|
|
"epoch": 2.1823083836179227,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"eval_loss": 0.5466301441192627,
|
|
"eval_runtime": 67.1045,
|
|
"eval_samples_per_second": 201.387,
|
|
"eval_steps_per_second": 25.185,
|
|
"eval_entropy": 0.5302212731901711,
|
|
"eval_num_tokens": 66988012.0,
|
|
"eval_mean_token_accuracy": 0.888240820310525,
|
|
"epoch": 2.1823083836179227,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"loss": 0.5308111572265625,
|
|
"grad_norm": 0.423828125,
|
|
"learning_rate": 3.957118637525545e-06,
|
|
"entropy": 0.5240319129824639,
|
|
"num_tokens": 67776064.0,
|
|
"mean_token_accuracy": 0.8906722447276115,
|
|
"epoch": 2.2079856207472077,
|
|
"step": 8600
|
|
},
|
|
{
|
|
"loss": 0.5349502182006836,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 3.721821338165191e-06,
|
|
"entropy": 0.5290320947766304,
|
|
"num_tokens": 68564254.0,
|
|
"mean_token_accuracy": 0.8893519747257232,
|
|
"epoch": 2.2336628578764923,
|
|
"step": 8700
|
|
},
|
|
{
|
|
"loss": 0.5332579040527343,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 3.4921267722478115e-06,
|
|
"entropy": 0.5256545479595661,
|
|
"num_tokens": 69352920.0,
|
|
"mean_token_accuracy": 0.890228152424097,
|
|
"epoch": 2.2593400950057774,
|
|
"step": 8800
|
|
},
|
|
{
|
|
"loss": 0.5388346099853516,
|
|
"grad_norm": 0.41796875,
|
|
"learning_rate": 3.2682399223799045e-06,
|
|
"entropy": 0.5306229508668184,
|
|
"num_tokens": 70141047.0,
|
|
"mean_token_accuracy": 0.889285789579153,
|
|
"epoch": 2.2850173321350624,
|
|
"step": 8900
|
|
},
|
|
{
|
|
"loss": 0.5309392929077148,
|
|
"grad_norm": 0.478515625,
|
|
"learning_rate": 3.0503605882818623e-06,
|
|
"entropy": 0.5246253449469804,
|
|
"num_tokens": 70929498.0,
|
|
"mean_token_accuracy": 0.8901907150447369,
|
|
"epoch": 2.310694569264347,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"eval_loss": 0.5465638637542725,
|
|
"eval_runtime": 67.4174,
|
|
"eval_samples_per_second": 200.453,
|
|
"eval_steps_per_second": 25.068,
|
|
"eval_entropy": 0.529443961893313,
|
|
"eval_num_tokens": 70929498.0,
|
|
"eval_mean_token_accuracy": 0.8882127779828021,
|
|
"epoch": 2.310694569264347,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"loss": 0.5335653686523437,
|
|
"grad_norm": 0.40234375,
|
|
"learning_rate": 2.838683208483931e-06,
|
|
"entropy": 0.5271350515633821,
|
|
"num_tokens": 71718369.0,
|
|
"mean_token_accuracy": 0.8902666576206684,
|
|
"epoch": 2.336371806393632,
|
|
"step": 9100
|
|
},
|
|
{
|
|
"loss": 0.5284453964233399,
|
|
"grad_norm": 0.39453125,
|
|
"learning_rate": 2.633396686806604e-06,
|
|
"entropy": 0.5240849039703608,
|
|
"num_tokens": 72506753.0,
|
|
"mean_token_accuracy": 0.8903698475658893,
|
|
"epoch": 2.362049043522917,
|
|
"step": 9200
|
|
},
|
|
{
|
|
"loss": 0.5297767639160156,
|
|
"grad_norm": 0.44921875,
|
|
"learning_rate": 2.4346842237802137e-06,
|
|
"entropy": 0.523836979046464,
|
|
"num_tokens": 73294192.0,
|
|
"mean_token_accuracy": 0.8904416194558143,
|
|
"epoch": 2.3877262806522017,
|
|
"step": 9300
|
|
},
|
|
{
|
|
"loss": 0.5338259506225586,
|
|
"grad_norm": 0.419921875,
|
|
"learning_rate": 2.2427231531542605e-06,
|
|
"entropy": 0.5259654937684536,
|
|
"num_tokens": 74082610.0,
|
|
"mean_token_accuracy": 0.8898109787702561,
|
|
"epoch": 2.4134035177814868,
|
|
"step": 9400
|
|
},
|
|
{
|
|
"loss": 0.53539306640625,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 2.057684783642321e-06,
|
|
"entropy": 0.5273031425476075,
|
|
"num_tokens": 74870230.0,
|
|
"mean_token_accuracy": 0.8893570882081986,
|
|
"epoch": 2.439080754910772,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"eval_loss": 0.5465569496154785,
|
|
"eval_runtime": 67.2693,
|
|
"eval_samples_per_second": 200.894,
|
|
"eval_steps_per_second": 25.123,
|
|
"eval_entropy": 0.5297543280576108,
|
|
"eval_num_tokens": 74870230.0,
|
|
"eval_mean_token_accuracy": 0.8882103748928161,
|
|
"epoch": 2.439080754910772,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"loss": 0.5300022888183594,
|
|
"grad_norm": 0.42578125,
|
|
"learning_rate": 1.8797342460437773e-06,
|
|
"entropy": 0.5227815758436918,
|
|
"num_tokens": 75659042.0,
|
|
"mean_token_accuracy": 0.8905887195467949,
|
|
"epoch": 2.4647579920400564,
|
|
"step": 9600
|
|
},
|
|
{
|
|
"loss": 0.5275243759155274,
|
|
"grad_norm": 0.36328125,
|
|
"learning_rate": 1.7090303458788138e-06,
|
|
"entropy": 0.5209154675900937,
|
|
"num_tokens": 76448289.0,
|
|
"mean_token_accuracy": 0.891223351508379,
|
|
"epoch": 2.4904352291693415,
|
|
"step": 9700
|
|
},
|
|
{
|
|
"loss": 0.5308470916748047,
|
|
"grad_norm": 0.396484375,
|
|
"learning_rate": 1.5457254216681706e-06,
|
|
"entropy": 0.5242900583148002,
|
|
"num_tokens": 77237168.0,
|
|
"mean_token_accuracy": 0.8901365567743779,
|
|
"epoch": 2.516112466298626,
|
|
"step": 9800
|
|
},
|
|
{
|
|
"loss": 0.5291099929809571,
|
|
"grad_norm": 0.408203125,
|
|
"learning_rate": 1.3899652089841475e-06,
|
|
"entropy": 0.5219676418602467,
|
|
"num_tokens": 78025882.0,
|
|
"mean_token_accuracy": 0.8904913778603077,
|
|
"epoch": 2.541789703427911,
|
|
"step": 9900
|
|
},
|
|
{
|
|
"loss": 0.5335563278198242,
|
|
"grad_norm": 0.404296875,
|
|
"learning_rate": 1.2418887103941613e-06,
|
|
"entropy": 0.5248580784350634,
|
|
"num_tokens": 78811967.0,
|
|
"mean_token_accuracy": 0.8904596289992333,
|
|
"epoch": 2.567466940557196,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"eval_loss": 0.5465272665023804,
|
|
"eval_runtime": 66.7472,
|
|
"eval_samples_per_second": 202.465,
|
|
"eval_steps_per_second": 25.319,
|
|
"eval_entropy": 0.5304744325269609,
|
|
"eval_num_tokens": 78811967.0,
|
|
"eval_mean_token_accuracy": 0.8882129736905973,
|
|
"epoch": 2.567466940557196,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"loss": 0.5322509765625,
|
|
"grad_norm": 0.3984375,
|
|
"learning_rate": 1.10162807141293e-06,
|
|
"entropy": 0.5262704361230135,
|
|
"num_tokens": 79600687.0,
|
|
"mean_token_accuracy": 0.8897101627290249,
|
|
"epoch": 2.593144177686481,
|
|
"step": 10100
|
|
},
|
|
{
|
|
"loss": 0.5316716384887695,
|
|
"grad_norm": 0.375,
|
|
"learning_rate": 9.693084625739946e-07,
|
|
"entropy": 0.5247344778478146,
|
|
"num_tokens": 80387505.0,
|
|
"mean_token_accuracy": 0.8906851357221603,
|
|
"epoch": 2.618821414815766,
|
|
"step": 10200
|
|
},
|
|
{
|
|
"loss": 0.5358499526977539,
|
|
"grad_norm": 0.3671875,
|
|
"learning_rate": 8.450479677257962e-07,
|
|
"entropy": 0.528542704731226,
|
|
"num_tokens": 81176251.0,
|
|
"mean_token_accuracy": 0.8893057556450367,
|
|
"epoch": 2.644498651945051,
|
|
"step": 10300
|
|
},
|
|
{
|
|
"loss": 0.5294158172607422,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 7.289574786520237e-07,
|
|
"entropy": 0.522821741476655,
|
|
"num_tokens": 81964977.0,
|
|
"mean_token_accuracy": 0.8909594585001469,
|
|
"epoch": 2.6701758890743355,
|
|
"step": 10400
|
|
},
|
|
{
|
|
"loss": 0.5307606124877929,
|
|
"grad_norm": 0.3828125,
|
|
"learning_rate": 6.211405961102512e-07,
|
|
"entropy": 0.5265567531436681,
|
|
"num_tokens": 82752889.0,
|
|
"mean_token_accuracy": 0.8900834138691426,
|
|
"epoch": 2.6958531262036205,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"eval_loss": 0.5465019941329956,
|
|
"eval_runtime": 67.3423,
|
|
"eval_samples_per_second": 200.676,
|
|
"eval_steps_per_second": 25.096,
|
|
"eval_entropy": 0.5300784435145248,
|
|
"eval_num_tokens": 82752889.0,
|
|
"eval_mean_token_accuracy": 0.8881937025919469,
|
|
"epoch": 2.6958531262036205,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"loss": 0.5329842758178711,
|
|
"grad_norm": 0.376953125,
|
|
"learning_rate": 5.216935373771859e-07,
|
|
"entropy": 0.5243370252847671,
|
|
"num_tokens": 83540835.0,
|
|
"mean_token_accuracy": 0.8904808807373047,
|
|
"epoch": 2.7215303633329055,
|
|
"step": 10600
|
|
},
|
|
{
|
|
"loss": 0.5306868362426758,
|
|
"grad_norm": 0.419921875,
|
|
"learning_rate": 4.307050503830457e-07,
|
|
"entropy": 0.5237632666528225,
|
|
"num_tokens": 84328863.0,
|
|
"mean_token_accuracy": 0.890521085858345,
|
|
"epoch": 2.74720760046219,
|
|
"step": 10700
|
|
},
|
|
{
|
|
"loss": 0.5309492874145508,
|
|
"grad_norm": 0.37890625,
|
|
"learning_rate": 3.482563345116763e-07,
|
|
"entropy": 0.5249035055935383,
|
|
"num_tokens": 85116662.0,
|
|
"mean_token_accuracy": 0.8904018171131611,
|
|
"epoch": 2.772884837591475,
|
|
"step": 10800
|
|
},
|
|
{
|
|
"loss": 0.5319928741455078,
|
|
"grad_norm": 0.390625,
|
|
"learning_rate": 2.7442096813709684e-07,
|
|
"entropy": 0.524908444583416,
|
|
"num_tokens": 85905228.0,
|
|
"mean_token_accuracy": 0.8903980639576912,
|
|
"epoch": 2.79856207472076,
|
|
"step": 10900
|
|
},
|
|
{
|
|
"loss": 0.5291864395141601,
|
|
"grad_norm": 0.443359375,
|
|
"learning_rate": 2.0926484296114324e-07,
|
|
"entropy": 0.5247332839667798,
|
|
"num_tokens": 86693156.0,
|
|
"mean_token_accuracy": 0.8905435487627983,
|
|
"epoch": 2.824239311850045,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"eval_loss": 0.5464890003204346,
|
|
"eval_runtime": 67.2701,
|
|
"eval_samples_per_second": 200.892,
|
|
"eval_steps_per_second": 25.123,
|
|
"eval_entropy": 0.5298788991729183,
|
|
"eval_num_tokens": 86693156.0,
|
|
"eval_mean_token_accuracy": 0.8882385518776594,
|
|
"epoch": 2.824239311850045,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"loss": 0.5365339279174804,
|
|
"grad_norm": 0.40625,
|
|
"learning_rate": 1.5284610521080323e-07,
|
|
"entropy": 0.5279695607721806,
|
|
"num_tokens": 87481998.0,
|
|
"mean_token_accuracy": 0.8891479071974754,
|
|
"epoch": 2.84991654897933,
|
|
"step": 11100
|
|
},
|
|
{
|
|
"loss": 0.5289971160888672,
|
|
"grad_norm": 0.380859375,
|
|
"learning_rate": 1.0521510374771781e-07,
|
|
"entropy": 0.5218742392957211,
|
|
"num_tokens": 88271222.0,
|
|
"mean_token_accuracy": 0.8910181857645512,
|
|
"epoch": 2.875593786108615,
|
|
"step": 11200
|
|
},
|
|
{
|
|
"loss": 0.537615737915039,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 6.641434513616208e-08,
|
|
"entropy": 0.5303182192891837,
|
|
"num_tokens": 89059356.0,
|
|
"mean_token_accuracy": 0.88922975435853,
|
|
"epoch": 2.9012710232378995,
|
|
"step": 11300
|
|
},
|
|
{
|
|
"loss": 0.5294175338745117,
|
|
"grad_norm": 0.37109375,
|
|
"learning_rate": 3.6478455709598735e-08,
|
|
"entropy": 0.5221536308526993,
|
|
"num_tokens": 89847466.0,
|
|
"mean_token_accuracy": 0.8907907168567181,
|
|
"epoch": 2.9269482603671846,
|
|
"step": 11400
|
|
},
|
|
{
|
|
"loss": 0.5347037124633789,
|
|
"grad_norm": 0.419921875,
|
|
"learning_rate": 1.543415066966092e-08,
|
|
"entropy": 0.5283580356836319,
|
|
"num_tokens": 90636203.0,
|
|
"mean_token_accuracy": 0.8894244636595249,
|
|
"epoch": 2.952625497496469,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"eval_loss": 0.5464821457862854,
|
|
"eval_runtime": 66.8918,
|
|
"eval_samples_per_second": 202.028,
|
|
"eval_steps_per_second": 25.265,
|
|
"eval_entropy": 0.5296846309180796,
|
|
"eval_num_tokens": 90636203.0,
|
|
"eval_mean_token_accuracy": 0.8882489815618865,
|
|
"epoch": 2.952625497496469,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"loss": 0.5325915145874024,
|
|
"grad_norm": 0.43359375,
|
|
"learning_rate": 3.3002102451362704e-09,
|
|
"entropy": 0.528514067903161,
|
|
"num_tokens": 91424572.0,
|
|
"mean_token_accuracy": 0.8899151113629341,
|
|
"epoch": 2.9783027346257542,
|
|
"step": 11600
|
|
},
|
|
{
|
|
"train_runtime": 9303.979,
|
|
"train_samples_per_second": 40.182,
|
|
"train_steps_per_second": 1.256,
|
|
"total_flos": 9.506673059457577e+17,
|
|
"train_loss": 0.6031799179292613,
|
|
"entropy": 0.525621883262544,
|
|
"num_tokens": 92088141.0,
|
|
"mean_token_accuracy": 0.890005886554718,
|
|
"epoch": 3.0,
|
|
"step": 11685
|
|
}
|
|
] |