Files
dyslexic-writer-qwen3-1.7b/training_metrics.json
ModelHub XC 4bde662064 初始化项目,由ModelHub XC社区提供模型
Model: jburnford/dyslexic-writer-qwen3-1.7b
Source: Original Platform
2026-05-07 17:41:44 +08:00

1427 lines
39 KiB
JSON

[
{
"loss": 5.295790405273437,
"grad_norm": 17.75,
"learning_rate": 1.6937553464499572e-06,
"entropy": 1.3026689371466638,
"num_tokens": 788631.0,
"mean_token_accuracy": 0.45803309440612794,
"epoch": 0.02567723712928489,
"step": 100
},
{
"loss": 1.9093310546875,
"grad_norm": 5.78125,
"learning_rate": 3.4046193327630456e-06,
"entropy": 1.1995259793102742,
"num_tokens": 1575343.0,
"mean_token_accuracy": 0.7073830207437277,
"epoch": 0.05135447425856978,
"step": 200
},
{
"loss": 0.7623916625976562,
"grad_norm": 0.93359375,
"learning_rate": 5.1154833190761344e-06,
"entropy": 0.7093755914270878,
"num_tokens": 2362291.0,
"mean_token_accuracy": 0.8655054874718189,
"epoch": 0.07703171138785467,
"step": 300
},
{
"loss": 0.6790202331542968,
"grad_norm": 0.75,
"learning_rate": 6.826347305389223e-06,
"entropy": 0.6651146249473094,
"num_tokens": 3150439.0,
"mean_token_accuracy": 0.8720192441344261,
"epoch": 0.10270894851713956,
"step": 400
},
{
"loss": 0.661846923828125,
"grad_norm": 0.82421875,
"learning_rate": 8.537211291702311e-06,
"entropy": 0.6505232656002045,
"num_tokens": 3939189.0,
"mean_token_accuracy": 0.8728892022371292,
"epoch": 0.12838618564642446,
"step": 500
},
{
"eval_loss": 0.6569298505783081,
"eval_runtime": 66.9323,
"eval_samples_per_second": 201.905,
"eval_steps_per_second": 25.249,
"eval_entropy": 0.6593257126723521,
"eval_num_tokens": 3939189.0,
"eval_mean_token_accuracy": 0.8742751985964691,
"epoch": 0.12838618564642446,
"step": 500
},
{
"loss": 0.6417784881591797,
"grad_norm": 0.65625,
"learning_rate": 1.02480752780154e-05,
"entropy": 0.6309234929084778,
"num_tokens": 4726987.0,
"mean_token_accuracy": 0.8763968905806542,
"epoch": 0.15406342277570934,
"step": 600
},
{
"loss": 0.6324351119995117,
"grad_norm": 0.58984375,
"learning_rate": 1.1958939264328486e-05,
"entropy": 0.6225168199837208,
"num_tokens": 5514520.0,
"mean_token_accuracy": 0.8769305641949177,
"epoch": 0.17974065990499422,
"step": 700
},
{
"loss": 0.6248664474487304,
"grad_norm": 0.5703125,
"learning_rate": 1.3669803250641576e-05,
"entropy": 0.6152851846814156,
"num_tokens": 6303997.0,
"mean_token_accuracy": 0.8783293107151985,
"epoch": 0.20541789703427912,
"step": 800
},
{
"loss": 0.6181363677978515,
"grad_norm": 0.4453125,
"learning_rate": 1.538066723695466e-05,
"entropy": 0.607969797924161,
"num_tokens": 7092051.0,
"mean_token_accuracy": 0.8795923219621181,
"epoch": 0.231095134163564,
"step": 900
},
{
"loss": 0.6079271697998047,
"grad_norm": 0.63671875,
"learning_rate": 1.7091531223267753e-05,
"entropy": 0.5982871637493372,
"num_tokens": 7879814.0,
"mean_token_accuracy": 0.8805274599790573,
"epoch": 0.2567723712928489,
"step": 1000
},
{
"eval_loss": 0.6138819456100464,
"eval_runtime": 66.9074,
"eval_samples_per_second": 201.981,
"eval_steps_per_second": 25.259,
"eval_entropy": 0.5835022156584192,
"eval_num_tokens": 7879814.0,
"eval_mean_token_accuracy": 0.8795767288941604,
"epoch": 0.2567723712928489,
"step": 1000
},
{
"loss": 0.6031342315673828,
"grad_norm": 0.4765625,
"learning_rate": 1.8802395209580838e-05,
"entropy": 0.5937141847610473,
"num_tokens": 8669131.0,
"mean_token_accuracy": 0.8811740911006928,
"epoch": 0.2824496084221338,
"step": 1100
},
{
"loss": 0.6006821060180664,
"grad_norm": 0.416015625,
"learning_rate": 1.999959838659769e-05,
"entropy": 0.5919705433398486,
"num_tokens": 9457145.0,
"mean_token_accuracy": 0.8807326038181782,
"epoch": 0.30812684555141867,
"step": 1200
},
{
"loss": 0.5951248550415039,
"grad_norm": 0.408203125,
"learning_rate": 1.9992459490144817e-05,
"entropy": 0.5857468252629041,
"num_tokens": 10246533.0,
"mean_token_accuracy": 0.8820273293554783,
"epoch": 0.33380408268070355,
"step": 1300
},
{
"loss": 0.5904627990722656,
"grad_norm": 0.380859375,
"learning_rate": 1.9976403184682326e-05,
"entropy": 0.5813354634493589,
"num_tokens": 11033964.0,
"mean_token_accuracy": 0.8821511951088905,
"epoch": 0.35948131980998843,
"step": 1400
},
{
"loss": 0.5805374526977539,
"grad_norm": 0.48828125,
"learning_rate": 1.9951443799079215e-05,
"entropy": 0.5730234136432409,
"num_tokens": 11822337.0,
"mean_token_accuracy": 0.8843117669224739,
"epoch": 0.3851585569392733,
"step": 1500
},
{
"eval_loss": 0.587958812713623,
"eval_runtime": 66.7304,
"eval_samples_per_second": 202.516,
"eval_steps_per_second": 25.326,
"eval_entropy": 0.5761568091501146,
"eval_num_tokens": 11822337.0,
"eval_mean_token_accuracy": 0.8826649507827308,
"epoch": 0.3851585569392733,
"step": 1500
},
{
"loss": 0.5845616149902344,
"grad_norm": 0.38671875,
"learning_rate": 1.99176036074363e-05,
"entropy": 0.5758610642701387,
"num_tokens": 12609317.0,
"mean_token_accuracy": 0.8831216642260551,
"epoch": 0.41083579406855825,
"step": 1600
},
{
"loss": 0.579046745300293,
"grad_norm": 0.40234375,
"learning_rate": 1.9874912809208492e-05,
"entropy": 0.570131861642003,
"num_tokens": 13396496.0,
"mean_token_accuracy": 0.8837999847531318,
"epoch": 0.4365130311978431,
"step": 1700
},
{
"loss": 0.5719428634643555,
"grad_norm": 0.388671875,
"learning_rate": 1.9823409502254395e-05,
"entropy": 0.5643735866248608,
"num_tokens": 14184641.0,
"mean_token_accuracy": 0.8844822055101395,
"epoch": 0.462190268327128,
"step": 1800
},
{
"loss": 0.566561393737793,
"grad_norm": 0.38671875,
"learning_rate": 1.976313964883724e-05,
"entropy": 0.5589940486103296,
"num_tokens": 14972736.0,
"mean_token_accuracy": 0.8854228469729424,
"epoch": 0.4878675054564129,
"step": 1900
},
{
"loss": 0.5729906463623047,
"grad_norm": 0.400390625,
"learning_rate": 1.969415703460754e-05,
"entropy": 0.5654201730340719,
"num_tokens": 15760765.0,
"mean_token_accuracy": 0.884299693107605,
"epoch": 0.5135447425856978,
"step": 2000
},
{
"eval_loss": 0.574901282787323,
"eval_runtime": 66.9041,
"eval_samples_per_second": 201.991,
"eval_steps_per_second": 25.26,
"eval_entropy": 0.5581040822895321,
"eval_num_tokens": 15760765.0,
"eval_mean_token_accuracy": 0.8844290988685112,
"epoch": 0.5135447425856978,
"step": 2000
},
{
"loss": 0.5724992752075195,
"grad_norm": 0.470703125,
"learning_rate": 1.9616523220604026e-05,
"entropy": 0.5640381355583668,
"num_tokens": 16548995.0,
"mean_token_accuracy": 0.8841193398833275,
"epoch": 0.5392219797149826,
"step": 2100
},
{
"loss": 0.565244369506836,
"grad_norm": 0.404296875,
"learning_rate": 1.9530307488315705e-05,
"entropy": 0.5574718941748142,
"num_tokens": 17338195.0,
"mean_token_accuracy": 0.8858554971218109,
"epoch": 0.5648992168442676,
"step": 2200
},
{
"loss": 0.5641956329345703,
"grad_norm": 0.375,
"learning_rate": 1.943558677785414e-05,
"entropy": 0.556980236619711,
"num_tokens": 18126378.0,
"mean_token_accuracy": 0.8855182483792305,
"epoch": 0.5905764539735524,
"step": 2300
},
{
"loss": 0.5643896865844726,
"grad_norm": 0.447265625,
"learning_rate": 1.9332445619291003e-05,
"entropy": 0.5556083285063506,
"num_tokens": 18915415.0,
"mean_token_accuracy": 0.8858178888261318,
"epoch": 0.6162536911028373,
"step": 2400
},
{
"loss": 0.5537351608276367,
"grad_norm": 0.361328125,
"learning_rate": 1.9220976057222272e-05,
"entropy": 0.5470526535063982,
"num_tokens": 19703215.0,
"mean_token_accuracy": 0.8877282282710075,
"epoch": 0.6419309282321223,
"step": 2500
},
{
"eval_loss": 0.566738486289978,
"eval_runtime": 66.8788,
"eval_samples_per_second": 202.067,
"eval_steps_per_second": 25.27,
"eval_entropy": 0.5507401840750282,
"eval_num_tokens": 19703215.0,
"eval_mean_token_accuracy": 0.8854550624740194,
"epoch": 0.6419309282321223,
"step": 2500
},
{
"loss": 0.5606909561157226,
"grad_norm": 0.4296875,
"learning_rate": 1.9101277568626374e-05,
"entropy": 0.5524309245496988,
"num_tokens": 20491378.0,
"mean_token_accuracy": 0.886026524156332,
"epoch": 0.6676081653614071,
"step": 2600
},
{
"loss": 0.5588908386230469,
"grad_norm": 0.41015625,
"learning_rate": 1.8973456974089533e-05,
"entropy": 0.5509618154168129,
"num_tokens": 21281188.0,
"mean_token_accuracy": 0.8863780727982521,
"epoch": 0.693285402490692,
"step": 2700
},
{
"loss": 0.5587222290039062,
"grad_norm": 0.439453125,
"learning_rate": 1.883762834247763e-05,
"entropy": 0.5505382239073515,
"num_tokens": 22068585.0,
"mean_token_accuracy": 0.8866781835258007,
"epoch": 0.7189626396199769,
"step": 2800
},
{
"loss": 0.5535868453979492,
"grad_norm": 0.359375,
"learning_rate": 1.8693912889139548e-05,
"entropy": 0.5467930260300636,
"num_tokens": 22857792.0,
"mean_token_accuracy": 0.8876498517394066,
"epoch": 0.7446398767492618,
"step": 2900
},
{
"loss": 0.5560418701171875,
"grad_norm": 0.38671875,
"learning_rate": 1.8542438867732926e-05,
"entropy": 0.5481385685503483,
"num_tokens": 23646674.0,
"mean_token_accuracy": 0.8865408559143543,
"epoch": 0.7703171138785466,
"step": 3000
},
{
"eval_loss": 0.5608235001564026,
"eval_runtime": 66.9694,
"eval_samples_per_second": 201.794,
"eval_steps_per_second": 25.235,
"eval_entropy": 0.5475869985727163,
"eval_num_tokens": 23646674.0,
"eval_mean_token_accuracy": 0.8862869346988271,
"epoch": 0.7703171138785466,
"step": 3000
},
{
"loss": 0.5510826873779296,
"grad_norm": 0.392578125,
"learning_rate": 1.8383341455768818e-05,
"entropy": 0.5442652675509453,
"num_tokens": 24435112.0,
"mean_token_accuracy": 0.8880468539893627,
"epoch": 0.7959943510078316,
"step": 3100
},
{
"loss": 0.5556667709350586,
"grad_norm": 0.46484375,
"learning_rate": 1.821676263397742e-05,
"entropy": 0.5477713013440371,
"num_tokens": 25222534.0,
"mean_token_accuracy": 0.8866990077495575,
"epoch": 0.8216715881371165,
"step": 3200
},
{
"loss": 0.5512593460083007,
"grad_norm": 0.36328125,
"learning_rate": 1.80428510596025e-05,
"entropy": 0.5437313695996999,
"num_tokens": 26010061.0,
"mean_token_accuracy": 0.8885632981359959,
"epoch": 0.8473488252664013,
"step": 3300
},
{
"loss": 0.5498195266723633,
"grad_norm": 0.39453125,
"learning_rate": 1.7861761933737617e-05,
"entropy": 0.5414738351106644,
"num_tokens": 26798369.0,
"mean_token_accuracy": 0.8879594984650612,
"epoch": 0.8730260623956863,
"step": 3400
},
{
"loss": 0.5532369232177734,
"grad_norm": 0.38671875,
"learning_rate": 1.7673656862822515e-05,
"entropy": 0.5464992509037256,
"num_tokens": 27588005.0,
"mean_token_accuracy": 0.8868030488491059,
"epoch": 0.8987032995249711,
"step": 3500
},
{
"eval_loss": 0.5567756295204163,
"eval_runtime": 66.7863,
"eval_samples_per_second": 202.347,
"eval_steps_per_second": 25.305,
"eval_entropy": 0.5490564776595528,
"eval_num_tokens": 27588005.0,
"eval_mean_token_accuracy": 0.8867901984051134,
"epoch": 0.8987032995249711,
"step": 3500
},
{
"loss": 0.5537870025634766,
"grad_norm": 0.38671875,
"learning_rate": 1.7478703714423316e-05,
"entropy": 0.54683793194592,
"num_tokens": 28377750.0,
"mean_token_accuracy": 0.8869239047169686,
"epoch": 0.924380536654256,
"step": 3600
},
{
"loss": 0.5514320755004882,
"grad_norm": 0.435546875,
"learning_rate": 1.7277076467425163e-05,
"entropy": 0.5437256157398224,
"num_tokens": 29166016.0,
"mean_token_accuracy": 0.8873179040849208,
"epoch": 0.9500577737835408,
"step": 3700
},
{
"loss": 0.5542150115966797,
"grad_norm": 0.37109375,
"learning_rate": 1.706895505677108e-05,
"entropy": 0.5465144612640143,
"num_tokens": 29953000.0,
"mean_token_accuracy": 0.887007016390562,
"epoch": 0.9757350109128258,
"step": 3800
},
{
"loss": 0.5507764053344727,
"grad_norm": 0.443359375,
"learning_rate": 1.6854525212885517e-05,
"entropy": 0.5444752973827285,
"num_tokens": 30735372.0,
"mean_token_accuracy": 0.887075167355226,
"epoch": 1.0012838618564643,
"step": 3900
},
{
"loss": 0.5405293273925781,
"grad_norm": 0.404296875,
"learning_rate": 1.6633978295925973e-05,
"entropy": 0.5336830996721983,
"num_tokens": 31523776.0,
"mean_token_accuracy": 0.8888550646603107,
"epoch": 1.0269610989857492,
"step": 4000
},
{
"eval_loss": 0.5542467832565308,
"eval_runtime": 66.8867,
"eval_samples_per_second": 202.043,
"eval_steps_per_second": 25.267,
"eval_entropy": 0.5430249788175673,
"eval_num_tokens": 31523776.0,
"eval_mean_token_accuracy": 0.8871892735450225,
"epoch": 1.0269610989857492,
"step": 4000
},
{
"loss": 0.5439878845214844,
"grad_norm": 0.408203125,
"learning_rate": 1.6407511125010535e-05,
"entropy": 0.5366661065071822,
"num_tokens": 32312483.0,
"mean_token_accuracy": 0.8886200107634068,
"epoch": 1.052638336115034,
"step": 4100
},
{
"loss": 0.538640251159668,
"grad_norm": 0.376953125,
"learning_rate": 1.6175325802573762e-05,
"entropy": 0.5315170773863792,
"num_tokens": 33100106.0,
"mean_token_accuracy": 0.8894328561425209,
"epoch": 1.0783155732443188,
"step": 4200
},
{
"loss": 0.5469233703613281,
"grad_norm": 0.392578125,
"learning_rate": 1.593762953400771e-05,
"entropy": 0.5396519158780575,
"num_tokens": 33887447.0,
"mean_token_accuracy": 0.8873359954357147,
"epoch": 1.1039928103736039,
"step": 4300
},
{
"loss": 0.5387564086914063,
"grad_norm": 0.375,
"learning_rate": 1.569463444274896e-05,
"entropy": 0.5312051647901535,
"num_tokens": 34676497.0,
"mean_token_accuracy": 0.8895892894268036,
"epoch": 1.1296700475028887,
"step": 4400
},
{
"loss": 0.5374182510375977,
"grad_norm": 0.359375,
"learning_rate": 1.5446557380976705e-05,
"entropy": 0.5307135570794344,
"num_tokens": 35464531.0,
"mean_token_accuracy": 0.8893489798903466,
"epoch": 1.1553472846321735,
"step": 4500
},
{
"eval_loss": 0.5518077611923218,
"eval_runtime": 66.9817,
"eval_samples_per_second": 201.756,
"eval_steps_per_second": 25.231,
"eval_entropy": 0.538807505527897,
"eval_num_tokens": 35464531.0,
"eval_mean_token_accuracy": 0.8874176931804454,
"epoch": 1.1553472846321735,
"step": 4500
},
{
"loss": 0.5373505783081055,
"grad_norm": 0.34375,
"learning_rate": 1.5193619736090915e-05,
"entropy": 0.5305563137680293,
"num_tokens": 36252194.0,
"mean_token_accuracy": 0.8893280589580536,
"epoch": 1.1810245217614586,
"step": 4600
},
{
"loss": 0.5385945892333984,
"grad_norm": 0.3984375,
"learning_rate": 1.4936047233143121e-05,
"entropy": 0.5316550326347351,
"num_tokens": 37040269.0,
"mean_token_accuracy": 0.8889729425311088,
"epoch": 1.2067017588907434,
"step": 4700
},
{
"loss": 0.5383005142211914,
"grad_norm": 0.43359375,
"learning_rate": 1.4674069733396277e-05,
"entropy": 0.5314889302104712,
"num_tokens": 37828541.0,
"mean_token_accuracy": 0.8891593493521214,
"epoch": 1.2323789960200282,
"step": 4800
},
{
"loss": 0.5413074493408203,
"grad_norm": 0.41015625,
"learning_rate": 1.4407921029193386e-05,
"entropy": 0.5336076222360134,
"num_tokens": 38617419.0,
"mean_token_accuracy": 0.8887772466242313,
"epoch": 1.258056233149313,
"step": 4900
},
{
"loss": 0.5325155639648438,
"grad_norm": 0.38671875,
"learning_rate": 1.4137838635317981e-05,
"entropy": 0.5255533574521541,
"num_tokens": 39405754.0,
"mean_token_accuracy": 0.8911273476481437,
"epoch": 1.283733470278598,
"step": 5000
},
{
"eval_loss": 0.5503326654434204,
"eval_runtime": 66.9521,
"eval_samples_per_second": 201.846,
"eval_steps_per_second": 25.242,
"eval_entropy": 0.5397688833037777,
"eval_num_tokens": 39405754.0,
"eval_mean_token_accuracy": 0.8876568412639686,
"epoch": 1.283733470278598,
"step": 5000
},
{
"loss": 0.5346319198608398,
"grad_norm": 0.37890625,
"learning_rate": 1.3864063577032644e-05,
"entropy": 0.5269782991707325,
"num_tokens": 40194187.0,
"mean_token_accuracy": 0.8896128107607365,
"epoch": 1.309410707407883,
"step": 5100
},
{
"loss": 0.5403598022460937,
"grad_norm": 0.40625,
"learning_rate": 1.3586840174984741e-05,
"entropy": 0.5327016961574554,
"num_tokens": 40982840.0,
"mean_token_accuracy": 0.8892854882776737,
"epoch": 1.3350879445371677,
"step": 5200
},
{
"loss": 0.5382755661010742,
"grad_norm": 0.421875,
"learning_rate": 1.3306415827171285e-05,
"entropy": 0.5321682307869195,
"num_tokens": 41771135.0,
"mean_token_accuracy": 0.8891067253053189,
"epoch": 1.3607651816664528,
"step": 5300
},
{
"loss": 0.5331235885620117,
"grad_norm": 0.380859375,
"learning_rate": 1.3023040788157542e-05,
"entropy": 0.5263735573738814,
"num_tokens": 42560001.0,
"mean_token_accuracy": 0.8908107495307922,
"epoch": 1.3864424187957376,
"step": 5400
},
{
"loss": 0.5401896667480469,
"grad_norm": 0.416015625,
"learning_rate": 1.2736967945746414e-05,
"entropy": 0.5332227103412152,
"num_tokens": 43348221.0,
"mean_token_accuracy": 0.8889072981476783,
"epoch": 1.4121196559250224,
"step": 5500
},
{
"eval_loss": 0.5492498278617859,
"eval_runtime": 67.3776,
"eval_samples_per_second": 200.571,
"eval_steps_per_second": 25.083,
"eval_entropy": 0.537609067100745,
"eval_num_tokens": 43348221.0,
"eval_mean_token_accuracy": 0.8878510412379835,
"epoch": 1.4121196559250224,
"step": 5500
},
{
"loss": 0.5376063156127929,
"grad_norm": 0.404296875,
"learning_rate": 1.244845259529785e-05,
"entropy": 0.530102282166481,
"num_tokens": 44137063.0,
"mean_token_accuracy": 0.8888533094525337,
"epoch": 1.4377968930543075,
"step": 5600
},
{
"loss": 0.5347390747070313,
"grad_norm": 0.3828125,
"learning_rate": 1.2157752211899743e-05,
"entropy": 0.5278809563070536,
"num_tokens": 44925913.0,
"mean_token_accuracy": 0.890110841691494,
"epoch": 1.4634741301835923,
"step": 5700
},
{
"loss": 0.5396208572387695,
"grad_norm": 0.396484375,
"learning_rate": 1.1865126220593606e-05,
"entropy": 0.5324571677297354,
"num_tokens": 45713041.0,
"mean_token_accuracy": 0.8887992192804813,
"epoch": 1.4891513673128771,
"step": 5800
},
{
"loss": 0.5366056060791016,
"grad_norm": 0.38671875,
"learning_rate": 1.157083576486007e-05,
"entropy": 0.5297523141652346,
"num_tokens": 46501563.0,
"mean_token_accuracy": 0.8896933840215207,
"epoch": 1.5148286044421622,
"step": 5900
},
{
"loss": 0.5389834213256836,
"grad_norm": 0.43359375,
"learning_rate": 1.127514347357083e-05,
"entropy": 0.5312013550847768,
"num_tokens": 47289330.0,
"mean_token_accuracy": 0.889183943271637,
"epoch": 1.5405058415714468,
"step": 6000
},
{
"eval_loss": 0.5483260750770569,
"eval_runtime": 67.2323,
"eval_samples_per_second": 201.005,
"eval_steps_per_second": 25.137,
"eval_entropy": 0.5363286497148536,
"eval_num_tokens": 47289330.0,
"eval_mean_token_accuracy": 0.8879236852628944,
"epoch": 1.5405058415714468,
"step": 6000
},
{
"loss": 0.5336640548706054,
"grad_norm": 0.43359375,
"learning_rate": 1.097831322661502e-05,
"entropy": 0.5288673885911703,
"num_tokens": 48077668.0,
"mean_token_accuracy": 0.8898833976686,
"epoch": 1.5661830787007318,
"step": 6100
},
{
"loss": 0.5375140762329101,
"grad_norm": 0.3671875,
"learning_rate": 1.0680609919409147e-05,
"entropy": 0.5301950394362211,
"num_tokens": 48866262.0,
"mean_token_accuracy": 0.8895807403326035,
"epoch": 1.5918603158300166,
"step": 6200
},
{
"loss": 0.5402913665771485,
"grad_norm": 0.431640625,
"learning_rate": 1.0382299226500746e-05,
"entropy": 0.5333884459733963,
"num_tokens": 49654335.0,
"mean_token_accuracy": 0.888499419093132,
"epoch": 1.6175375529593015,
"step": 6300
},
{
"loss": 0.5385772323608399,
"grad_norm": 0.431640625,
"learning_rate": 1.0083647364476762e-05,
"entropy": 0.531462483778596,
"num_tokens": 50442084.0,
"mean_token_accuracy": 0.8892641182243824,
"epoch": 1.6432147900885865,
"step": 6400
},
{
"loss": 0.5353133010864258,
"grad_norm": 0.38671875,
"learning_rate": 9.784920854388168e-06,
"entropy": 0.527387068271637,
"num_tokens": 51229616.0,
"mean_token_accuracy": 0.8898960523307323,
"epoch": 1.6688920272178713,
"step": 6500
},
{
"eval_loss": 0.5475577116012573,
"eval_runtime": 66.8604,
"eval_samples_per_second": 202.122,
"eval_steps_per_second": 25.277,
"eval_entropy": 0.5349308549123403,
"eval_num_tokens": 51229616.0,
"eval_mean_token_accuracy": 0.8880981687610672,
"epoch": 1.6688920272178713,
"step": 6500
},
{
"loss": 0.5429517364501953,
"grad_norm": 0.380859375,
"learning_rate": 9.486386283902909e-06,
"entropy": 0.537000098451972,
"num_tokens": 52018364.0,
"mean_token_accuracy": 0.8879952408373356,
"epoch": 1.6945692643471562,
"step": 6600
},
{
"loss": 0.53443115234375,
"grad_norm": 0.447265625,
"learning_rate": 9.188310069399368e-06,
"entropy": 0.5280120493471623,
"num_tokens": 52807100.0,
"mean_token_accuracy": 0.8897963106632233,
"epoch": 1.7202465014764412,
"step": 6700
},
{
"loss": 0.5356635665893554,
"grad_norm": 0.40625,
"learning_rate": 8.890958218212716e-06,
"entropy": 0.5287372674047947,
"num_tokens": 53594818.0,
"mean_token_accuracy": 0.8895081703364849,
"epoch": 1.745923738605726,
"step": 6800
},
{
"loss": 0.5367589569091797,
"grad_norm": 0.369140625,
"learning_rate": 8.594596091246282e-06,
"entropy": 0.5299282168596983,
"num_tokens": 54382275.0,
"mean_token_accuracy": 0.8896372695267201,
"epoch": 1.7716009757350109,
"step": 6900
},
{
"loss": 0.5325223159790039,
"grad_norm": 0.3984375,
"learning_rate": 8.299488166159817e-06,
"entropy": 0.5243838762491941,
"num_tokens": 55171058.0,
"mean_token_accuracy": 0.890430568009615,
"epoch": 1.797278212864296,
"step": 7000
},
{
"eval_loss": 0.5471183061599731,
"eval_runtime": 67.121,
"eval_samples_per_second": 201.338,
"eval_steps_per_second": 25.178,
"eval_entropy": 0.5362202901106614,
"eval_num_tokens": 55171058.0,
"eval_mean_token_accuracy": 0.8881139952402849,
"epoch": 1.797278212864296,
"step": 7000
},
{
"loss": 0.5369550704956054,
"grad_norm": 0.41015625,
"learning_rate": 8.005897801345976e-06,
"entropy": 0.5303717163205147,
"num_tokens": 55960002.0,
"mean_token_accuracy": 0.8894575189054013,
"epoch": 1.8229554499935807,
"step": 7100
},
{
"loss": 0.5351543045043945,
"grad_norm": 0.3671875,
"learning_rate": 7.714087000905643e-06,
"entropy": 0.5274955333024264,
"num_tokens": 56748093.0,
"mean_token_accuracy": 0.8897032625973225,
"epoch": 1.8486326871228655,
"step": 7200
},
{
"loss": 0.5317428207397461,
"grad_norm": 0.396484375,
"learning_rate": 7.4243161808318465e-06,
"entropy": 0.5234256482124329,
"num_tokens": 57536251.0,
"mean_token_accuracy": 0.8904528062045575,
"epoch": 1.8743099242521506,
"step": 7300
},
{
"loss": 0.5373667526245117,
"grad_norm": 0.462890625,
"learning_rate": 7.136843936610935e-06,
"entropy": 0.5299951387941837,
"num_tokens": 58324650.0,
"mean_token_accuracy": 0.8894152472913265,
"epoch": 1.8999871613814352,
"step": 7400
},
{
"loss": 0.5356111907958985,
"grad_norm": 0.4375,
"learning_rate": 6.851926812448384e-06,
"entropy": 0.5281449986249208,
"num_tokens": 59112829.0,
"mean_token_accuracy": 0.8894895881414413,
"epoch": 1.9256643985107202,
"step": 7500
},
{
"eval_loss": 0.5469039082527161,
"eval_runtime": 67.2936,
"eval_samples_per_second": 200.821,
"eval_steps_per_second": 25.114,
"eval_entropy": 0.5319856126809261,
"eval_num_tokens": 59112829.0,
"eval_mean_token_accuracy": 0.8881882847060819,
"epoch": 1.9256643985107202,
"step": 7500
},
{
"loss": 0.5324957275390625,
"grad_norm": 0.373046875,
"learning_rate": 6.569819072325195e-06,
"entropy": 0.5259347888082266,
"num_tokens": 59899148.0,
"mean_token_accuracy": 0.8905223569273949,
"epoch": 1.9513416356400053,
"step": 7600
},
{
"loss": 0.5345915985107422,
"grad_norm": 0.396484375,
"learning_rate": 6.290772473089214e-06,
"entropy": 0.5266730510443449,
"num_tokens": 60687114.0,
"mean_token_accuracy": 0.8896890124678611,
"epoch": 1.97701887276929,
"step": 7700
},
{
"loss": 0.5284463882446289,
"grad_norm": 0.37109375,
"learning_rate": 6.015036039783836e-06,
"entropy": 0.5242100227717779,
"num_tokens": 61470852.0,
"mean_token_accuracy": 0.8908302205890867,
"epoch": 2.0025677237129287,
"step": 7800
},
{
"loss": 0.5349670028686524,
"grad_norm": 0.40234375,
"learning_rate": 5.74285584341461e-06,
"entropy": 0.5276301963627339,
"num_tokens": 62258538.0,
"mean_token_accuracy": 0.8893012750148773,
"epoch": 2.0282449608422133,
"step": 7900
},
{
"loss": 0.5321082305908204,
"grad_norm": 0.39453125,
"learning_rate": 5.474474781352066e-06,
"entropy": 0.5241756404936314,
"num_tokens": 63047491.0,
"mean_token_accuracy": 0.8900664694607258,
"epoch": 2.0539221979714983,
"step": 8000
},
{
"eval_loss": 0.5467016696929932,
"eval_runtime": 67.0874,
"eval_samples_per_second": 201.439,
"eval_steps_per_second": 25.191,
"eval_entropy": 0.5306698962957901,
"eval_num_tokens": 63047491.0,
"eval_mean_token_accuracy": 0.8882128992024259,
"epoch": 2.0539221979714983,
"step": 8000
},
{
"loss": 0.5294083023071289,
"grad_norm": 0.43359375,
"learning_rate": 5.210132360566756e-06,
"entropy": 0.5223005886375904,
"num_tokens": 63835062.0,
"mean_token_accuracy": 0.8909649957716465,
"epoch": 2.079599435100783,
"step": 8100
},
{
"loss": 0.5341888427734375,
"grad_norm": 0.396484375,
"learning_rate": 4.95006448388992e-06,
"entropy": 0.5271259383112192,
"num_tokens": 64622452.0,
"mean_token_accuracy": 0.8900617562234402,
"epoch": 2.105276672230068,
"step": 8200
},
{
"loss": 0.5306560897827148,
"grad_norm": 0.455078125,
"learning_rate": 4.69450323949053e-06,
"entropy": 0.5243064795434474,
"num_tokens": 65410929.0,
"mean_token_accuracy": 0.890057615339756,
"epoch": 2.130953909359353,
"step": 8300
},
{
"loss": 0.5324376678466797,
"grad_norm": 0.3984375,
"learning_rate": 4.443676693756599e-06,
"entropy": 0.5259603321552276,
"num_tokens": 66199581.0,
"mean_token_accuracy": 0.8899568720161914,
"epoch": 2.1566311464886376,
"step": 8400
},
{
"loss": 0.5324293899536133,
"grad_norm": 0.42578125,
"learning_rate": 4.197808687765592e-06,
"entropy": 0.5257175669819116,
"num_tokens": 66988012.0,
"mean_token_accuracy": 0.8905250385403634,
"epoch": 2.1823083836179227,
"step": 8500
},
{
"eval_loss": 0.5466301441192627,
"eval_runtime": 67.1045,
"eval_samples_per_second": 201.387,
"eval_steps_per_second": 25.185,
"eval_entropy": 0.5302212731901711,
"eval_num_tokens": 66988012.0,
"eval_mean_token_accuracy": 0.888240820310525,
"epoch": 2.1823083836179227,
"step": 8500
},
{
"loss": 0.5308111572265625,
"grad_norm": 0.423828125,
"learning_rate": 3.957118637525545e-06,
"entropy": 0.5240319129824639,
"num_tokens": 67776064.0,
"mean_token_accuracy": 0.8906722447276115,
"epoch": 2.2079856207472077,
"step": 8600
},
{
"loss": 0.5349502182006836,
"grad_norm": 0.384765625,
"learning_rate": 3.721821338165191e-06,
"entropy": 0.5290320947766304,
"num_tokens": 68564254.0,
"mean_token_accuracy": 0.8893519747257232,
"epoch": 2.2336628578764923,
"step": 8700
},
{
"loss": 0.5332579040527343,
"grad_norm": 0.40234375,
"learning_rate": 3.4921267722478115e-06,
"entropy": 0.5256545479595661,
"num_tokens": 69352920.0,
"mean_token_accuracy": 0.890228152424097,
"epoch": 2.2593400950057774,
"step": 8800
},
{
"loss": 0.5388346099853516,
"grad_norm": 0.41796875,
"learning_rate": 3.2682399223799045e-06,
"entropy": 0.5306229508668184,
"num_tokens": 70141047.0,
"mean_token_accuracy": 0.889285789579153,
"epoch": 2.2850173321350624,
"step": 8900
},
{
"loss": 0.5309392929077148,
"grad_norm": 0.478515625,
"learning_rate": 3.0503605882818623e-06,
"entropy": 0.5246253449469804,
"num_tokens": 70929498.0,
"mean_token_accuracy": 0.8901907150447369,
"epoch": 2.310694569264347,
"step": 9000
},
{
"eval_loss": 0.5465638637542725,
"eval_runtime": 67.4174,
"eval_samples_per_second": 200.453,
"eval_steps_per_second": 25.068,
"eval_entropy": 0.529443961893313,
"eval_num_tokens": 70929498.0,
"eval_mean_token_accuracy": 0.8882127779828021,
"epoch": 2.310694569264347,
"step": 9000
},
{
"loss": 0.5335653686523437,
"grad_norm": 0.40234375,
"learning_rate": 2.838683208483931e-06,
"entropy": 0.5271350515633821,
"num_tokens": 71718369.0,
"mean_token_accuracy": 0.8902666576206684,
"epoch": 2.336371806393632,
"step": 9100
},
{
"loss": 0.5284453964233399,
"grad_norm": 0.39453125,
"learning_rate": 2.633396686806604e-06,
"entropy": 0.5240849039703608,
"num_tokens": 72506753.0,
"mean_token_accuracy": 0.8903698475658893,
"epoch": 2.362049043522917,
"step": 9200
},
{
"loss": 0.5297767639160156,
"grad_norm": 0.44921875,
"learning_rate": 2.4346842237802137e-06,
"entropy": 0.523836979046464,
"num_tokens": 73294192.0,
"mean_token_accuracy": 0.8904416194558143,
"epoch": 2.3877262806522017,
"step": 9300
},
{
"loss": 0.5338259506225586,
"grad_norm": 0.419921875,
"learning_rate": 2.2427231531542605e-06,
"entropy": 0.5259654937684536,
"num_tokens": 74082610.0,
"mean_token_accuracy": 0.8898109787702561,
"epoch": 2.4134035177814868,
"step": 9400
},
{
"loss": 0.53539306640625,
"grad_norm": 0.376953125,
"learning_rate": 2.057684783642321e-06,
"entropy": 0.5273031425476075,
"num_tokens": 74870230.0,
"mean_token_accuracy": 0.8893570882081986,
"epoch": 2.439080754910772,
"step": 9500
},
{
"eval_loss": 0.5465569496154785,
"eval_runtime": 67.2693,
"eval_samples_per_second": 200.894,
"eval_steps_per_second": 25.123,
"eval_entropy": 0.5297543280576108,
"eval_num_tokens": 74870230.0,
"eval_mean_token_accuracy": 0.8882103748928161,
"epoch": 2.439080754910772,
"step": 9500
},
{
"loss": 0.5300022888183594,
"grad_norm": 0.42578125,
"learning_rate": 1.8797342460437773e-06,
"entropy": 0.5227815758436918,
"num_tokens": 75659042.0,
"mean_token_accuracy": 0.8905887195467949,
"epoch": 2.4647579920400564,
"step": 9600
},
{
"loss": 0.5275243759155274,
"grad_norm": 0.36328125,
"learning_rate": 1.7090303458788138e-06,
"entropy": 0.5209154675900937,
"num_tokens": 76448289.0,
"mean_token_accuracy": 0.891223351508379,
"epoch": 2.4904352291693415,
"step": 9700
},
{
"loss": 0.5308470916748047,
"grad_norm": 0.396484375,
"learning_rate": 1.5457254216681706e-06,
"entropy": 0.5242900583148002,
"num_tokens": 77237168.0,
"mean_token_accuracy": 0.8901365567743779,
"epoch": 2.516112466298626,
"step": 9800
},
{
"loss": 0.5291099929809571,
"grad_norm": 0.408203125,
"learning_rate": 1.3899652089841475e-06,
"entropy": 0.5219676418602467,
"num_tokens": 78025882.0,
"mean_token_accuracy": 0.8904913778603077,
"epoch": 2.541789703427911,
"step": 9900
},
{
"loss": 0.5335563278198242,
"grad_norm": 0.404296875,
"learning_rate": 1.2418887103941613e-06,
"entropy": 0.5248580784350634,
"num_tokens": 78811967.0,
"mean_token_accuracy": 0.8904596289992333,
"epoch": 2.567466940557196,
"step": 10000
},
{
"eval_loss": 0.5465272665023804,
"eval_runtime": 66.7472,
"eval_samples_per_second": 202.465,
"eval_steps_per_second": 25.319,
"eval_entropy": 0.5304744325269609,
"eval_num_tokens": 78811967.0,
"eval_mean_token_accuracy": 0.8882129736905973,
"epoch": 2.567466940557196,
"step": 10000
},
{
"loss": 0.5322509765625,
"grad_norm": 0.3984375,
"learning_rate": 1.10162807141293e-06,
"entropy": 0.5262704361230135,
"num_tokens": 79600687.0,
"mean_token_accuracy": 0.8897101627290249,
"epoch": 2.593144177686481,
"step": 10100
},
{
"loss": 0.5316716384887695,
"grad_norm": 0.375,
"learning_rate": 9.693084625739946e-07,
"entropy": 0.5247344778478146,
"num_tokens": 80387505.0,
"mean_token_accuracy": 0.8906851357221603,
"epoch": 2.618821414815766,
"step": 10200
},
{
"loss": 0.5358499526977539,
"grad_norm": 0.3671875,
"learning_rate": 8.450479677257962e-07,
"entropy": 0.528542704731226,
"num_tokens": 81176251.0,
"mean_token_accuracy": 0.8893057556450367,
"epoch": 2.644498651945051,
"step": 10300
},
{
"loss": 0.5294158172607422,
"grad_norm": 0.43359375,
"learning_rate": 7.289574786520237e-07,
"entropy": 0.522821741476655,
"num_tokens": 81964977.0,
"mean_token_accuracy": 0.8909594585001469,
"epoch": 2.6701758890743355,
"step": 10400
},
{
"loss": 0.5307606124877929,
"grad_norm": 0.3828125,
"learning_rate": 6.211405961102512e-07,
"entropy": 0.5265567531436681,
"num_tokens": 82752889.0,
"mean_token_accuracy": 0.8900834138691426,
"epoch": 2.6958531262036205,
"step": 10500
},
{
"eval_loss": 0.5465019941329956,
"eval_runtime": 67.3423,
"eval_samples_per_second": 200.676,
"eval_steps_per_second": 25.096,
"eval_entropy": 0.5300784435145248,
"eval_num_tokens": 82752889.0,
"eval_mean_token_accuracy": 0.8881937025919469,
"epoch": 2.6958531262036205,
"step": 10500
},
{
"loss": 0.5329842758178711,
"grad_norm": 0.376953125,
"learning_rate": 5.216935373771859e-07,
"entropy": 0.5243370252847671,
"num_tokens": 83540835.0,
"mean_token_accuracy": 0.8904808807373047,
"epoch": 2.7215303633329055,
"step": 10600
},
{
"loss": 0.5306868362426758,
"grad_norm": 0.419921875,
"learning_rate": 4.307050503830457e-07,
"entropy": 0.5237632666528225,
"num_tokens": 84328863.0,
"mean_token_accuracy": 0.890521085858345,
"epoch": 2.74720760046219,
"step": 10700
},
{
"loss": 0.5309492874145508,
"grad_norm": 0.37890625,
"learning_rate": 3.482563345116763e-07,
"entropy": 0.5249035055935383,
"num_tokens": 85116662.0,
"mean_token_accuracy": 0.8904018171131611,
"epoch": 2.772884837591475,
"step": 10800
},
{
"loss": 0.5319928741455078,
"grad_norm": 0.390625,
"learning_rate": 2.7442096813709684e-07,
"entropy": 0.524908444583416,
"num_tokens": 85905228.0,
"mean_token_accuracy": 0.8903980639576912,
"epoch": 2.79856207472076,
"step": 10900
},
{
"loss": 0.5291864395141601,
"grad_norm": 0.443359375,
"learning_rate": 2.0926484296114324e-07,
"entropy": 0.5247332839667798,
"num_tokens": 86693156.0,
"mean_token_accuracy": 0.8905435487627983,
"epoch": 2.824239311850045,
"step": 11000
},
{
"eval_loss": 0.5464890003204346,
"eval_runtime": 67.2701,
"eval_samples_per_second": 200.892,
"eval_steps_per_second": 25.123,
"eval_entropy": 0.5298788991729183,
"eval_num_tokens": 86693156.0,
"eval_mean_token_accuracy": 0.8882385518776594,
"epoch": 2.824239311850045,
"step": 11000
},
{
"loss": 0.5365339279174804,
"grad_norm": 0.40625,
"learning_rate": 1.5284610521080323e-07,
"entropy": 0.5279695607721806,
"num_tokens": 87481998.0,
"mean_token_accuracy": 0.8891479071974754,
"epoch": 2.84991654897933,
"step": 11100
},
{
"loss": 0.5289971160888672,
"grad_norm": 0.380859375,
"learning_rate": 1.0521510374771781e-07,
"entropy": 0.5218742392957211,
"num_tokens": 88271222.0,
"mean_token_accuracy": 0.8910181857645512,
"epoch": 2.875593786108615,
"step": 11200
},
{
"loss": 0.537615737915039,
"grad_norm": 0.43359375,
"learning_rate": 6.641434513616208e-08,
"entropy": 0.5303182192891837,
"num_tokens": 89059356.0,
"mean_token_accuracy": 0.88922975435853,
"epoch": 2.9012710232378995,
"step": 11300
},
{
"loss": 0.5294175338745117,
"grad_norm": 0.37109375,
"learning_rate": 3.6478455709598735e-08,
"entropy": 0.5221536308526993,
"num_tokens": 89847466.0,
"mean_token_accuracy": 0.8907907168567181,
"epoch": 2.9269482603671846,
"step": 11400
},
{
"loss": 0.5347037124633789,
"grad_norm": 0.419921875,
"learning_rate": 1.543415066966092e-08,
"entropy": 0.5283580356836319,
"num_tokens": 90636203.0,
"mean_token_accuracy": 0.8894244636595249,
"epoch": 2.952625497496469,
"step": 11500
},
{
"eval_loss": 0.5464821457862854,
"eval_runtime": 66.8918,
"eval_samples_per_second": 202.028,
"eval_steps_per_second": 25.265,
"eval_entropy": 0.5296846309180796,
"eval_num_tokens": 90636203.0,
"eval_mean_token_accuracy": 0.8882489815618865,
"epoch": 2.952625497496469,
"step": 11500
},
{
"loss": 0.5325915145874024,
"grad_norm": 0.43359375,
"learning_rate": 3.3002102451362704e-09,
"entropy": 0.528514067903161,
"num_tokens": 91424572.0,
"mean_token_accuracy": 0.8899151113629341,
"epoch": 2.9783027346257542,
"step": 11600
},
{
"train_runtime": 9303.979,
"train_samples_per_second": 40.182,
"train_steps_per_second": 1.256,
"total_flos": 9.506673059457577e+17,
"train_loss": 0.6031799179292613,
"entropy": 0.525621883262544,
"num_tokens": 92088141.0,
"mean_token_accuracy": 0.890005886554718,
"epoch": 3.0,
"step": 11685
}
]