Files
LLaMA2-7bTatoeba/trainer_state.json
ModelHub XC a1e548d67f 初始化项目,由ModelHub XC社区提供模型
Model: Lili85/LLaMA2-7bTatoeba
Source: Original Platform
2026-06-21 07:03:25 +08:00

1157 lines
33 KiB
JSON

{
"best_global_step": 4400,
"best_metric": 0.6191316843032837,
"best_model_checkpoint": "/mnt/scratch/users/sglli24/fine-tuning-project/fine_tuned_model/llama2-tatoeba-en-fr-20251120-101824/checkpoint-4400",
"epoch": 0.9777777777777777,
"eval_steps": 200,
"global_step": 4400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.8978288495540618,
"epoch": 0.011111111111111112,
"grad_norm": 4.291239261627197,
"learning_rate": 7.25925925925926e-05,
"loss": 1.6226,
"mean_token_accuracy": 0.6836586940288544,
"num_tokens": 18001.0,
"step": 50
},
{
"entropy": 0.6665060448646546,
"epoch": 0.022222222222222223,
"grad_norm": 1.3680720329284668,
"learning_rate": 0.00014666666666666666,
"loss": 0.6686,
"mean_token_accuracy": 0.8410311663150787,
"num_tokens": 35844.0,
"step": 100
},
{
"entropy": 0.6690424817800522,
"epoch": 0.03333333333333333,
"grad_norm": 0.8499931693077087,
"learning_rate": 0.00019999492362553862,
"loss": 0.6736,
"mean_token_accuracy": 0.8405047404766083,
"num_tokens": 53968.0,
"step": 150
},
{
"entropy": 0.6419891297817231,
"epoch": 0.044444444444444446,
"grad_norm": 0.5812472105026245,
"learning_rate": 0.0001998939319921494,
"loss": 0.6571,
"mean_token_accuracy": 0.8440787088871002,
"num_tokens": 71937.0,
"step": 200
},
{
"epoch": 0.044444444444444446,
"eval_entropy": 0.6093449130654335,
"eval_loss": 0.7027862071990967,
"eval_mean_token_accuracy": 0.8369034069776535,
"eval_num_tokens": 71937.0,
"eval_runtime": 88.0125,
"eval_samples_per_second": 45.448,
"eval_steps_per_second": 5.681,
"step": 200
},
{
"entropy": 0.6681106066703797,
"epoch": 0.05555555555555555,
"grad_norm": 0.41130882501602173,
"learning_rate": 0.00019966359123301493,
"loss": 0.6637,
"mean_token_accuracy": 0.8417772734165192,
"num_tokens": 90058.0,
"step": 250
},
{
"entropy": 0.6466343528032303,
"epoch": 0.06666666666666667,
"grad_norm": 0.5034320950508118,
"learning_rate": 0.00019930419960825186,
"loss": 0.6507,
"mean_token_accuracy": 0.8456220865249634,
"num_tokens": 107871.0,
"step": 300
},
{
"entropy": 0.659225270152092,
"epoch": 0.07777777777777778,
"grad_norm": 0.6105099320411682,
"learning_rate": 0.0001988162224813867,
"loss": 0.6507,
"mean_token_accuracy": 0.8445734548568725,
"num_tokens": 126050.0,
"step": 350
},
{
"entropy": 0.6419739973545074,
"epoch": 0.08888888888888889,
"grad_norm": 0.4679639935493469,
"learning_rate": 0.00019820029171677286,
"loss": 0.658,
"mean_token_accuracy": 0.8432002317905426,
"num_tokens": 143680.0,
"step": 400
},
{
"epoch": 0.08888888888888889,
"eval_entropy": 0.5937302582263947,
"eval_loss": 0.68473219871521,
"eval_mean_token_accuracy": 0.8390118762254715,
"eval_num_tokens": 143680.0,
"eval_runtime": 88.0097,
"eval_samples_per_second": 45.45,
"eval_steps_per_second": 5.681,
"step": 400
},
{
"entropy": 0.6531517493724823,
"epoch": 0.1,
"grad_norm": 0.7278866171836853,
"learning_rate": 0.00019745720486141172,
"loss": 0.6568,
"mean_token_accuracy": 0.8461188304424286,
"num_tokens": 161432.0,
"step": 450
},
{
"entropy": 0.6509544163942337,
"epoch": 0.1111111111111111,
"grad_norm": 0.5412130951881409,
"learning_rate": 0.00019658792411223736,
"loss": 0.6524,
"mean_token_accuracy": 0.8447290122509002,
"num_tokens": 179392.0,
"step": 500
},
{
"entropy": 0.6577291601896286,
"epoch": 0.12222222222222222,
"grad_norm": 0.48672693967819214,
"learning_rate": 0.00019559357507020162,
"loss": 0.6567,
"mean_token_accuracy": 0.8449541187286377,
"num_tokens": 197260.0,
"step": 550
},
{
"entropy": 0.6546426635980606,
"epoch": 0.13333333333333333,
"grad_norm": 0.355656236410141,
"learning_rate": 0.00019447544528277316,
"loss": 0.6505,
"mean_token_accuracy": 0.8445561909675598,
"num_tokens": 215226.0,
"step": 600
},
{
"epoch": 0.13333333333333333,
"eval_entropy": 0.5969589519500732,
"eval_loss": 0.6697619557380676,
"eval_mean_token_accuracy": 0.8419688076972961,
"eval_num_tokens": 215226.0,
"eval_runtime": 87.9751,
"eval_samples_per_second": 45.467,
"eval_steps_per_second": 5.683,
"step": 600
},
{
"entropy": 0.6280296057462692,
"epoch": 0.14444444444444443,
"grad_norm": 0.38800254464149475,
"learning_rate": 0.00019323498257673775,
"loss": 0.6329,
"mean_token_accuracy": 0.851637612581253,
"num_tokens": 232693.0,
"step": 650
},
{
"entropy": 0.6526319080591202,
"epoch": 0.15555555555555556,
"grad_norm": 0.3850780427455902,
"learning_rate": 0.00019187379318345846,
"loss": 0.6558,
"mean_token_accuracy": 0.8436833143234252,
"num_tokens": 250484.0,
"step": 700
},
{
"entropy": 0.6411016964912415,
"epoch": 0.16666666666666666,
"grad_norm": 0.4081222116947174,
"learning_rate": 0.00019039363965902336,
"loss": 0.6377,
"mean_token_accuracy": 0.8467725789546967,
"num_tokens": 268442.0,
"step": 750
},
{
"entropy": 0.6459184002876281,
"epoch": 0.17777777777777778,
"grad_norm": 0.40721970796585083,
"learning_rate": 0.0001887964386019739,
"loss": 0.6432,
"mean_token_accuracy": 0.8461637806892395,
"num_tokens": 286332.0,
"step": 800
},
{
"epoch": 0.17777777777777778,
"eval_entropy": 0.6085493609309196,
"eval_loss": 0.6541684865951538,
"eval_mean_token_accuracy": 0.8444654858112335,
"eval_num_tokens": 286332.0,
"eval_runtime": 88.0078,
"eval_samples_per_second": 45.451,
"eval_steps_per_second": 5.681,
"step": 800
},
{
"entropy": 0.652884566783905,
"epoch": 0.18888888888888888,
"grad_norm": 0.3846185505390167,
"learning_rate": 0.0001870842581715691,
"loss": 0.6537,
"mean_token_accuracy": 0.8446102023124695,
"num_tokens": 304193.0,
"step": 850
},
{
"entropy": 0.6620096349716187,
"epoch": 0.2,
"grad_norm": 0.48012644052505493,
"learning_rate": 0.0001852593154097991,
"loss": 0.6477,
"mean_token_accuracy": 0.8448492765426636,
"num_tokens": 322178.0,
"step": 900
},
{
"entropy": 0.6532311421632767,
"epoch": 0.2111111111111111,
"grad_norm": 0.3579741418361664,
"learning_rate": 0.00018332397337061585,
"loss": 0.6536,
"mean_token_accuracy": 0.8460512828826904,
"num_tokens": 340138.0,
"step": 950
},
{
"entropy": 0.647466834783554,
"epoch": 0.2222222222222222,
"grad_norm": 0.3946477472782135,
"learning_rate": 0.000181280738060098,
"loss": 0.6452,
"mean_token_accuracy": 0.8455521380901336,
"num_tokens": 358160.0,
"step": 1000
},
{
"epoch": 0.2222222222222222,
"eval_entropy": 0.6152187791466713,
"eval_loss": 0.6446735858917236,
"eval_mean_token_accuracy": 0.8457589077949524,
"eval_num_tokens": 358160.0,
"eval_runtime": 88.02,
"eval_samples_per_second": 45.444,
"eval_steps_per_second": 5.681,
"step": 1000
},
{
"entropy": 0.6481309163570405,
"epoch": 0.23333333333333334,
"grad_norm": 0.3936346769332886,
"learning_rate": 0.00017913225519151194,
"loss": 0.6366,
"mean_token_accuracy": 0.8483670651912689,
"num_tokens": 376065.0,
"step": 1050
},
{
"entropy": 0.6381743800640106,
"epoch": 0.24444444444444444,
"grad_norm": 0.41422468423843384,
"learning_rate": 0.00017688130675947122,
"loss": 0.6342,
"mean_token_accuracy": 0.8456036126613617,
"num_tokens": 394017.0,
"step": 1100
},
{
"entropy": 0.6401758706569671,
"epoch": 0.25555555555555554,
"grad_norm": 0.34131714701652527,
"learning_rate": 0.00017453080743763,
"loss": 0.633,
"mean_token_accuracy": 0.8499649393558503,
"num_tokens": 411946.0,
"step": 1150
},
{
"entropy": 0.6275939923524857,
"epoch": 0.26666666666666666,
"grad_norm": 0.35889938473701477,
"learning_rate": 0.00017208380080457485,
"loss": 0.6322,
"mean_token_accuracy": 0.8477345359325409,
"num_tokens": 429919.0,
"step": 1200
},
{
"epoch": 0.26666666666666666,
"eval_entropy": 0.6125104904770852,
"eval_loss": 0.6421033143997192,
"eval_mean_token_accuracy": 0.8464078787565231,
"eval_num_tokens": 429919.0,
"eval_runtime": 87.9726,
"eval_samples_per_second": 45.469,
"eval_steps_per_second": 5.684,
"step": 1200
},
{
"entropy": 0.6274101620912552,
"epoch": 0.2777777777777778,
"grad_norm": 0.43362855911254883,
"learning_rate": 0.0001695434554028025,
"loss": 0.6316,
"mean_token_accuracy": 0.8491797077655793,
"num_tokens": 447547.0,
"step": 1250
},
{
"entropy": 0.6455954253673554,
"epoch": 0.28888888888888886,
"grad_norm": 0.4339112937450409,
"learning_rate": 0.00016691306063588583,
"loss": 0.6414,
"mean_token_accuracy": 0.8443870341777802,
"num_tokens": 465741.0,
"step": 1300
},
{
"entropy": 0.6329428994655609,
"epoch": 0.3,
"grad_norm": 0.42899656295776367,
"learning_rate": 0.00016419602250914155,
"loss": 0.6252,
"mean_token_accuracy": 0.8489247059822083,
"num_tokens": 483587.0,
"step": 1350
},
{
"entropy": 0.6409079706668854,
"epoch": 0.3111111111111111,
"grad_norm": 0.4631573259830475,
"learning_rate": 0.00016139585921931394,
"loss": 0.6553,
"mean_token_accuracy": 0.8468127429485321,
"num_tokens": 501709.0,
"step": 1400
},
{
"epoch": 0.3111111111111111,
"eval_entropy": 0.621678286254406,
"eval_loss": 0.6395798325538635,
"eval_mean_token_accuracy": 0.8467997794151306,
"eval_num_tokens": 501709.0,
"eval_runtime": 87.9297,
"eval_samples_per_second": 45.491,
"eval_steps_per_second": 5.686,
"step": 1400
},
{
"entropy": 0.6271992588043213,
"epoch": 0.32222222222222224,
"grad_norm": 0.3840474486351013,
"learning_rate": 0.00015851619659898623,
"loss": 0.6295,
"mean_token_accuracy": 0.8492995512485504,
"num_tokens": 519472.0,
"step": 1450
},
{
"entropy": 0.6468123584985733,
"epoch": 0.3333333333333333,
"grad_norm": 0.38773396611213684,
"learning_rate": 0.00015556076342161795,
"loss": 0.6468,
"mean_token_accuracy": 0.8433372890949249,
"num_tokens": 537436.0,
"step": 1500
},
{
"entropy": 0.6341830235719681,
"epoch": 0.34444444444444444,
"grad_norm": 0.37739357352256775,
"learning_rate": 0.00015253338657328784,
"loss": 0.6282,
"mean_token_accuracy": 0.8481453454494476,
"num_tokens": 555101.0,
"step": 1550
},
{
"entropy": 0.6232440257072449,
"epoch": 0.35555555555555557,
"grad_norm": 0.4216736853122711,
"learning_rate": 0.00014943798609739418,
"loss": 0.6077,
"mean_token_accuracy": 0.853328298330307,
"num_tokens": 572708.0,
"step": 1600
},
{
"epoch": 0.35555555555555557,
"eval_entropy": 0.5843898313045501,
"eval_loss": 0.6388325095176697,
"eval_mean_token_accuracy": 0.8475761367082596,
"eval_num_tokens": 572708.0,
"eval_runtime": 88.0014,
"eval_samples_per_second": 45.454,
"eval_steps_per_second": 5.682,
"step": 1600
},
{
"entropy": 0.629493715763092,
"epoch": 0.36666666666666664,
"grad_norm": 0.4576176106929779,
"learning_rate": 0.00014627857011872893,
"loss": 0.6369,
"mean_token_accuracy": 0.8483493518829346,
"num_tokens": 590473.0,
"step": 1650
},
{
"entropy": 0.6264066398143768,
"epoch": 0.37777777777777777,
"grad_norm": 0.3063693940639496,
"learning_rate": 0.00014305922965349857,
"loss": 0.6223,
"mean_token_accuracy": 0.8495516037940979,
"num_tokens": 608122.0,
"step": 1700
},
{
"entropy": 0.6332901197671891,
"epoch": 0.3888888888888889,
"grad_norm": 0.45330291986465454,
"learning_rate": 0.00013978413331201158,
"loss": 0.6393,
"mean_token_accuracy": 0.8468031466007233,
"num_tokens": 626153.0,
"step": 1750
},
{
"entropy": 0.6384645104408264,
"epoch": 0.4,
"grad_norm": 0.4152071475982666,
"learning_rate": 0.00013645752190089206,
"loss": 0.6325,
"mean_token_accuracy": 0.8465726387500763,
"num_tokens": 644466.0,
"step": 1800
},
{
"epoch": 0.4,
"eval_entropy": 0.6132586502432823,
"eval_loss": 0.6346827149391174,
"eval_mean_token_accuracy": 0.8480832484960557,
"eval_num_tokens": 644466.0,
"eval_runtime": 87.9901,
"eval_samples_per_second": 45.46,
"eval_steps_per_second": 5.682,
"step": 1800
},
{
"entropy": 0.631557651758194,
"epoch": 0.4111111111111111,
"grad_norm": 0.2891533076763153,
"learning_rate": 0.00013308370293180902,
"loss": 0.6255,
"mean_token_accuracy": 0.8511977612972259,
"num_tokens": 662209.0,
"step": 1850
},
{
"entropy": 0.6207279634475708,
"epoch": 0.4222222222222222,
"grad_norm": 0.42430344223976135,
"learning_rate": 0.00012966704504383168,
"loss": 0.6205,
"mean_token_accuracy": 0.8519205749034882,
"num_tokens": 679790.0,
"step": 1900
},
{
"entropy": 0.6246707677841187,
"epoch": 0.43333333333333335,
"grad_norm": 0.29151901602745056,
"learning_rate": 0.00012621197234663283,
"loss": 0.6247,
"mean_token_accuracy": 0.8536795401573181,
"num_tokens": 697693.0,
"step": 1950
},
{
"entropy": 0.6282322096824646,
"epoch": 0.4444444444444444,
"grad_norm": 0.40175795555114746,
"learning_rate": 0.0001227229586918655,
"loss": 0.6315,
"mean_token_accuracy": 0.8500750088691711,
"num_tokens": 715480.0,
"step": 2000
},
{
"epoch": 0.4444444444444444,
"eval_entropy": 0.614536872446537,
"eval_loss": 0.6332096457481384,
"eval_mean_token_accuracy": 0.8484884305000305,
"eval_num_tokens": 715480.0,
"eval_runtime": 87.9964,
"eval_samples_per_second": 45.456,
"eval_steps_per_second": 5.682,
"step": 2000
},
{
"entropy": 0.6323211324214936,
"epoch": 0.45555555555555555,
"grad_norm": 0.3964557945728302,
"learning_rate": 0.00011920452188013029,
"loss": 0.6327,
"mean_token_accuracy": 0.8506696331501007,
"num_tokens": 733381.0,
"step": 2050
},
{
"entropy": 0.6420121788978577,
"epoch": 0.4666666666666667,
"grad_norm": 0.270047664642334,
"learning_rate": 0.0001156612178110351,
"loss": 0.6356,
"mean_token_accuracy": 0.8502523565292358,
"num_tokens": 751399.0,
"step": 2100
},
{
"entropy": 0.6098494738340378,
"epoch": 0.4777777777777778,
"grad_norm": 0.30119839310646057,
"learning_rate": 0.00011209763458392135,
"loss": 0.6028,
"mean_token_accuracy": 0.8526248228549957,
"num_tokens": 769220.0,
"step": 2150
},
{
"entropy": 0.6257252705097198,
"epoch": 0.4888888888888889,
"grad_norm": 0.41217416524887085,
"learning_rate": 0.00010851838655689625,
"loss": 0.6278,
"mean_token_accuracy": 0.8503323125839234,
"num_tokens": 787060.0,
"step": 2200
},
{
"epoch": 0.4888888888888889,
"eval_entropy": 0.6182988230586052,
"eval_loss": 0.629729151725769,
"eval_mean_token_accuracy": 0.8488966919183731,
"eval_num_tokens": 787060.0,
"eval_runtime": 88.02,
"eval_samples_per_second": 45.444,
"eval_steps_per_second": 5.681,
"step": 2200
},
{
"entropy": 0.629525854587555,
"epoch": 0.5,
"grad_norm": 0.33919742703437805,
"learning_rate": 0.00010492810837186333,
"loss": 0.6288,
"mean_token_accuracy": 0.8490475380420685,
"num_tokens": 804988.0,
"step": 2250
},
{
"entropy": 0.6336910331249237,
"epoch": 0.5111111111111111,
"grad_norm": 0.42546504735946655,
"learning_rate": 0.00010133144895328832,
"loss": 0.6302,
"mean_token_accuracy": 0.848661150932312,
"num_tokens": 823030.0,
"step": 2300
},
{
"entropy": 0.6298299109935761,
"epoch": 0.5222222222222223,
"grad_norm": 0.4121692180633545,
"learning_rate": 9.7733065488471e-05,
"loss": 0.6261,
"mean_token_accuracy": 0.8524516999721528,
"num_tokens": 840873.0,
"step": 2350
},
{
"entropy": 0.6273639261722564,
"epoch": 0.5333333333333333,
"grad_norm": 0.33091071248054504,
"learning_rate": 9.413761739711771e-05,
"loss": 0.6279,
"mean_token_accuracy": 0.8502094805240631,
"num_tokens": 858631.0,
"step": 2400
},
{
"epoch": 0.5333333333333333,
"eval_entropy": 0.6137934300303459,
"eval_loss": 0.6281214356422424,
"eval_mean_token_accuracy": 0.849094120979309,
"eval_num_tokens": 858631.0,
"eval_runtime": 88.0567,
"eval_samples_per_second": 45.425,
"eval_steps_per_second": 5.678,
"step": 2400
},
{
"entropy": 0.6171291017532349,
"epoch": 0.5444444444444444,
"grad_norm": 0.41635480523109436,
"learning_rate": 9.054976029802337e-05,
"loss": 0.6157,
"mean_token_accuracy": 0.8495936059951782,
"num_tokens": 876423.0,
"step": 2450
},
{
"entropy": 0.6276765954494476,
"epoch": 0.5555555555555556,
"grad_norm": 0.43940269947052,
"learning_rate": 8.6974139980675e-05,
"loss": 0.6233,
"mean_token_accuracy": 0.8525845003128052,
"num_tokens": 894210.0,
"step": 2500
},
{
"entropy": 0.6247690558433533,
"epoch": 0.5666666666666667,
"grad_norm": 0.31268852949142456,
"learning_rate": 8.341538638958291e-05,
"loss": 0.6255,
"mean_token_accuracy": 0.8500683605670929,
"num_tokens": 912075.0,
"step": 2550
},
{
"entropy": 0.6128390139341354,
"epoch": 0.5777777777777777,
"grad_norm": 0.3916880190372467,
"learning_rate": 7.987810762912924e-05,
"loss": 0.615,
"mean_token_accuracy": 0.8547429955005645,
"num_tokens": 929888.0,
"step": 2600
},
{
"epoch": 0.5777777777777777,
"eval_entropy": 0.612749766767025,
"eval_loss": 0.6256938576698303,
"eval_mean_token_accuracy": 0.8497577202320099,
"eval_num_tokens": 929888.0,
"eval_runtime": 87.9613,
"eval_samples_per_second": 45.475,
"eval_steps_per_second": 5.684,
"step": 2600
},
{
"entropy": 0.6389735889434814,
"epoch": 0.5888888888888889,
"grad_norm": 0.38676777482032776,
"learning_rate": 7.636688399669589e-05,
"loss": 0.6435,
"mean_token_accuracy": 0.84725133061409,
"num_tokens": 947902.0,
"step": 2650
},
{
"entropy": 0.6316736024618149,
"epoch": 0.6,
"grad_norm": 0.3665507733821869,
"learning_rate": 7.288626205179951e-05,
"loss": 0.6242,
"mean_token_accuracy": 0.8496048271656036,
"num_tokens": 965905.0,
"step": 2700
},
{
"entropy": 0.6109506344795227,
"epoch": 0.6111111111111112,
"grad_norm": 0.4900154769420624,
"learning_rate": 6.944074872891199e-05,
"loss": 0.6063,
"mean_token_accuracy": 0.852264449596405,
"num_tokens": 983884.0,
"step": 2750
},
{
"entropy": 0.6153418481349945,
"epoch": 0.6222222222222222,
"grad_norm": 0.47226202487945557,
"learning_rate": 6.603480550158995e-05,
"loss": 0.62,
"mean_token_accuracy": 0.8504341340065003,
"num_tokens": 1001717.0,
"step": 2800
},
{
"epoch": 0.6222222222222222,
"eval_entropy": 0.619692858338356,
"eval_loss": 0.6247742772102356,
"eval_mean_token_accuracy": 0.8500253454446792,
"eval_num_tokens": 1001717.0,
"eval_runtime": 87.9899,
"eval_samples_per_second": 45.46,
"eval_steps_per_second": 5.682,
"step": 2800
},
{
"entropy": 0.6241399937868118,
"epoch": 0.6333333333333333,
"grad_norm": 0.4187294840812683,
"learning_rate": 6.267284260547049e-05,
"loss": 0.6156,
"mean_token_accuracy": 0.8507660686969757,
"num_tokens": 1019451.0,
"step": 2850
},
{
"entropy": 0.6226123148202896,
"epoch": 0.6444444444444445,
"grad_norm": 0.33195361495018005,
"learning_rate": 5.9359213327612416e-05,
"loss": 0.6129,
"mean_token_accuracy": 0.8508730328083038,
"num_tokens": 1037299.0,
"step": 2900
},
{
"entropy": 0.6067140877246857,
"epoch": 0.6555555555555556,
"grad_norm": 0.4044334292411804,
"learning_rate": 5.609820836957871e-05,
"loss": 0.5978,
"mean_token_accuracy": 0.8547445130348206,
"num_tokens": 1054997.0,
"step": 2950
},
{
"entropy": 0.6072817480564118,
"epoch": 0.6666666666666666,
"grad_norm": 0.3837581276893616,
"learning_rate": 5.28940502915587e-05,
"loss": 0.6195,
"mean_token_accuracy": 0.8500410854816437,
"num_tokens": 1072981.0,
"step": 3000
},
{
"epoch": 0.6666666666666666,
"eval_entropy": 0.6186771767735482,
"eval_loss": 0.6234644055366516,
"eval_mean_token_accuracy": 0.8501761881113052,
"eval_num_tokens": 1072981.0,
"eval_runtime": 88.0305,
"eval_samples_per_second": 45.439,
"eval_steps_per_second": 5.68,
"step": 3000
},
{
"entropy": 0.6253303802013397,
"epoch": 0.6777777777777778,
"grad_norm": 0.4809369742870331,
"learning_rate": 4.975088804472356e-05,
"loss": 0.6199,
"mean_token_accuracy": 0.850438643693924,
"num_tokens": 1090627.0,
"step": 3050
},
{
"entropy": 0.6156042230129242,
"epoch": 0.6888888888888889,
"grad_norm": 0.35011181235313416,
"learning_rate": 4.667279159889624e-05,
"loss": 0.6211,
"mean_token_accuracy": 0.8514653861522674,
"num_tokens": 1108362.0,
"step": 3100
},
{
"entropy": 0.6167298531532288,
"epoch": 0.7,
"grad_norm": 0.41358232498168945,
"learning_rate": 4.366374667249118e-05,
"loss": 0.6069,
"mean_token_accuracy": 0.8523639941215515,
"num_tokens": 1126044.0,
"step": 3150
},
{
"entropy": 0.6212670838832856,
"epoch": 0.7111111111111111,
"grad_norm": 0.44910740852355957,
"learning_rate": 4.0727649571548146e-05,
"loss": 0.6272,
"mean_token_accuracy": 0.8506516909599304,
"num_tokens": 1143877.0,
"step": 3200
},
{
"epoch": 0.7111111111111111,
"eval_entropy": 0.6189550485610962,
"eval_loss": 0.622456431388855,
"eval_mean_token_accuracy": 0.8506602959632874,
"eval_num_tokens": 1143877.0,
"eval_runtime": 87.9986,
"eval_samples_per_second": 45.455,
"eval_steps_per_second": 5.682,
"step": 3200
},
{
"entropy": 0.6167463368177414,
"epoch": 0.7222222222222222,
"grad_norm": 0.4499205946922302,
"learning_rate": 3.786830214454315e-05,
"loss": 0.6159,
"mean_token_accuracy": 0.8481730723381042,
"num_tokens": 1161796.0,
"step": 3250
},
{
"entropy": 0.6131170511245727,
"epoch": 0.7333333333333333,
"grad_norm": 0.38012415170669556,
"learning_rate": 3.5089406859509166e-05,
"loss": 0.6219,
"mean_token_accuracy": 0.8537890160083771,
"num_tokens": 1179566.0,
"step": 3300
},
{
"entropy": 0.6255488413572311,
"epoch": 0.7444444444444445,
"grad_norm": 0.3967890739440918,
"learning_rate": 3.2394562009840835e-05,
"loss": 0.6268,
"mean_token_accuracy": 0.8487379801273346,
"num_tokens": 1197824.0,
"step": 3350
},
{
"entropy": 0.6168732368946075,
"epoch": 0.7555555555555555,
"grad_norm": 0.38855499029159546,
"learning_rate": 2.9787257054991592e-05,
"loss": 0.6036,
"mean_token_accuracy": 0.8553757643699647,
"num_tokens": 1215503.0,
"step": 3400
},
{
"epoch": 0.7555555555555555,
"eval_entropy": 0.6248494290113449,
"eval_loss": 0.6211217045783997,
"eval_mean_token_accuracy": 0.8511025402545929,
"eval_num_tokens": 1215503.0,
"eval_runtime": 87.9806,
"eval_samples_per_second": 45.465,
"eval_steps_per_second": 5.683,
"step": 3400
},
{
"entropy": 0.6244831168651581,
"epoch": 0.7666666666666667,
"grad_norm": 0.40160423517227173,
"learning_rate": 2.727086810209559e-05,
"loss": 0.6144,
"mean_token_accuracy": 0.8539444077014923,
"num_tokens": 1233233.0,
"step": 3450
},
{
"entropy": 0.6258886575698852,
"epoch": 0.7777777777777778,
"grad_norm": 0.4197278618812561,
"learning_rate": 2.4848653534365886e-05,
"loss": 0.6157,
"mean_token_accuracy": 0.8522231721878052,
"num_tokens": 1251135.0,
"step": 3500
},
{
"entropy": 0.6242718535661698,
"epoch": 0.7888888888888889,
"grad_norm": 0.31663864850997925,
"learning_rate": 2.2523749791929127e-05,
"loss": 0.6236,
"mean_token_accuracy": 0.8481460773944854,
"num_tokens": 1269437.0,
"step": 3550
},
{
"entropy": 0.6332012844085694,
"epoch": 0.8,
"grad_norm": 0.4183398485183716,
"learning_rate": 2.029916731055981e-05,
"loss": 0.6331,
"mean_token_accuracy": 0.8506816875934601,
"num_tokens": 1287466.0,
"step": 3600
},
{
"epoch": 0.8,
"eval_entropy": 0.6211995969414711,
"eval_loss": 0.6201685667037964,
"eval_mean_token_accuracy": 0.8512796934843063,
"eval_num_tokens": 1287466.0,
"eval_runtime": 87.995,
"eval_samples_per_second": 45.457,
"eval_steps_per_second": 5.682,
"step": 3600
},
{
"entropy": 0.6207393455505371,
"epoch": 0.8111111111111111,
"grad_norm": 0.3644893169403076,
"learning_rate": 1.8177786623573322e-05,
"loss": 0.6051,
"mean_token_accuracy": 0.8554283630847931,
"num_tokens": 1305114.0,
"step": 3650
},
{
"entropy": 0.6093165588378906,
"epoch": 0.8222222222222222,
"grad_norm": 0.4035656154155731,
"learning_rate": 1.6162354631925204e-05,
"loss": 0.5942,
"mean_token_accuracy": 0.8568060100078583,
"num_tokens": 1322570.0,
"step": 3700
},
{
"entropy": 0.610592405796051,
"epoch": 0.8333333333333334,
"grad_norm": 0.4430359899997711,
"learning_rate": 1.425548104734583e-05,
"loss": 0.6228,
"mean_token_accuracy": 0.853781110048294,
"num_tokens": 1340182.0,
"step": 3750
},
{
"entropy": 0.624454995393753,
"epoch": 0.8444444444444444,
"grad_norm": 0.47560906410217285,
"learning_rate": 1.2459635013117043e-05,
"loss": 0.6285,
"mean_token_accuracy": 0.8506780207157135,
"num_tokens": 1357930.0,
"step": 3800
},
{
"epoch": 0.8444444444444444,
"eval_entropy": 0.6203543889522553,
"eval_loss": 0.6197088360786438,
"eval_mean_token_accuracy": 0.851304793715477,
"eval_num_tokens": 1357930.0,
"eval_runtime": 88.0719,
"eval_samples_per_second": 45.417,
"eval_steps_per_second": 5.677,
"step": 3800
},
{
"entropy": 0.6153292739391327,
"epoch": 0.8555555555555555,
"grad_norm": 0.6369743347167969,
"learning_rate": 1.0777141906865584e-05,
"loss": 0.6206,
"mean_token_accuracy": 0.8497644782066345,
"num_tokens": 1375720.0,
"step": 3850
},
{
"entropy": 0.6167460489273071,
"epoch": 0.8666666666666667,
"grad_norm": 0.31762850284576416,
"learning_rate": 9.210180329513674e-06,
"loss": 0.6164,
"mean_token_accuracy": 0.8522521209716797,
"num_tokens": 1393343.0,
"step": 3900
},
{
"entropy": 0.6256991571187973,
"epoch": 0.8777777777777778,
"grad_norm": 0.44901618361473083,
"learning_rate": 7.760779284285724e-06,
"loss": 0.6304,
"mean_token_accuracy": 0.8484922182559967,
"num_tokens": 1411415.0,
"step": 3950
},
{
"entropy": 0.6351810383796692,
"epoch": 0.8888888888888888,
"grad_norm": 0.3676467537879944,
"learning_rate": 6.430815549423541e-06,
"loss": 0.6343,
"mean_token_accuracy": 0.8446752560138703,
"num_tokens": 1429355.0,
"step": 4000
},
{
"epoch": 0.8888888888888888,
"eval_entropy": 0.6227085783481597,
"eval_loss": 0.6193701028823853,
"eval_mean_token_accuracy": 0.8513179312944412,
"eval_num_tokens": 1429355.0,
"eval_runtime": 87.8963,
"eval_samples_per_second": 45.508,
"eval_steps_per_second": 5.689,
"step": 4000
},
{
"entropy": 0.6218395137786865,
"epoch": 0.9,
"grad_norm": 0.352983683347702,
"learning_rate": 5.222011248012537e-06,
"loss": 0.6208,
"mean_token_accuracy": 0.8524598634243011,
"num_tokens": 1447378.0,
"step": 4050
},
{
"entropy": 0.6192989981174469,
"epoch": 0.9111111111111111,
"grad_norm": 0.5437832474708557,
"learning_rate": 4.1359316180653806e-06,
"loss": 0.6071,
"mean_token_accuracy": 0.8508526837825775,
"num_tokens": 1465326.0,
"step": 4100
},
{
"entropy": 0.6204692393541336,
"epoch": 0.9222222222222223,
"grad_norm": 0.29962158203125,
"learning_rate": 3.1739829857504234e-06,
"loss": 0.6173,
"mean_token_accuracy": 0.8498383402824402,
"num_tokens": 1483262.0,
"step": 4150
},
{
"entropy": 0.6081379109621048,
"epoch": 0.9333333333333333,
"grad_norm": 0.39637628197669983,
"learning_rate": 2.3374109443897065e-06,
"loss": 0.6151,
"mean_token_accuracy": 0.8525230586528778,
"num_tokens": 1500855.0,
"step": 4200
},
{
"epoch": 0.9333333333333333,
"eval_entropy": 0.6221803342103958,
"eval_loss": 0.6191594004631042,
"eval_mean_token_accuracy": 0.8513582646846771,
"eval_num_tokens": 1500855.0,
"eval_runtime": 87.871,
"eval_samples_per_second": 45.521,
"eval_steps_per_second": 5.69,
"step": 4200
},
{
"entropy": 0.6116468846797943,
"epoch": 0.9444444444444444,
"grad_norm": 0.3078557252883911,
"learning_rate": 1.6272987415841267e-06,
"loss": 0.598,
"mean_token_accuracy": 0.8564710664749146,
"num_tokens": 1518914.0,
"step": 4250
},
{
"entropy": 0.6252698361873626,
"epoch": 0.9555555555555556,
"grad_norm": 0.38908475637435913,
"learning_rate": 1.0445658765543153e-06,
"loss": 0.6194,
"mean_token_accuracy": 0.8515545094013214,
"num_tokens": 1536933.0,
"step": 4300
},
{
"entropy": 0.6222954159975052,
"epoch": 0.9666666666666667,
"grad_norm": 0.4802163541316986,
"learning_rate": 5.899669095136174e-07,
"loss": 0.6291,
"mean_token_accuracy": 0.8507583463191986,
"num_tokens": 1555013.0,
"step": 4350
},
{
"entropy": 0.6046170508861541,
"epoch": 0.9777777777777777,
"grad_norm": 0.3905353546142578,
"learning_rate": 2.640904846146652e-07,
"loss": 0.6059,
"mean_token_accuracy": 0.8536884272098542,
"num_tokens": 1572842.0,
"step": 4400
},
{
"epoch": 0.9777777777777777,
"eval_entropy": 0.6219840022921562,
"eval_loss": 0.6191316843032837,
"eval_mean_token_accuracy": 0.8512439979314804,
"eval_num_tokens": 1572842.0,
"eval_runtime": 87.8917,
"eval_samples_per_second": 45.511,
"eval_steps_per_second": 5.689,
"step": 4400
}
],
"logging_steps": 50,
"max_steps": 4500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.357263504886989e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}