1157 lines
33 KiB
JSON
1157 lines
33 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": 4400,
|
||
|
|
"best_metric": 0.6191316843032837,
|
||
|
|
"best_model_checkpoint": "/mnt/scratch/users/sglli24/fine-tuning-project/fine_tuned_model/llama2-tatoeba-en-fr-20251120-101824/checkpoint-4400",
|
||
|
|
"epoch": 0.9777777777777777,
|
||
|
|
"eval_steps": 200,
|
||
|
|
"global_step": 4400,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"entropy": 1.8978288495540618,
|
||
|
|
"epoch": 0.011111111111111112,
|
||
|
|
"grad_norm": 4.291239261627197,
|
||
|
|
"learning_rate": 7.25925925925926e-05,
|
||
|
|
"loss": 1.6226,
|
||
|
|
"mean_token_accuracy": 0.6836586940288544,
|
||
|
|
"num_tokens": 18001.0,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6665060448646546,
|
||
|
|
"epoch": 0.022222222222222223,
|
||
|
|
"grad_norm": 1.3680720329284668,
|
||
|
|
"learning_rate": 0.00014666666666666666,
|
||
|
|
"loss": 0.6686,
|
||
|
|
"mean_token_accuracy": 0.8410311663150787,
|
||
|
|
"num_tokens": 35844.0,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6690424817800522,
|
||
|
|
"epoch": 0.03333333333333333,
|
||
|
|
"grad_norm": 0.8499931693077087,
|
||
|
|
"learning_rate": 0.00019999492362553862,
|
||
|
|
"loss": 0.6736,
|
||
|
|
"mean_token_accuracy": 0.8405047404766083,
|
||
|
|
"num_tokens": 53968.0,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6419891297817231,
|
||
|
|
"epoch": 0.044444444444444446,
|
||
|
|
"grad_norm": 0.5812472105026245,
|
||
|
|
"learning_rate": 0.0001998939319921494,
|
||
|
|
"loss": 0.6571,
|
||
|
|
"mean_token_accuracy": 0.8440787088871002,
|
||
|
|
"num_tokens": 71937.0,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.044444444444444446,
|
||
|
|
"eval_entropy": 0.6093449130654335,
|
||
|
|
"eval_loss": 0.7027862071990967,
|
||
|
|
"eval_mean_token_accuracy": 0.8369034069776535,
|
||
|
|
"eval_num_tokens": 71937.0,
|
||
|
|
"eval_runtime": 88.0125,
|
||
|
|
"eval_samples_per_second": 45.448,
|
||
|
|
"eval_steps_per_second": 5.681,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6681106066703797,
|
||
|
|
"epoch": 0.05555555555555555,
|
||
|
|
"grad_norm": 0.41130882501602173,
|
||
|
|
"learning_rate": 0.00019966359123301493,
|
||
|
|
"loss": 0.6637,
|
||
|
|
"mean_token_accuracy": 0.8417772734165192,
|
||
|
|
"num_tokens": 90058.0,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6466343528032303,
|
||
|
|
"epoch": 0.06666666666666667,
|
||
|
|
"grad_norm": 0.5034320950508118,
|
||
|
|
"learning_rate": 0.00019930419960825186,
|
||
|
|
"loss": 0.6507,
|
||
|
|
"mean_token_accuracy": 0.8456220865249634,
|
||
|
|
"num_tokens": 107871.0,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.659225270152092,
|
||
|
|
"epoch": 0.07777777777777778,
|
||
|
|
"grad_norm": 0.6105099320411682,
|
||
|
|
"learning_rate": 0.0001988162224813867,
|
||
|
|
"loss": 0.6507,
|
||
|
|
"mean_token_accuracy": 0.8445734548568725,
|
||
|
|
"num_tokens": 126050.0,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6419739973545074,
|
||
|
|
"epoch": 0.08888888888888889,
|
||
|
|
"grad_norm": 0.4679639935493469,
|
||
|
|
"learning_rate": 0.00019820029171677286,
|
||
|
|
"loss": 0.658,
|
||
|
|
"mean_token_accuracy": 0.8432002317905426,
|
||
|
|
"num_tokens": 143680.0,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08888888888888889,
|
||
|
|
"eval_entropy": 0.5937302582263947,
|
||
|
|
"eval_loss": 0.68473219871521,
|
||
|
|
"eval_mean_token_accuracy": 0.8390118762254715,
|
||
|
|
"eval_num_tokens": 143680.0,
|
||
|
|
"eval_runtime": 88.0097,
|
||
|
|
"eval_samples_per_second": 45.45,
|
||
|
|
"eval_steps_per_second": 5.681,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6531517493724823,
|
||
|
|
"epoch": 0.1,
|
||
|
|
"grad_norm": 0.7278866171836853,
|
||
|
|
"learning_rate": 0.00019745720486141172,
|
||
|
|
"loss": 0.6568,
|
||
|
|
"mean_token_accuracy": 0.8461188304424286,
|
||
|
|
"num_tokens": 161432.0,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6509544163942337,
|
||
|
|
"epoch": 0.1111111111111111,
|
||
|
|
"grad_norm": 0.5412130951881409,
|
||
|
|
"learning_rate": 0.00019658792411223736,
|
||
|
|
"loss": 0.6524,
|
||
|
|
"mean_token_accuracy": 0.8447290122509002,
|
||
|
|
"num_tokens": 179392.0,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6577291601896286,
|
||
|
|
"epoch": 0.12222222222222222,
|
||
|
|
"grad_norm": 0.48672693967819214,
|
||
|
|
"learning_rate": 0.00019559357507020162,
|
||
|
|
"loss": 0.6567,
|
||
|
|
"mean_token_accuracy": 0.8449541187286377,
|
||
|
|
"num_tokens": 197260.0,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6546426635980606,
|
||
|
|
"epoch": 0.13333333333333333,
|
||
|
|
"grad_norm": 0.355656236410141,
|
||
|
|
"learning_rate": 0.00019447544528277316,
|
||
|
|
"loss": 0.6505,
|
||
|
|
"mean_token_accuracy": 0.8445561909675598,
|
||
|
|
"num_tokens": 215226.0,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13333333333333333,
|
||
|
|
"eval_entropy": 0.5969589519500732,
|
||
|
|
"eval_loss": 0.6697619557380676,
|
||
|
|
"eval_mean_token_accuracy": 0.8419688076972961,
|
||
|
|
"eval_num_tokens": 215226.0,
|
||
|
|
"eval_runtime": 87.9751,
|
||
|
|
"eval_samples_per_second": 45.467,
|
||
|
|
"eval_steps_per_second": 5.683,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6280296057462692,
|
||
|
|
"epoch": 0.14444444444444443,
|
||
|
|
"grad_norm": 0.38800254464149475,
|
||
|
|
"learning_rate": 0.00019323498257673775,
|
||
|
|
"loss": 0.6329,
|
||
|
|
"mean_token_accuracy": 0.851637612581253,
|
||
|
|
"num_tokens": 232693.0,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6526319080591202,
|
||
|
|
"epoch": 0.15555555555555556,
|
||
|
|
"grad_norm": 0.3850780427455902,
|
||
|
|
"learning_rate": 0.00019187379318345846,
|
||
|
|
"loss": 0.6558,
|
||
|
|
"mean_token_accuracy": 0.8436833143234252,
|
||
|
|
"num_tokens": 250484.0,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6411016964912415,
|
||
|
|
"epoch": 0.16666666666666666,
|
||
|
|
"grad_norm": 0.4081222116947174,
|
||
|
|
"learning_rate": 0.00019039363965902336,
|
||
|
|
"loss": 0.6377,
|
||
|
|
"mean_token_accuracy": 0.8467725789546967,
|
||
|
|
"num_tokens": 268442.0,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6459184002876281,
|
||
|
|
"epoch": 0.17777777777777778,
|
||
|
|
"grad_norm": 0.40721970796585083,
|
||
|
|
"learning_rate": 0.0001887964386019739,
|
||
|
|
"loss": 0.6432,
|
||
|
|
"mean_token_accuracy": 0.8461637806892395,
|
||
|
|
"num_tokens": 286332.0,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17777777777777778,
|
||
|
|
"eval_entropy": 0.6085493609309196,
|
||
|
|
"eval_loss": 0.6541684865951538,
|
||
|
|
"eval_mean_token_accuracy": 0.8444654858112335,
|
||
|
|
"eval_num_tokens": 286332.0,
|
||
|
|
"eval_runtime": 88.0078,
|
||
|
|
"eval_samples_per_second": 45.451,
|
||
|
|
"eval_steps_per_second": 5.681,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.652884566783905,
|
||
|
|
"epoch": 0.18888888888888888,
|
||
|
|
"grad_norm": 0.3846185505390167,
|
||
|
|
"learning_rate": 0.0001870842581715691,
|
||
|
|
"loss": 0.6537,
|
||
|
|
"mean_token_accuracy": 0.8446102023124695,
|
||
|
|
"num_tokens": 304193.0,
|
||
|
|
"step": 850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6620096349716187,
|
||
|
|
"epoch": 0.2,
|
||
|
|
"grad_norm": 0.48012644052505493,
|
||
|
|
"learning_rate": 0.0001852593154097991,
|
||
|
|
"loss": 0.6477,
|
||
|
|
"mean_token_accuracy": 0.8448492765426636,
|
||
|
|
"num_tokens": 322178.0,
|
||
|
|
"step": 900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6532311421632767,
|
||
|
|
"epoch": 0.2111111111111111,
|
||
|
|
"grad_norm": 0.3579741418361664,
|
||
|
|
"learning_rate": 0.00018332397337061585,
|
||
|
|
"loss": 0.6536,
|
||
|
|
"mean_token_accuracy": 0.8460512828826904,
|
||
|
|
"num_tokens": 340138.0,
|
||
|
|
"step": 950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.647466834783554,
|
||
|
|
"epoch": 0.2222222222222222,
|
||
|
|
"grad_norm": 0.3946477472782135,
|
||
|
|
"learning_rate": 0.000181280738060098,
|
||
|
|
"loss": 0.6452,
|
||
|
|
"mean_token_accuracy": 0.8455521380901336,
|
||
|
|
"num_tokens": 358160.0,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2222222222222222,
|
||
|
|
"eval_entropy": 0.6152187791466713,
|
||
|
|
"eval_loss": 0.6446735858917236,
|
||
|
|
"eval_mean_token_accuracy": 0.8457589077949524,
|
||
|
|
"eval_num_tokens": 358160.0,
|
||
|
|
"eval_runtime": 88.02,
|
||
|
|
"eval_samples_per_second": 45.444,
|
||
|
|
"eval_steps_per_second": 5.681,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6481309163570405,
|
||
|
|
"epoch": 0.23333333333333334,
|
||
|
|
"grad_norm": 0.3936346769332886,
|
||
|
|
"learning_rate": 0.00017913225519151194,
|
||
|
|
"loss": 0.6366,
|
||
|
|
"mean_token_accuracy": 0.8483670651912689,
|
||
|
|
"num_tokens": 376065.0,
|
||
|
|
"step": 1050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6381743800640106,
|
||
|
|
"epoch": 0.24444444444444444,
|
||
|
|
"grad_norm": 0.41422468423843384,
|
||
|
|
"learning_rate": 0.00017688130675947122,
|
||
|
|
"loss": 0.6342,
|
||
|
|
"mean_token_accuracy": 0.8456036126613617,
|
||
|
|
"num_tokens": 394017.0,
|
||
|
|
"step": 1100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6401758706569671,
|
||
|
|
"epoch": 0.25555555555555554,
|
||
|
|
"grad_norm": 0.34131714701652527,
|
||
|
|
"learning_rate": 0.00017453080743763,
|
||
|
|
"loss": 0.633,
|
||
|
|
"mean_token_accuracy": 0.8499649393558503,
|
||
|
|
"num_tokens": 411946.0,
|
||
|
|
"step": 1150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6275939923524857,
|
||
|
|
"epoch": 0.26666666666666666,
|
||
|
|
"grad_norm": 0.35889938473701477,
|
||
|
|
"learning_rate": 0.00017208380080457485,
|
||
|
|
"loss": 0.6322,
|
||
|
|
"mean_token_accuracy": 0.8477345359325409,
|
||
|
|
"num_tokens": 429919.0,
|
||
|
|
"step": 1200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26666666666666666,
|
||
|
|
"eval_entropy": 0.6125104904770852,
|
||
|
|
"eval_loss": 0.6421033143997192,
|
||
|
|
"eval_mean_token_accuracy": 0.8464078787565231,
|
||
|
|
"eval_num_tokens": 429919.0,
|
||
|
|
"eval_runtime": 87.9726,
|
||
|
|
"eval_samples_per_second": 45.469,
|
||
|
|
"eval_steps_per_second": 5.684,
|
||
|
|
"step": 1200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6274101620912552,
|
||
|
|
"epoch": 0.2777777777777778,
|
||
|
|
"grad_norm": 0.43362855911254883,
|
||
|
|
"learning_rate": 0.0001695434554028025,
|
||
|
|
"loss": 0.6316,
|
||
|
|
"mean_token_accuracy": 0.8491797077655793,
|
||
|
|
"num_tokens": 447547.0,
|
||
|
|
"step": 1250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6455954253673554,
|
||
|
|
"epoch": 0.28888888888888886,
|
||
|
|
"grad_norm": 0.4339112937450409,
|
||
|
|
"learning_rate": 0.00016691306063588583,
|
||
|
|
"loss": 0.6414,
|
||
|
|
"mean_token_accuracy": 0.8443870341777802,
|
||
|
|
"num_tokens": 465741.0,
|
||
|
|
"step": 1300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6329428994655609,
|
||
|
|
"epoch": 0.3,
|
||
|
|
"grad_norm": 0.42899656295776367,
|
||
|
|
"learning_rate": 0.00016419602250914155,
|
||
|
|
"loss": 0.6252,
|
||
|
|
"mean_token_accuracy": 0.8489247059822083,
|
||
|
|
"num_tokens": 483587.0,
|
||
|
|
"step": 1350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6409079706668854,
|
||
|
|
"epoch": 0.3111111111111111,
|
||
|
|
"grad_norm": 0.4631573259830475,
|
||
|
|
"learning_rate": 0.00016139585921931394,
|
||
|
|
"loss": 0.6553,
|
||
|
|
"mean_token_accuracy": 0.8468127429485321,
|
||
|
|
"num_tokens": 501709.0,
|
||
|
|
"step": 1400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3111111111111111,
|
||
|
|
"eval_entropy": 0.621678286254406,
|
||
|
|
"eval_loss": 0.6395798325538635,
|
||
|
|
"eval_mean_token_accuracy": 0.8467997794151306,
|
||
|
|
"eval_num_tokens": 501709.0,
|
||
|
|
"eval_runtime": 87.9297,
|
||
|
|
"eval_samples_per_second": 45.491,
|
||
|
|
"eval_steps_per_second": 5.686,
|
||
|
|
"step": 1400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6271992588043213,
|
||
|
|
"epoch": 0.32222222222222224,
|
||
|
|
"grad_norm": 0.3840474486351013,
|
||
|
|
"learning_rate": 0.00015851619659898623,
|
||
|
|
"loss": 0.6295,
|
||
|
|
"mean_token_accuracy": 0.8492995512485504,
|
||
|
|
"num_tokens": 519472.0,
|
||
|
|
"step": 1450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6468123584985733,
|
||
|
|
"epoch": 0.3333333333333333,
|
||
|
|
"grad_norm": 0.38773396611213684,
|
||
|
|
"learning_rate": 0.00015556076342161795,
|
||
|
|
"loss": 0.6468,
|
||
|
|
"mean_token_accuracy": 0.8433372890949249,
|
||
|
|
"num_tokens": 537436.0,
|
||
|
|
"step": 1500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6341830235719681,
|
||
|
|
"epoch": 0.34444444444444444,
|
||
|
|
"grad_norm": 0.37739357352256775,
|
||
|
|
"learning_rate": 0.00015253338657328784,
|
||
|
|
"loss": 0.6282,
|
||
|
|
"mean_token_accuracy": 0.8481453454494476,
|
||
|
|
"num_tokens": 555101.0,
|
||
|
|
"step": 1550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6232440257072449,
|
||
|
|
"epoch": 0.35555555555555557,
|
||
|
|
"grad_norm": 0.4216736853122711,
|
||
|
|
"learning_rate": 0.00014943798609739418,
|
||
|
|
"loss": 0.6077,
|
||
|
|
"mean_token_accuracy": 0.853328298330307,
|
||
|
|
"num_tokens": 572708.0,
|
||
|
|
"step": 1600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35555555555555557,
|
||
|
|
"eval_entropy": 0.5843898313045501,
|
||
|
|
"eval_loss": 0.6388325095176697,
|
||
|
|
"eval_mean_token_accuracy": 0.8475761367082596,
|
||
|
|
"eval_num_tokens": 572708.0,
|
||
|
|
"eval_runtime": 88.0014,
|
||
|
|
"eval_samples_per_second": 45.454,
|
||
|
|
"eval_steps_per_second": 5.682,
|
||
|
|
"step": 1600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.629493715763092,
|
||
|
|
"epoch": 0.36666666666666664,
|
||
|
|
"grad_norm": 0.4576176106929779,
|
||
|
|
"learning_rate": 0.00014627857011872893,
|
||
|
|
"loss": 0.6369,
|
||
|
|
"mean_token_accuracy": 0.8483493518829346,
|
||
|
|
"num_tokens": 590473.0,
|
||
|
|
"step": 1650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6264066398143768,
|
||
|
|
"epoch": 0.37777777777777777,
|
||
|
|
"grad_norm": 0.3063693940639496,
|
||
|
|
"learning_rate": 0.00014305922965349857,
|
||
|
|
"loss": 0.6223,
|
||
|
|
"mean_token_accuracy": 0.8495516037940979,
|
||
|
|
"num_tokens": 608122.0,
|
||
|
|
"step": 1700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6332901197671891,
|
||
|
|
"epoch": 0.3888888888888889,
|
||
|
|
"grad_norm": 0.45330291986465454,
|
||
|
|
"learning_rate": 0.00013978413331201158,
|
||
|
|
"loss": 0.6393,
|
||
|
|
"mean_token_accuracy": 0.8468031466007233,
|
||
|
|
"num_tokens": 626153.0,
|
||
|
|
"step": 1750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6384645104408264,
|
||
|
|
"epoch": 0.4,
|
||
|
|
"grad_norm": 0.4152071475982666,
|
||
|
|
"learning_rate": 0.00013645752190089206,
|
||
|
|
"loss": 0.6325,
|
||
|
|
"mean_token_accuracy": 0.8465726387500763,
|
||
|
|
"num_tokens": 644466.0,
|
||
|
|
"step": 1800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4,
|
||
|
|
"eval_entropy": 0.6132586502432823,
|
||
|
|
"eval_loss": 0.6346827149391174,
|
||
|
|
"eval_mean_token_accuracy": 0.8480832484960557,
|
||
|
|
"eval_num_tokens": 644466.0,
|
||
|
|
"eval_runtime": 87.9901,
|
||
|
|
"eval_samples_per_second": 45.46,
|
||
|
|
"eval_steps_per_second": 5.682,
|
||
|
|
"step": 1800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.631557651758194,
|
||
|
|
"epoch": 0.4111111111111111,
|
||
|
|
"grad_norm": 0.2891533076763153,
|
||
|
|
"learning_rate": 0.00013308370293180902,
|
||
|
|
"loss": 0.6255,
|
||
|
|
"mean_token_accuracy": 0.8511977612972259,
|
||
|
|
"num_tokens": 662209.0,
|
||
|
|
"step": 1850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6207279634475708,
|
||
|
|
"epoch": 0.4222222222222222,
|
||
|
|
"grad_norm": 0.42430344223976135,
|
||
|
|
"learning_rate": 0.00012966704504383168,
|
||
|
|
"loss": 0.6205,
|
||
|
|
"mean_token_accuracy": 0.8519205749034882,
|
||
|
|
"num_tokens": 679790.0,
|
||
|
|
"step": 1900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6246707677841187,
|
||
|
|
"epoch": 0.43333333333333335,
|
||
|
|
"grad_norm": 0.29151901602745056,
|
||
|
|
"learning_rate": 0.00012621197234663283,
|
||
|
|
"loss": 0.6247,
|
||
|
|
"mean_token_accuracy": 0.8536795401573181,
|
||
|
|
"num_tokens": 697693.0,
|
||
|
|
"step": 1950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6282322096824646,
|
||
|
|
"epoch": 0.4444444444444444,
|
||
|
|
"grad_norm": 0.40175795555114746,
|
||
|
|
"learning_rate": 0.0001227229586918655,
|
||
|
|
"loss": 0.6315,
|
||
|
|
"mean_token_accuracy": 0.8500750088691711,
|
||
|
|
"num_tokens": 715480.0,
|
||
|
|
"step": 2000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4444444444444444,
|
||
|
|
"eval_entropy": 0.614536872446537,
|
||
|
|
"eval_loss": 0.6332096457481384,
|
||
|
|
"eval_mean_token_accuracy": 0.8484884305000305,
|
||
|
|
"eval_num_tokens": 715480.0,
|
||
|
|
"eval_runtime": 87.9964,
|
||
|
|
"eval_samples_per_second": 45.456,
|
||
|
|
"eval_steps_per_second": 5.682,
|
||
|
|
"step": 2000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6323211324214936,
|
||
|
|
"epoch": 0.45555555555555555,
|
||
|
|
"grad_norm": 0.3964557945728302,
|
||
|
|
"learning_rate": 0.00011920452188013029,
|
||
|
|
"loss": 0.6327,
|
||
|
|
"mean_token_accuracy": 0.8506696331501007,
|
||
|
|
"num_tokens": 733381.0,
|
||
|
|
"step": 2050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6420121788978577,
|
||
|
|
"epoch": 0.4666666666666667,
|
||
|
|
"grad_norm": 0.270047664642334,
|
||
|
|
"learning_rate": 0.0001156612178110351,
|
||
|
|
"loss": 0.6356,
|
||
|
|
"mean_token_accuracy": 0.8502523565292358,
|
||
|
|
"num_tokens": 751399.0,
|
||
|
|
"step": 2100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6098494738340378,
|
||
|
|
"epoch": 0.4777777777777778,
|
||
|
|
"grad_norm": 0.30119839310646057,
|
||
|
|
"learning_rate": 0.00011209763458392135,
|
||
|
|
"loss": 0.6028,
|
||
|
|
"mean_token_accuracy": 0.8526248228549957,
|
||
|
|
"num_tokens": 769220.0,
|
||
|
|
"step": 2150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6257252705097198,
|
||
|
|
"epoch": 0.4888888888888889,
|
||
|
|
"grad_norm": 0.41217416524887085,
|
||
|
|
"learning_rate": 0.00010851838655689625,
|
||
|
|
"loss": 0.6278,
|
||
|
|
"mean_token_accuracy": 0.8503323125839234,
|
||
|
|
"num_tokens": 787060.0,
|
||
|
|
"step": 2200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4888888888888889,
|
||
|
|
"eval_entropy": 0.6182988230586052,
|
||
|
|
"eval_loss": 0.629729151725769,
|
||
|
|
"eval_mean_token_accuracy": 0.8488966919183731,
|
||
|
|
"eval_num_tokens": 787060.0,
|
||
|
|
"eval_runtime": 88.02,
|
||
|
|
"eval_samples_per_second": 45.444,
|
||
|
|
"eval_steps_per_second": 5.681,
|
||
|
|
"step": 2200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.629525854587555,
|
||
|
|
"epoch": 0.5,
|
||
|
|
"grad_norm": 0.33919742703437805,
|
||
|
|
"learning_rate": 0.00010492810837186333,
|
||
|
|
"loss": 0.6288,
|
||
|
|
"mean_token_accuracy": 0.8490475380420685,
|
||
|
|
"num_tokens": 804988.0,
|
||
|
|
"step": 2250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6336910331249237,
|
||
|
|
"epoch": 0.5111111111111111,
|
||
|
|
"grad_norm": 0.42546504735946655,
|
||
|
|
"learning_rate": 0.00010133144895328832,
|
||
|
|
"loss": 0.6302,
|
||
|
|
"mean_token_accuracy": 0.848661150932312,
|
||
|
|
"num_tokens": 823030.0,
|
||
|
|
"step": 2300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6298299109935761,
|
||
|
|
"epoch": 0.5222222222222223,
|
||
|
|
"grad_norm": 0.4121692180633545,
|
||
|
|
"learning_rate": 9.7733065488471e-05,
|
||
|
|
"loss": 0.6261,
|
||
|
|
"mean_token_accuracy": 0.8524516999721528,
|
||
|
|
"num_tokens": 840873.0,
|
||
|
|
"step": 2350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6273639261722564,
|
||
|
|
"epoch": 0.5333333333333333,
|
||
|
|
"grad_norm": 0.33091071248054504,
|
||
|
|
"learning_rate": 9.413761739711771e-05,
|
||
|
|
"loss": 0.6279,
|
||
|
|
"mean_token_accuracy": 0.8502094805240631,
|
||
|
|
"num_tokens": 858631.0,
|
||
|
|
"step": 2400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5333333333333333,
|
||
|
|
"eval_entropy": 0.6137934300303459,
|
||
|
|
"eval_loss": 0.6281214356422424,
|
||
|
|
"eval_mean_token_accuracy": 0.849094120979309,
|
||
|
|
"eval_num_tokens": 858631.0,
|
||
|
|
"eval_runtime": 88.0567,
|
||
|
|
"eval_samples_per_second": 45.425,
|
||
|
|
"eval_steps_per_second": 5.678,
|
||
|
|
"step": 2400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6171291017532349,
|
||
|
|
"epoch": 0.5444444444444444,
|
||
|
|
"grad_norm": 0.41635480523109436,
|
||
|
|
"learning_rate": 9.054976029802337e-05,
|
||
|
|
"loss": 0.6157,
|
||
|
|
"mean_token_accuracy": 0.8495936059951782,
|
||
|
|
"num_tokens": 876423.0,
|
||
|
|
"step": 2450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6276765954494476,
|
||
|
|
"epoch": 0.5555555555555556,
|
||
|
|
"grad_norm": 0.43940269947052,
|
||
|
|
"learning_rate": 8.6974139980675e-05,
|
||
|
|
"loss": 0.6233,
|
||
|
|
"mean_token_accuracy": 0.8525845003128052,
|
||
|
|
"num_tokens": 894210.0,
|
||
|
|
"step": 2500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6247690558433533,
|
||
|
|
"epoch": 0.5666666666666667,
|
||
|
|
"grad_norm": 0.31268852949142456,
|
||
|
|
"learning_rate": 8.341538638958291e-05,
|
||
|
|
"loss": 0.6255,
|
||
|
|
"mean_token_accuracy": 0.8500683605670929,
|
||
|
|
"num_tokens": 912075.0,
|
||
|
|
"step": 2550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6128390139341354,
|
||
|
|
"epoch": 0.5777777777777777,
|
||
|
|
"grad_norm": 0.3916880190372467,
|
||
|
|
"learning_rate": 7.987810762912924e-05,
|
||
|
|
"loss": 0.615,
|
||
|
|
"mean_token_accuracy": 0.8547429955005645,
|
||
|
|
"num_tokens": 929888.0,
|
||
|
|
"step": 2600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5777777777777777,
|
||
|
|
"eval_entropy": 0.612749766767025,
|
||
|
|
"eval_loss": 0.6256938576698303,
|
||
|
|
"eval_mean_token_accuracy": 0.8497577202320099,
|
||
|
|
"eval_num_tokens": 929888.0,
|
||
|
|
"eval_runtime": 87.9613,
|
||
|
|
"eval_samples_per_second": 45.475,
|
||
|
|
"eval_steps_per_second": 5.684,
|
||
|
|
"step": 2600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6389735889434814,
|
||
|
|
"epoch": 0.5888888888888889,
|
||
|
|
"grad_norm": 0.38676777482032776,
|
||
|
|
"learning_rate": 7.636688399669589e-05,
|
||
|
|
"loss": 0.6435,
|
||
|
|
"mean_token_accuracy": 0.84725133061409,
|
||
|
|
"num_tokens": 947902.0,
|
||
|
|
"step": 2650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6316736024618149,
|
||
|
|
"epoch": 0.6,
|
||
|
|
"grad_norm": 0.3665507733821869,
|
||
|
|
"learning_rate": 7.288626205179951e-05,
|
||
|
|
"loss": 0.6242,
|
||
|
|
"mean_token_accuracy": 0.8496048271656036,
|
||
|
|
"num_tokens": 965905.0,
|
||
|
|
"step": 2700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6109506344795227,
|
||
|
|
"epoch": 0.6111111111111112,
|
||
|
|
"grad_norm": 0.4900154769420624,
|
||
|
|
"learning_rate": 6.944074872891199e-05,
|
||
|
|
"loss": 0.6063,
|
||
|
|
"mean_token_accuracy": 0.852264449596405,
|
||
|
|
"num_tokens": 983884.0,
|
||
|
|
"step": 2750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6153418481349945,
|
||
|
|
"epoch": 0.6222222222222222,
|
||
|
|
"grad_norm": 0.47226202487945557,
|
||
|
|
"learning_rate": 6.603480550158995e-05,
|
||
|
|
"loss": 0.62,
|
||
|
|
"mean_token_accuracy": 0.8504341340065003,
|
||
|
|
"num_tokens": 1001717.0,
|
||
|
|
"step": 2800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6222222222222222,
|
||
|
|
"eval_entropy": 0.619692858338356,
|
||
|
|
"eval_loss": 0.6247742772102356,
|
||
|
|
"eval_mean_token_accuracy": 0.8500253454446792,
|
||
|
|
"eval_num_tokens": 1001717.0,
|
||
|
|
"eval_runtime": 87.9899,
|
||
|
|
"eval_samples_per_second": 45.46,
|
||
|
|
"eval_steps_per_second": 5.682,
|
||
|
|
"step": 2800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6241399937868118,
|
||
|
|
"epoch": 0.6333333333333333,
|
||
|
|
"grad_norm": 0.4187294840812683,
|
||
|
|
"learning_rate": 6.267284260547049e-05,
|
||
|
|
"loss": 0.6156,
|
||
|
|
"mean_token_accuracy": 0.8507660686969757,
|
||
|
|
"num_tokens": 1019451.0,
|
||
|
|
"step": 2850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6226123148202896,
|
||
|
|
"epoch": 0.6444444444444445,
|
||
|
|
"grad_norm": 0.33195361495018005,
|
||
|
|
"learning_rate": 5.9359213327612416e-05,
|
||
|
|
"loss": 0.6129,
|
||
|
|
"mean_token_accuracy": 0.8508730328083038,
|
||
|
|
"num_tokens": 1037299.0,
|
||
|
|
"step": 2900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6067140877246857,
|
||
|
|
"epoch": 0.6555555555555556,
|
||
|
|
"grad_norm": 0.4044334292411804,
|
||
|
|
"learning_rate": 5.609820836957871e-05,
|
||
|
|
"loss": 0.5978,
|
||
|
|
"mean_token_accuracy": 0.8547445130348206,
|
||
|
|
"num_tokens": 1054997.0,
|
||
|
|
"step": 2950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6072817480564118,
|
||
|
|
"epoch": 0.6666666666666666,
|
||
|
|
"grad_norm": 0.3837581276893616,
|
||
|
|
"learning_rate": 5.28940502915587e-05,
|
||
|
|
"loss": 0.6195,
|
||
|
|
"mean_token_accuracy": 0.8500410854816437,
|
||
|
|
"num_tokens": 1072981.0,
|
||
|
|
"step": 3000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6666666666666666,
|
||
|
|
"eval_entropy": 0.6186771767735482,
|
||
|
|
"eval_loss": 0.6234644055366516,
|
||
|
|
"eval_mean_token_accuracy": 0.8501761881113052,
|
||
|
|
"eval_num_tokens": 1072981.0,
|
||
|
|
"eval_runtime": 88.0305,
|
||
|
|
"eval_samples_per_second": 45.439,
|
||
|
|
"eval_steps_per_second": 5.68,
|
||
|
|
"step": 3000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6253303802013397,
|
||
|
|
"epoch": 0.6777777777777778,
|
||
|
|
"grad_norm": 0.4809369742870331,
|
||
|
|
"learning_rate": 4.975088804472356e-05,
|
||
|
|
"loss": 0.6199,
|
||
|
|
"mean_token_accuracy": 0.850438643693924,
|
||
|
|
"num_tokens": 1090627.0,
|
||
|
|
"step": 3050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6156042230129242,
|
||
|
|
"epoch": 0.6888888888888889,
|
||
|
|
"grad_norm": 0.35011181235313416,
|
||
|
|
"learning_rate": 4.667279159889624e-05,
|
||
|
|
"loss": 0.6211,
|
||
|
|
"mean_token_accuracy": 0.8514653861522674,
|
||
|
|
"num_tokens": 1108362.0,
|
||
|
|
"step": 3100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6167298531532288,
|
||
|
|
"epoch": 0.7,
|
||
|
|
"grad_norm": 0.41358232498168945,
|
||
|
|
"learning_rate": 4.366374667249118e-05,
|
||
|
|
"loss": 0.6069,
|
||
|
|
"mean_token_accuracy": 0.8523639941215515,
|
||
|
|
"num_tokens": 1126044.0,
|
||
|
|
"step": 3150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6212670838832856,
|
||
|
|
"epoch": 0.7111111111111111,
|
||
|
|
"grad_norm": 0.44910740852355957,
|
||
|
|
"learning_rate": 4.0727649571548146e-05,
|
||
|
|
"loss": 0.6272,
|
||
|
|
"mean_token_accuracy": 0.8506516909599304,
|
||
|
|
"num_tokens": 1143877.0,
|
||
|
|
"step": 3200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7111111111111111,
|
||
|
|
"eval_entropy": 0.6189550485610962,
|
||
|
|
"eval_loss": 0.622456431388855,
|
||
|
|
"eval_mean_token_accuracy": 0.8506602959632874,
|
||
|
|
"eval_num_tokens": 1143877.0,
|
||
|
|
"eval_runtime": 87.9986,
|
||
|
|
"eval_samples_per_second": 45.455,
|
||
|
|
"eval_steps_per_second": 5.682,
|
||
|
|
"step": 3200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6167463368177414,
|
||
|
|
"epoch": 0.7222222222222222,
|
||
|
|
"grad_norm": 0.4499205946922302,
|
||
|
|
"learning_rate": 3.786830214454315e-05,
|
||
|
|
"loss": 0.6159,
|
||
|
|
"mean_token_accuracy": 0.8481730723381042,
|
||
|
|
"num_tokens": 1161796.0,
|
||
|
|
"step": 3250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6131170511245727,
|
||
|
|
"epoch": 0.7333333333333333,
|
||
|
|
"grad_norm": 0.38012415170669556,
|
||
|
|
"learning_rate": 3.5089406859509166e-05,
|
||
|
|
"loss": 0.6219,
|
||
|
|
"mean_token_accuracy": 0.8537890160083771,
|
||
|
|
"num_tokens": 1179566.0,
|
||
|
|
"step": 3300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6255488413572311,
|
||
|
|
"epoch": 0.7444444444444445,
|
||
|
|
"grad_norm": 0.3967890739440918,
|
||
|
|
"learning_rate": 3.2394562009840835e-05,
|
||
|
|
"loss": 0.6268,
|
||
|
|
"mean_token_accuracy": 0.8487379801273346,
|
||
|
|
"num_tokens": 1197824.0,
|
||
|
|
"step": 3350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6168732368946075,
|
||
|
|
"epoch": 0.7555555555555555,
|
||
|
|
"grad_norm": 0.38855499029159546,
|
||
|
|
"learning_rate": 2.9787257054991592e-05,
|
||
|
|
"loss": 0.6036,
|
||
|
|
"mean_token_accuracy": 0.8553757643699647,
|
||
|
|
"num_tokens": 1215503.0,
|
||
|
|
"step": 3400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7555555555555555,
|
||
|
|
"eval_entropy": 0.6248494290113449,
|
||
|
|
"eval_loss": 0.6211217045783997,
|
||
|
|
"eval_mean_token_accuracy": 0.8511025402545929,
|
||
|
|
"eval_num_tokens": 1215503.0,
|
||
|
|
"eval_runtime": 87.9806,
|
||
|
|
"eval_samples_per_second": 45.465,
|
||
|
|
"eval_steps_per_second": 5.683,
|
||
|
|
"step": 3400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6244831168651581,
|
||
|
|
"epoch": 0.7666666666666667,
|
||
|
|
"grad_norm": 0.40160423517227173,
|
||
|
|
"learning_rate": 2.727086810209559e-05,
|
||
|
|
"loss": 0.6144,
|
||
|
|
"mean_token_accuracy": 0.8539444077014923,
|
||
|
|
"num_tokens": 1233233.0,
|
||
|
|
"step": 3450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6258886575698852,
|
||
|
|
"epoch": 0.7777777777777778,
|
||
|
|
"grad_norm": 0.4197278618812561,
|
||
|
|
"learning_rate": 2.4848653534365886e-05,
|
||
|
|
"loss": 0.6157,
|
||
|
|
"mean_token_accuracy": 0.8522231721878052,
|
||
|
|
"num_tokens": 1251135.0,
|
||
|
|
"step": 3500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6242718535661698,
|
||
|
|
"epoch": 0.7888888888888889,
|
||
|
|
"grad_norm": 0.31663864850997925,
|
||
|
|
"learning_rate": 2.2523749791929127e-05,
|
||
|
|
"loss": 0.6236,
|
||
|
|
"mean_token_accuracy": 0.8481460773944854,
|
||
|
|
"num_tokens": 1269437.0,
|
||
|
|
"step": 3550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6332012844085694,
|
||
|
|
"epoch": 0.8,
|
||
|
|
"grad_norm": 0.4183398485183716,
|
||
|
|
"learning_rate": 2.029916731055981e-05,
|
||
|
|
"loss": 0.6331,
|
||
|
|
"mean_token_accuracy": 0.8506816875934601,
|
||
|
|
"num_tokens": 1287466.0,
|
||
|
|
"step": 3600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8,
|
||
|
|
"eval_entropy": 0.6211995969414711,
|
||
|
|
"eval_loss": 0.6201685667037964,
|
||
|
|
"eval_mean_token_accuracy": 0.8512796934843063,
|
||
|
|
"eval_num_tokens": 1287466.0,
|
||
|
|
"eval_runtime": 87.995,
|
||
|
|
"eval_samples_per_second": 45.457,
|
||
|
|
"eval_steps_per_second": 5.682,
|
||
|
|
"step": 3600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6207393455505371,
|
||
|
|
"epoch": 0.8111111111111111,
|
||
|
|
"grad_norm": 0.3644893169403076,
|
||
|
|
"learning_rate": 1.8177786623573322e-05,
|
||
|
|
"loss": 0.6051,
|
||
|
|
"mean_token_accuracy": 0.8554283630847931,
|
||
|
|
"num_tokens": 1305114.0,
|
||
|
|
"step": 3650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6093165588378906,
|
||
|
|
"epoch": 0.8222222222222222,
|
||
|
|
"grad_norm": 0.4035656154155731,
|
||
|
|
"learning_rate": 1.6162354631925204e-05,
|
||
|
|
"loss": 0.5942,
|
||
|
|
"mean_token_accuracy": 0.8568060100078583,
|
||
|
|
"num_tokens": 1322570.0,
|
||
|
|
"step": 3700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.610592405796051,
|
||
|
|
"epoch": 0.8333333333333334,
|
||
|
|
"grad_norm": 0.4430359899997711,
|
||
|
|
"learning_rate": 1.425548104734583e-05,
|
||
|
|
"loss": 0.6228,
|
||
|
|
"mean_token_accuracy": 0.853781110048294,
|
||
|
|
"num_tokens": 1340182.0,
|
||
|
|
"step": 3750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.624454995393753,
|
||
|
|
"epoch": 0.8444444444444444,
|
||
|
|
"grad_norm": 0.47560906410217285,
|
||
|
|
"learning_rate": 1.2459635013117043e-05,
|
||
|
|
"loss": 0.6285,
|
||
|
|
"mean_token_accuracy": 0.8506780207157135,
|
||
|
|
"num_tokens": 1357930.0,
|
||
|
|
"step": 3800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8444444444444444,
|
||
|
|
"eval_entropy": 0.6203543889522553,
|
||
|
|
"eval_loss": 0.6197088360786438,
|
||
|
|
"eval_mean_token_accuracy": 0.851304793715477,
|
||
|
|
"eval_num_tokens": 1357930.0,
|
||
|
|
"eval_runtime": 88.0719,
|
||
|
|
"eval_samples_per_second": 45.417,
|
||
|
|
"eval_steps_per_second": 5.677,
|
||
|
|
"step": 3800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6153292739391327,
|
||
|
|
"epoch": 0.8555555555555555,
|
||
|
|
"grad_norm": 0.6369743347167969,
|
||
|
|
"learning_rate": 1.0777141906865584e-05,
|
||
|
|
"loss": 0.6206,
|
||
|
|
"mean_token_accuracy": 0.8497644782066345,
|
||
|
|
"num_tokens": 1375720.0,
|
||
|
|
"step": 3850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6167460489273071,
|
||
|
|
"epoch": 0.8666666666666667,
|
||
|
|
"grad_norm": 0.31762850284576416,
|
||
|
|
"learning_rate": 9.210180329513674e-06,
|
||
|
|
"loss": 0.6164,
|
||
|
|
"mean_token_accuracy": 0.8522521209716797,
|
||
|
|
"num_tokens": 1393343.0,
|
||
|
|
"step": 3900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6256991571187973,
|
||
|
|
"epoch": 0.8777777777777778,
|
||
|
|
"grad_norm": 0.44901618361473083,
|
||
|
|
"learning_rate": 7.760779284285724e-06,
|
||
|
|
"loss": 0.6304,
|
||
|
|
"mean_token_accuracy": 0.8484922182559967,
|
||
|
|
"num_tokens": 1411415.0,
|
||
|
|
"step": 3950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6351810383796692,
|
||
|
|
"epoch": 0.8888888888888888,
|
||
|
|
"grad_norm": 0.3676467537879944,
|
||
|
|
"learning_rate": 6.430815549423541e-06,
|
||
|
|
"loss": 0.6343,
|
||
|
|
"mean_token_accuracy": 0.8446752560138703,
|
||
|
|
"num_tokens": 1429355.0,
|
||
|
|
"step": 4000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8888888888888888,
|
||
|
|
"eval_entropy": 0.6227085783481597,
|
||
|
|
"eval_loss": 0.6193701028823853,
|
||
|
|
"eval_mean_token_accuracy": 0.8513179312944412,
|
||
|
|
"eval_num_tokens": 1429355.0,
|
||
|
|
"eval_runtime": 87.8963,
|
||
|
|
"eval_samples_per_second": 45.508,
|
||
|
|
"eval_steps_per_second": 5.689,
|
||
|
|
"step": 4000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6218395137786865,
|
||
|
|
"epoch": 0.9,
|
||
|
|
"grad_norm": 0.352983683347702,
|
||
|
|
"learning_rate": 5.222011248012537e-06,
|
||
|
|
"loss": 0.6208,
|
||
|
|
"mean_token_accuracy": 0.8524598634243011,
|
||
|
|
"num_tokens": 1447378.0,
|
||
|
|
"step": 4050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6192989981174469,
|
||
|
|
"epoch": 0.9111111111111111,
|
||
|
|
"grad_norm": 0.5437832474708557,
|
||
|
|
"learning_rate": 4.1359316180653806e-06,
|
||
|
|
"loss": 0.6071,
|
||
|
|
"mean_token_accuracy": 0.8508526837825775,
|
||
|
|
"num_tokens": 1465326.0,
|
||
|
|
"step": 4100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6204692393541336,
|
||
|
|
"epoch": 0.9222222222222223,
|
||
|
|
"grad_norm": 0.29962158203125,
|
||
|
|
"learning_rate": 3.1739829857504234e-06,
|
||
|
|
"loss": 0.6173,
|
||
|
|
"mean_token_accuracy": 0.8498383402824402,
|
||
|
|
"num_tokens": 1483262.0,
|
||
|
|
"step": 4150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6081379109621048,
|
||
|
|
"epoch": 0.9333333333333333,
|
||
|
|
"grad_norm": 0.39637628197669983,
|
||
|
|
"learning_rate": 2.3374109443897065e-06,
|
||
|
|
"loss": 0.6151,
|
||
|
|
"mean_token_accuracy": 0.8525230586528778,
|
||
|
|
"num_tokens": 1500855.0,
|
||
|
|
"step": 4200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9333333333333333,
|
||
|
|
"eval_entropy": 0.6221803342103958,
|
||
|
|
"eval_loss": 0.6191594004631042,
|
||
|
|
"eval_mean_token_accuracy": 0.8513582646846771,
|
||
|
|
"eval_num_tokens": 1500855.0,
|
||
|
|
"eval_runtime": 87.871,
|
||
|
|
"eval_samples_per_second": 45.521,
|
||
|
|
"eval_steps_per_second": 5.69,
|
||
|
|
"step": 4200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6116468846797943,
|
||
|
|
"epoch": 0.9444444444444444,
|
||
|
|
"grad_norm": 0.3078557252883911,
|
||
|
|
"learning_rate": 1.6272987415841267e-06,
|
||
|
|
"loss": 0.598,
|
||
|
|
"mean_token_accuracy": 0.8564710664749146,
|
||
|
|
"num_tokens": 1518914.0,
|
||
|
|
"step": 4250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6252698361873626,
|
||
|
|
"epoch": 0.9555555555555556,
|
||
|
|
"grad_norm": 0.38908475637435913,
|
||
|
|
"learning_rate": 1.0445658765543153e-06,
|
||
|
|
"loss": 0.6194,
|
||
|
|
"mean_token_accuracy": 0.8515545094013214,
|
||
|
|
"num_tokens": 1536933.0,
|
||
|
|
"step": 4300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6222954159975052,
|
||
|
|
"epoch": 0.9666666666666667,
|
||
|
|
"grad_norm": 0.4802163541316986,
|
||
|
|
"learning_rate": 5.899669095136174e-07,
|
||
|
|
"loss": 0.6291,
|
||
|
|
"mean_token_accuracy": 0.8507583463191986,
|
||
|
|
"num_tokens": 1555013.0,
|
||
|
|
"step": 4350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.6046170508861541,
|
||
|
|
"epoch": 0.9777777777777777,
|
||
|
|
"grad_norm": 0.3905353546142578,
|
||
|
|
"learning_rate": 2.640904846146652e-07,
|
||
|
|
"loss": 0.6059,
|
||
|
|
"mean_token_accuracy": 0.8536884272098542,
|
||
|
|
"num_tokens": 1572842.0,
|
||
|
|
"step": 4400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9777777777777777,
|
||
|
|
"eval_entropy": 0.6219840022921562,
|
||
|
|
"eval_loss": 0.6191316843032837,
|
||
|
|
"eval_mean_token_accuracy": 0.8512439979314804,
|
||
|
|
"eval_num_tokens": 1572842.0,
|
||
|
|
"eval_runtime": 87.8917,
|
||
|
|
"eval_samples_per_second": 45.511,
|
||
|
|
"eval_steps_per_second": 5.689,
|
||
|
|
"step": 4400
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 50,
|
||
|
|
"max_steps": 4500,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 1,
|
||
|
|
"save_steps": 200,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": false
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 6.357263504886989e+16,
|
||
|
|
"train_batch_size": 8,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|