1727 lines
48 KiB
JSON
1727 lines
48 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 2.0,
|
||
|
|
"eval_steps": 50,
|
||
|
|
"global_step": 846,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.01182033096926714,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"learning_rate": 0.0,
|
||
|
|
"loss": 3.2832,
|
||
|
|
"mean_token_accuracy": 0.4556728000442187,
|
||
|
|
"num_tokens": 74681.0,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02364066193853428,
|
||
|
|
"grad_norm": 104.05796232824258,
|
||
|
|
"learning_rate": 1.7647058823529414e-07,
|
||
|
|
"loss": 3.2183,
|
||
|
|
"mean_token_accuracy": 0.45771179099877674,
|
||
|
|
"num_tokens": 153902.0,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03546099290780142,
|
||
|
|
"grad_norm": 44.640553186839156,
|
||
|
|
"learning_rate": 4.7058823529411767e-07,
|
||
|
|
"loss": 2.7453,
|
||
|
|
"mean_token_accuracy": 0.4893781746427218,
|
||
|
|
"num_tokens": 230170.0,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04728132387706856,
|
||
|
|
"grad_norm": 16.641432874457237,
|
||
|
|
"learning_rate": 7.647058823529413e-07,
|
||
|
|
"loss": 2.4251,
|
||
|
|
"mean_token_accuracy": 0.5127047290404637,
|
||
|
|
"num_tokens": 311284.0,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0591016548463357,
|
||
|
|
"grad_norm": 12.092821042180054,
|
||
|
|
"learning_rate": 1.0588235294117648e-06,
|
||
|
|
"loss": 2.3421,
|
||
|
|
"mean_token_accuracy": 0.5212494264046351,
|
||
|
|
"num_tokens": 385963.0,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07092198581560284,
|
||
|
|
"grad_norm": 12.48879299232815,
|
||
|
|
"learning_rate": 1.3529411764705883e-06,
|
||
|
|
"loss": 2.2408,
|
||
|
|
"mean_token_accuracy": 0.5327892646193504,
|
||
|
|
"num_tokens": 458884.0,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08274231678486997,
|
||
|
|
"grad_norm": 9.244193713594534,
|
||
|
|
"learning_rate": 1.6470588235294118e-06,
|
||
|
|
"loss": 2.1777,
|
||
|
|
"mean_token_accuracy": 0.5375485748052597,
|
||
|
|
"num_tokens": 540057.0,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09456264775413711,
|
||
|
|
"grad_norm": 12.566401264702058,
|
||
|
|
"learning_rate": 1.9411764705882353e-06,
|
||
|
|
"loss": 2.1226,
|
||
|
|
"mean_token_accuracy": 0.5466068352262179,
|
||
|
|
"num_tokens": 621727.0,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10638297872340426,
|
||
|
|
"grad_norm": 12.966605888942672,
|
||
|
|
"learning_rate": 2.2352941176470592e-06,
|
||
|
|
"loss": 2.0219,
|
||
|
|
"mean_token_accuracy": 0.5580442860722542,
|
||
|
|
"num_tokens": 704767.0,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1182033096926714,
|
||
|
|
"grad_norm": 12.223856725829414,
|
||
|
|
"learning_rate": 2.5294117647058823e-06,
|
||
|
|
"loss": 1.9071,
|
||
|
|
"mean_token_accuracy": 0.5804560139775277,
|
||
|
|
"num_tokens": 787574.0,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1182033096926714,
|
||
|
|
"eval_loss": 2.261831045150757,
|
||
|
|
"eval_mean_token_accuracy": 0.5334406300379445,
|
||
|
|
"eval_num_tokens": 787574.0,
|
||
|
|
"eval_runtime": 112.2462,
|
||
|
|
"eval_samples_per_second": 33.444,
|
||
|
|
"eval_steps_per_second": 5.577,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13002364066193853,
|
||
|
|
"grad_norm": 8.808307531063669,
|
||
|
|
"learning_rate": 2.8235294117647062e-06,
|
||
|
|
"loss": 1.8351,
|
||
|
|
"mean_token_accuracy": 0.5938245743513108,
|
||
|
|
"num_tokens": 868536.0,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14184397163120568,
|
||
|
|
"grad_norm": 11.082814688242964,
|
||
|
|
"learning_rate": 3.1176470588235297e-06,
|
||
|
|
"loss": 1.8068,
|
||
|
|
"mean_token_accuracy": 0.597475977241993,
|
||
|
|
"num_tokens": 944129.0,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1536643026004728,
|
||
|
|
"grad_norm": 11.206601503508725,
|
||
|
|
"learning_rate": 3.4117647058823532e-06,
|
||
|
|
"loss": 1.7111,
|
||
|
|
"mean_token_accuracy": 0.6089037587245305,
|
||
|
|
"num_tokens": 1032000.0,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16548463356973994,
|
||
|
|
"grad_norm": 14.333554845362597,
|
||
|
|
"learning_rate": 3.7058823529411767e-06,
|
||
|
|
"loss": 1.6025,
|
||
|
|
"mean_token_accuracy": 0.6384305556615194,
|
||
|
|
"num_tokens": 1110048.0,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1773049645390071,
|
||
|
|
"grad_norm": 17.238389536700844,
|
||
|
|
"learning_rate": 4.000000000000001e-06,
|
||
|
|
"loss": 1.4971,
|
||
|
|
"mean_token_accuracy": 0.6512377719084422,
|
||
|
|
"num_tokens": 1189270.0,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18912529550827423,
|
||
|
|
"grad_norm": 15.105419987740975,
|
||
|
|
"learning_rate": 4.294117647058823e-06,
|
||
|
|
"loss": 1.405,
|
||
|
|
"mean_token_accuracy": 0.6698001553614934,
|
||
|
|
"num_tokens": 1276274.0,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20094562647754138,
|
||
|
|
"grad_norm": 18.16302751183317,
|
||
|
|
"learning_rate": 4.588235294117647e-06,
|
||
|
|
"loss": 1.3866,
|
||
|
|
"mean_token_accuracy": 0.677808458606402,
|
||
|
|
"num_tokens": 1347635.0,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2127659574468085,
|
||
|
|
"grad_norm": 16.544595608890432,
|
||
|
|
"learning_rate": 4.882352941176471e-06,
|
||
|
|
"loss": 1.2625,
|
||
|
|
"mean_token_accuracy": 0.7019270072380702,
|
||
|
|
"num_tokens": 1425994.0,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22458628841607564,
|
||
|
|
"grad_norm": 16.879827911667025,
|
||
|
|
"learning_rate": 4.999808275592979e-06,
|
||
|
|
"loss": 1.2236,
|
||
|
|
"mean_token_accuracy": 0.7105038404464722,
|
||
|
|
"num_tokens": 1507998.0,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2364066193853428,
|
||
|
|
"grad_norm": 14.093813445236359,
|
||
|
|
"learning_rate": 4.998636732930301e-06,
|
||
|
|
"loss": 1.231,
|
||
|
|
"mean_token_accuracy": 0.7122000068426132,
|
||
|
|
"num_tokens": 1588043.0,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2364066193853428,
|
||
|
|
"eval_loss": 2.1943154335021973,
|
||
|
|
"eval_mean_token_accuracy": 0.5279583666271295,
|
||
|
|
"eval_num_tokens": 1588043.0,
|
||
|
|
"eval_runtime": 110.4732,
|
||
|
|
"eval_samples_per_second": 33.981,
|
||
|
|
"eval_steps_per_second": 5.667,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24822695035460993,
|
||
|
|
"grad_norm": 11.560621474065723,
|
||
|
|
"learning_rate": 4.9964006596886895e-06,
|
||
|
|
"loss": 1.0953,
|
||
|
|
"mean_token_accuracy": 0.7428930242856343,
|
||
|
|
"num_tokens": 1676024.0,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26004728132387706,
|
||
|
|
"grad_norm": 15.262549473091083,
|
||
|
|
"learning_rate": 4.993101008534978e-06,
|
||
|
|
"loss": 1.0258,
|
||
|
|
"mean_token_accuracy": 0.7531117727359136,
|
||
|
|
"num_tokens": 1758985.0,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2718676122931442,
|
||
|
|
"grad_norm": 16.492390914481756,
|
||
|
|
"learning_rate": 4.988739185267578e-06,
|
||
|
|
"loss": 1.0085,
|
||
|
|
"mean_token_accuracy": 0.7558938791354497,
|
||
|
|
"num_tokens": 1840987.0,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28368794326241137,
|
||
|
|
"grad_norm": 10.824079042185982,
|
||
|
|
"learning_rate": 4.9833170482175505e-06,
|
||
|
|
"loss": 0.8932,
|
||
|
|
"mean_token_accuracy": 0.774032841126124,
|
||
|
|
"num_tokens": 1926935.0,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29550827423167847,
|
||
|
|
"grad_norm": 10.282044151979626,
|
||
|
|
"learning_rate": 4.97683690745687e-06,
|
||
|
|
"loss": 0.9501,
|
||
|
|
"mean_token_accuracy": 0.7681633462508519,
|
||
|
|
"num_tokens": 2009169.0,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3073286052009456,
|
||
|
|
"grad_norm": 12.335809357841024,
|
||
|
|
"learning_rate": 4.969301523814234e-06,
|
||
|
|
"loss": 0.9667,
|
||
|
|
"mean_token_accuracy": 0.765592094262441,
|
||
|
|
"num_tokens": 2085142.0,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3191489361702128,
|
||
|
|
"grad_norm": 33.465797114274885,
|
||
|
|
"learning_rate": 4.9607141076988244e-06,
|
||
|
|
"loss": 0.8265,
|
||
|
|
"mean_token_accuracy": 0.7936822563409806,
|
||
|
|
"num_tokens": 2163220.0,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3309692671394799,
|
||
|
|
"grad_norm": 11.83225961982719,
|
||
|
|
"learning_rate": 4.9510783177325335e-06,
|
||
|
|
"loss": 0.7997,
|
||
|
|
"mean_token_accuracy": 0.8059376885493597,
|
||
|
|
"num_tokens": 2248409.0,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34278959810874704,
|
||
|
|
"grad_norm": 10.335334720960685,
|
||
|
|
"learning_rate": 4.9403982591912235e-06,
|
||
|
|
"loss": 0.7572,
|
||
|
|
"mean_token_accuracy": 0.8129442622264226,
|
||
|
|
"num_tokens": 2327346.0,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3546099290780142,
|
||
|
|
"grad_norm": 10.178136355537001,
|
||
|
|
"learning_rate": 4.9286784822557e-06,
|
||
|
|
"loss": 0.7467,
|
||
|
|
"mean_token_accuracy": 0.8148943414290746,
|
||
|
|
"num_tokens": 2406245.0,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3546099290780142,
|
||
|
|
"eval_loss": 2.279350996017456,
|
||
|
|
"eval_mean_token_accuracy": 0.520639596846157,
|
||
|
|
"eval_num_tokens": 2406245.0,
|
||
|
|
"eval_runtime": 109.1976,
|
||
|
|
"eval_samples_per_second": 34.378,
|
||
|
|
"eval_steps_per_second": 5.733,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3664302600472813,
|
||
|
|
"grad_norm": 10.080113241918701,
|
||
|
|
"learning_rate": 4.915923980073132e-06,
|
||
|
|
"loss": 0.6972,
|
||
|
|
"mean_token_accuracy": 0.8263093769550324,
|
||
|
|
"num_tokens": 2490950.0,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37825059101654845,
|
||
|
|
"grad_norm": 11.063889721157608,
|
||
|
|
"learning_rate": 4.902140186629744e-06,
|
||
|
|
"loss": 0.7158,
|
||
|
|
"mean_token_accuracy": 0.824493623773257,
|
||
|
|
"num_tokens": 2576510.0,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3900709219858156,
|
||
|
|
"grad_norm": 10.278296874650637,
|
||
|
|
"learning_rate": 4.887332974435705e-06,
|
||
|
|
"loss": 0.6446,
|
||
|
|
"mean_token_accuracy": 0.8359018911918005,
|
||
|
|
"num_tokens": 2662979.0,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40189125295508277,
|
||
|
|
"grad_norm": 9.750471315097762,
|
||
|
|
"learning_rate": 4.871508652023164e-06,
|
||
|
|
"loss": 0.59,
|
||
|
|
"mean_token_accuracy": 0.850665803750356,
|
||
|
|
"num_tokens": 2744509.0,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41371158392434987,
|
||
|
|
"grad_norm": 10.48760989183566,
|
||
|
|
"learning_rate": 4.854673961258549e-06,
|
||
|
|
"loss": 0.5977,
|
||
|
|
"mean_token_accuracy": 0.8467812786499659,
|
||
|
|
"num_tokens": 2821398.0,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.425531914893617,
|
||
|
|
"grad_norm": 8.730182101129285,
|
||
|
|
"learning_rate": 4.836836074470223e-06,
|
||
|
|
"loss": 0.5764,
|
||
|
|
"mean_token_accuracy": 0.8605570962031682,
|
||
|
|
"num_tokens": 2906463.0,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4373522458628842,
|
||
|
|
"grad_norm": 8.019194726296847,
|
||
|
|
"learning_rate": 4.818002591392751e-06,
|
||
|
|
"loss": 0.5152,
|
||
|
|
"mean_token_accuracy": 0.8619820535182953,
|
||
|
|
"num_tokens": 2990659.0,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4491725768321513,
|
||
|
|
"grad_norm": 7.390157783777142,
|
||
|
|
"learning_rate": 4.7981815359290805e-06,
|
||
|
|
"loss": 0.5046,
|
||
|
|
"mean_token_accuracy": 0.8695660372575124,
|
||
|
|
"num_tokens": 3078550.0,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46099290780141844,
|
||
|
|
"grad_norm": 10.541421255580739,
|
||
|
|
"learning_rate": 4.777381352731997e-06,
|
||
|
|
"loss": 0.5843,
|
||
|
|
"mean_token_accuracy": 0.8512990067402522,
|
||
|
|
"num_tokens": 3159131.0,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4728132387706856,
|
||
|
|
"grad_norm": 10.078950854270674,
|
||
|
|
"learning_rate": 4.7556109036063275e-06,
|
||
|
|
"loss": 0.6007,
|
||
|
|
"mean_token_accuracy": 0.8467452943325042,
|
||
|
|
"num_tokens": 3229781.0,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4728132387706856,
|
||
|
|
"eval_loss": 2.253284215927124,
|
||
|
|
"eval_mean_token_accuracy": 0.5191043009297155,
|
||
|
|
"eval_num_tokens": 3229781.0,
|
||
|
|
"eval_runtime": 109.5026,
|
||
|
|
"eval_samples_per_second": 34.282,
|
||
|
|
"eval_steps_per_second": 5.717,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4846335697399527,
|
||
|
|
"grad_norm": 9.992622922165191,
|
||
|
|
"learning_rate": 4.732879463733416e-06,
|
||
|
|
"loss": 0.4923,
|
||
|
|
"mean_token_accuracy": 0.8700054933627447,
|
||
|
|
"num_tokens": 3313277.0,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49645390070921985,
|
||
|
|
"grad_norm": 6.744124934432935,
|
||
|
|
"learning_rate": 4.7091967177194855e-06,
|
||
|
|
"loss": 0.4855,
|
||
|
|
"mean_token_accuracy": 0.8768873423337936,
|
||
|
|
"num_tokens": 3389646.0,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.508274231678487,
|
||
|
|
"grad_norm": 11.128091166162552,
|
||
|
|
"learning_rate": 4.684572755469557e-06,
|
||
|
|
"loss": 0.4945,
|
||
|
|
"mean_token_accuracy": 0.8679842710494995,
|
||
|
|
"num_tokens": 3469439.0,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5200945626477541,
|
||
|
|
"grad_norm": 9.062221614300752,
|
||
|
|
"learning_rate": 4.6590180678887106e-06,
|
||
|
|
"loss": 0.4731,
|
||
|
|
"mean_token_accuracy": 0.8787163704633713,
|
||
|
|
"num_tokens": 3551443.0,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5319148936170213,
|
||
|
|
"grad_norm": 8.982474548381703,
|
||
|
|
"learning_rate": 4.632543542412485e-06,
|
||
|
|
"loss": 0.4604,
|
||
|
|
"mean_token_accuracy": 0.8834071298440297,
|
||
|
|
"num_tokens": 3633871.0,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5437352245862884,
|
||
|
|
"grad_norm": 9.887115562771593,
|
||
|
|
"learning_rate": 4.6051604583683466e-06,
|
||
|
|
"loss": 0.4134,
|
||
|
|
"mean_token_accuracy": 0.8951348612705866,
|
||
|
|
"num_tokens": 3719608.0,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5555555555555556,
|
||
|
|
"grad_norm": 8.821856914951912,
|
||
|
|
"learning_rate": 4.5768804821701955e-06,
|
||
|
|
"loss": 0.3942,
|
||
|
|
"mean_token_accuracy": 0.8990837872028351,
|
||
|
|
"num_tokens": 3803920.0,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5673758865248227,
|
||
|
|
"grad_norm": 7.16860636832362,
|
||
|
|
"learning_rate": 4.54771566234795e-06,
|
||
|
|
"loss": 0.4262,
|
||
|
|
"mean_token_accuracy": 0.8890527258316676,
|
||
|
|
"num_tokens": 3888982.0,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5791962174940898,
|
||
|
|
"grad_norm": 8.029202329935103,
|
||
|
|
"learning_rate": 4.51767842441434e-06,
|
||
|
|
"loss": 0.5444,
|
||
|
|
"mean_token_accuracy": 0.8689338505268097,
|
||
|
|
"num_tokens": 3968283.0,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5910165484633569,
|
||
|
|
"grad_norm": 7.713164000798632,
|
||
|
|
"learning_rate": 4.486781565571082e-06,
|
||
|
|
"loss": 0.3567,
|
||
|
|
"mean_token_accuracy": 0.8971066276232401,
|
||
|
|
"num_tokens": 4050822.0,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5910165484633569,
|
||
|
|
"eval_loss": 2.40179705619812,
|
||
|
|
"eval_mean_token_accuracy": 0.5091435096134393,
|
||
|
|
"eval_num_tokens": 4050822.0,
|
||
|
|
"eval_runtime": 108.43,
|
||
|
|
"eval_samples_per_second": 34.621,
|
||
|
|
"eval_steps_per_second": 5.773,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6028368794326241,
|
||
|
|
"grad_norm": 7.981770465410259,
|
||
|
|
"learning_rate": 4.455038249256702e-06,
|
||
|
|
"loss": 0.4175,
|
||
|
|
"mean_token_accuracy": 0.8864375064770381,
|
||
|
|
"num_tokens": 4125571.0,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6146572104018913,
|
||
|
|
"grad_norm": 5.5094271110797655,
|
||
|
|
"learning_rate": 4.42246199953832e-06,
|
||
|
|
"loss": 0.4184,
|
||
|
|
"mean_token_accuracy": 0.8935485412677129,
|
||
|
|
"num_tokens": 4205062.0,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6264775413711584,
|
||
|
|
"grad_norm": 6.038016625662183,
|
||
|
|
"learning_rate": 4.389066695349807e-06,
|
||
|
|
"loss": 0.3899,
|
||
|
|
"mean_token_accuracy": 0.8980253010988235,
|
||
|
|
"num_tokens": 4284113.0,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6382978723404256,
|
||
|
|
"grad_norm": 7.483680898980003,
|
||
|
|
"learning_rate": 4.354866564578725e-06,
|
||
|
|
"loss": 0.3331,
|
||
|
|
"mean_token_accuracy": 0.9092767784992853,
|
||
|
|
"num_tokens": 4363941.0,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6501182033096927,
|
||
|
|
"grad_norm": 7.385021253615098,
|
||
|
|
"learning_rate": 4.319876178004624e-06,
|
||
|
|
"loss": 0.3928,
|
||
|
|
"mean_token_accuracy": 0.9003854801257452,
|
||
|
|
"num_tokens": 4438097.0,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6619385342789598,
|
||
|
|
"grad_norm": 7.463896490898541,
|
||
|
|
"learning_rate": 4.284110443091236e-06,
|
||
|
|
"loss": 0.3341,
|
||
|
|
"mean_token_accuracy": 0.9117354313532512,
|
||
|
|
"num_tokens": 4527132.0,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6737588652482269,
|
||
|
|
"grad_norm": 6.952725217970187,
|
||
|
|
"learning_rate": 4.247584597635234e-06,
|
||
|
|
"loss": 0.3499,
|
||
|
|
"mean_token_accuracy": 0.903310830394427,
|
||
|
|
"num_tokens": 4608279.0,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6855791962174941,
|
||
|
|
"grad_norm": 6.943838612781438,
|
||
|
|
"learning_rate": 4.210314203274247e-06,
|
||
|
|
"loss": 0.3287,
|
||
|
|
"mean_token_accuracy": 0.9131078998247782,
|
||
|
|
"num_tokens": 4691886.0,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6973995271867612,
|
||
|
|
"grad_norm": 7.381800762619572,
|
||
|
|
"learning_rate": 4.1723151388569165e-06,
|
||
|
|
"loss": 0.3576,
|
||
|
|
"mean_token_accuracy": 0.906527488430341,
|
||
|
|
"num_tokens": 4769203.0,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7092198581560284,
|
||
|
|
"grad_norm": 9.418331863507088,
|
||
|
|
"learning_rate": 4.133603593677792e-06,
|
||
|
|
"loss": 0.3257,
|
||
|
|
"mean_token_accuracy": 0.9139421621958415,
|
||
|
|
"num_tokens": 4850864.0,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7092198581560284,
|
||
|
|
"eval_loss": 2.4337592124938965,
|
||
|
|
"eval_mean_token_accuracy": 0.5087787045743137,
|
||
|
|
"eval_num_tokens": 4850864.0,
|
||
|
|
"eval_runtime": 111.3343,
|
||
|
|
"eval_samples_per_second": 33.718,
|
||
|
|
"eval_steps_per_second": 5.623,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7210401891252955,
|
||
|
|
"grad_norm": 7.874468218736852,
|
||
|
|
"learning_rate": 4.094196060579972e-06,
|
||
|
|
"loss": 0.31,
|
||
|
|
"mean_token_accuracy": 0.9130396594603857,
|
||
|
|
"num_tokens": 4933811.0,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7328605200945626,
|
||
|
|
"grad_norm": 7.223379947683901,
|
||
|
|
"learning_rate": 4.054109328928423e-06,
|
||
|
|
"loss": 0.3371,
|
||
|
|
"mean_token_accuracy": 0.9120342075824738,
|
||
|
|
"num_tokens": 5017994.0,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7446808510638298,
|
||
|
|
"grad_norm": 5.83651158137616,
|
||
|
|
"learning_rate": 4.013360477456956e-06,
|
||
|
|
"loss": 0.2646,
|
||
|
|
"mean_token_accuracy": 0.9279499053955078,
|
||
|
|
"num_tokens": 5102351.0,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7565011820330969,
|
||
|
|
"grad_norm": 7.113298483675116,
|
||
|
|
"learning_rate": 3.971966866991926e-06,
|
||
|
|
"loss": 0.3273,
|
||
|
|
"mean_token_accuracy": 0.9165905058383942,
|
||
|
|
"num_tokens": 5184621.0,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7683215130023641,
|
||
|
|
"grad_norm": 8.025597832460408,
|
||
|
|
"learning_rate": 3.92994613305575e-06,
|
||
|
|
"loss": 0.3111,
|
||
|
|
"mean_token_accuracy": 0.9173123935858408,
|
||
|
|
"num_tokens": 5269023.0,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7801418439716312,
|
||
|
|
"grad_norm": 6.537392517022451,
|
||
|
|
"learning_rate": 3.887316178353384e-06,
|
||
|
|
"loss": 0.2939,
|
||
|
|
"mean_token_accuracy": 0.9172585904598236,
|
||
|
|
"num_tokens": 5347155.0,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7919621749408984,
|
||
|
|
"grad_norm": 4.806081208802315,
|
||
|
|
"learning_rate": 3.844095165144977e-06,
|
||
|
|
"loss": 0.2873,
|
||
|
|
"mean_token_accuracy": 0.9166339705387752,
|
||
|
|
"num_tokens": 5424281.0,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8037825059101655,
|
||
|
|
"grad_norm": 5.822318247837762,
|
||
|
|
"learning_rate": 3.800301507507935e-06,
|
||
|
|
"loss": 0.2692,
|
||
|
|
"mean_token_accuracy": 0.9217310587565104,
|
||
|
|
"num_tokens": 5509893.0,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8156028368794326,
|
||
|
|
"grad_norm": 4.6655813133266575,
|
||
|
|
"learning_rate": 3.755953863491709e-06,
|
||
|
|
"loss": 0.295,
|
||
|
|
"mean_token_accuracy": 0.9214087873697281,
|
||
|
|
"num_tokens": 5587333.0,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8274231678486997,
|
||
|
|
"grad_norm": 5.2738562954379296,
|
||
|
|
"learning_rate": 3.7110711271686276e-06,
|
||
|
|
"loss": 0.2995,
|
||
|
|
"mean_token_accuracy": 0.9183420379956563,
|
||
|
|
"num_tokens": 5667797.0,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8274231678486997,
|
||
|
|
"eval_loss": 2.484821319580078,
|
||
|
|
"eval_mean_token_accuracy": 0.5057851479838069,
|
||
|
|
"eval_num_tokens": 5667797.0,
|
||
|
|
"eval_runtime": 108.6037,
|
||
|
|
"eval_samples_per_second": 34.566,
|
||
|
|
"eval_steps_per_second": 5.764,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8392434988179669,
|
||
|
|
"grad_norm": 5.288946153592396,
|
||
|
|
"learning_rate": 3.6656724205841866e-06,
|
||
|
|
"loss": 0.3035,
|
||
|
|
"mean_token_accuracy": 0.9168096592028936,
|
||
|
|
"num_tokens": 5740150.0,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.851063829787234,
|
||
|
|
"grad_norm": 5.709113745487931,
|
||
|
|
"learning_rate": 3.619777085610201e-06,
|
||
|
|
"loss": 0.2726,
|
||
|
|
"mean_token_accuracy": 0.920536317427953,
|
||
|
|
"num_tokens": 5820836.0,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8628841607565012,
|
||
|
|
"grad_norm": 6.035834640936946,
|
||
|
|
"learning_rate": 3.57340467570431e-06,
|
||
|
|
"loss": 0.2892,
|
||
|
|
"mean_token_accuracy": 0.9215306291977564,
|
||
|
|
"num_tokens": 5899213.0,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8747044917257684,
|
||
|
|
"grad_norm": 5.74179547981077,
|
||
|
|
"learning_rate": 3.5265749475793274e-06,
|
||
|
|
"loss": 0.2759,
|
||
|
|
"mean_token_accuracy": 0.9214441031217575,
|
||
|
|
"num_tokens": 5978332.0,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8865248226950354,
|
||
|
|
"grad_norm": 4.69842226017672,
|
||
|
|
"learning_rate": 3.47930785278601e-06,
|
||
|
|
"loss": 0.25,
|
||
|
|
"mean_token_accuracy": 0.92916699051857,
|
||
|
|
"num_tokens": 6061339.0,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8983451536643026,
|
||
|
|
"grad_norm": 6.7345436238624155,
|
||
|
|
"learning_rate": 3.431623529212797e-06,
|
||
|
|
"loss": 0.2819,
|
||
|
|
"mean_token_accuracy": 0.9222736060619354,
|
||
|
|
"num_tokens": 6137077.0,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9101654846335697,
|
||
|
|
"grad_norm": 5.563878030106144,
|
||
|
|
"learning_rate": 3.3835422925061826e-06,
|
||
|
|
"loss": 0.2486,
|
||
|
|
"mean_token_accuracy": 0.9305912603934606,
|
||
|
|
"num_tokens": 6227518.0,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9219858156028369,
|
||
|
|
"grad_norm": 5.065962877403909,
|
||
|
|
"learning_rate": 3.3350846274153387e-06,
|
||
|
|
"loss": 0.2716,
|
||
|
|
"mean_token_accuracy": 0.9261675874392191,
|
||
|
|
"num_tokens": 6302707.0,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.933806146572104,
|
||
|
|
"grad_norm": 5.045177866951061,
|
||
|
|
"learning_rate": 3.286271179064701e-06,
|
||
|
|
"loss": 0.3013,
|
||
|
|
"mean_token_accuracy": 0.9195235311985016,
|
||
|
|
"num_tokens": 6380736.0,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9456264775413712,
|
||
|
|
"grad_norm": 4.252712034080436,
|
||
|
|
"learning_rate": 3.2371227441582285e-06,
|
||
|
|
"loss": 0.2931,
|
||
|
|
"mean_token_accuracy": 0.9227547705173492,
|
||
|
|
"num_tokens": 6453122.0,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9456264775413712,
|
||
|
|
"eval_loss": 2.4907124042510986,
|
||
|
|
"eval_mean_token_accuracy": 0.5083554677974683,
|
||
|
|
"eval_num_tokens": 6453122.0,
|
||
|
|
"eval_runtime": 109.7712,
|
||
|
|
"eval_samples_per_second": 34.198,
|
||
|
|
"eval_steps_per_second": 5.703,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9574468085106383,
|
||
|
|
"grad_norm": 3.815988429880776,
|
||
|
|
"learning_rate": 3.187660262119077e-06,
|
||
|
|
"loss": 0.2545,
|
||
|
|
"mean_token_accuracy": 0.926997916897138,
|
||
|
|
"num_tokens": 6533869.0,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9692671394799054,
|
||
|
|
"grad_norm": 7.321292264517903,
|
||
|
|
"learning_rate": 3.1379048061684735e-06,
|
||
|
|
"loss": 0.2727,
|
||
|
|
"mean_token_accuracy": 0.9246514956156413,
|
||
|
|
"num_tokens": 6608291.0,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9810874704491725,
|
||
|
|
"grad_norm": 5.072077367784098,
|
||
|
|
"learning_rate": 3.087877574347587e-06,
|
||
|
|
"loss": 0.2427,
|
||
|
|
"mean_token_accuracy": 0.9336031595865886,
|
||
|
|
"num_tokens": 6688125.0,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9929078014184397,
|
||
|
|
"grad_norm": 4.533402816180472,
|
||
|
|
"learning_rate": 3.0375998804862146e-06,
|
||
|
|
"loss": 0.2597,
|
||
|
|
"mean_token_accuracy": 0.9298433562119802,
|
||
|
|
"num_tokens": 6765643.0,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0047281323877069,
|
||
|
|
"grad_norm": 4.67945464989491,
|
||
|
|
"learning_rate": 2.9870931451221436e-06,
|
||
|
|
"loss": 0.2356,
|
||
|
|
"mean_token_accuracy": 0.9360798348983129,
|
||
|
|
"num_tokens": 6842035.0,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.016548463356974,
|
||
|
|
"grad_norm": 4.067854721801768,
|
||
|
|
"learning_rate": 2.9363788863750465e-06,
|
||
|
|
"loss": 0.1656,
|
||
|
|
"mean_token_accuracy": 0.9521991004546483,
|
||
|
|
"num_tokens": 6932082.0,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0283687943262412,
|
||
|
|
"grad_norm": 3.6749072345955294,
|
||
|
|
"learning_rate": 2.885478710778803e-06,
|
||
|
|
"loss": 0.1761,
|
||
|
|
"mean_token_accuracy": 0.9476487815380097,
|
||
|
|
"num_tokens": 7011600.0,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0401891252955082,
|
||
|
|
"grad_norm": 5.286620213070309,
|
||
|
|
"learning_rate": 2.834414304076155e-06,
|
||
|
|
"loss": 0.1617,
|
||
|
|
"mean_token_accuracy": 0.9531299829483032,
|
||
|
|
"num_tokens": 7097718.0,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0520094562647755,
|
||
|
|
"grad_norm": 3.99180921580787,
|
||
|
|
"learning_rate": 2.783207421979614e-06,
|
||
|
|
"loss": 0.1743,
|
||
|
|
"mean_token_accuracy": 0.9491087565819423,
|
||
|
|
"num_tokens": 7180644.0,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0638297872340425,
|
||
|
|
"grad_norm": 3.5159362565459324,
|
||
|
|
"learning_rate": 2.731879880902555e-06,
|
||
|
|
"loss": 0.1773,
|
||
|
|
"mean_token_accuracy": 0.9466255068778991,
|
||
|
|
"num_tokens": 7259970.0,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0638297872340425,
|
||
|
|
"eval_loss": 2.6374220848083496,
|
||
|
|
"eval_mean_token_accuracy": 0.5018131742938258,
|
||
|
|
"eval_num_tokens": 7259970.0,
|
||
|
|
"eval_runtime": 110.8709,
|
||
|
|
"eval_samples_per_second": 33.859,
|
||
|
|
"eval_steps_per_second": 5.646,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0756501182033098,
|
||
|
|
"grad_norm": 4.265917733350775,
|
||
|
|
"learning_rate": 2.680453548664458e-06,
|
||
|
|
"loss": 0.1733,
|
||
|
|
"mean_token_accuracy": 0.9452025413513183,
|
||
|
|
"num_tokens": 7340719.0,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0874704491725768,
|
||
|
|
"grad_norm": 4.156827622094583,
|
||
|
|
"learning_rate": 2.6289503351742365e-06,
|
||
|
|
"loss": 0.1692,
|
||
|
|
"mean_token_accuracy": 0.9517264991998673,
|
||
|
|
"num_tokens": 7422782.0,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.099290780141844,
|
||
|
|
"grad_norm": 3.6858687333016302,
|
||
|
|
"learning_rate": 2.5773921830956455e-06,
|
||
|
|
"loss": 0.1735,
|
||
|
|
"mean_token_accuracy": 0.9458188633124034,
|
||
|
|
"num_tokens": 7504521.0,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1111111111111112,
|
||
|
|
"grad_norm": 4.471806923768531,
|
||
|
|
"learning_rate": 2.525801058498725e-06,
|
||
|
|
"loss": 0.1656,
|
||
|
|
"mean_token_accuracy": 0.948051197330157,
|
||
|
|
"num_tokens": 7590826.0,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1229314420803782,
|
||
|
|
"grad_norm": 3.417137678628772,
|
||
|
|
"learning_rate": 2.474198941501276e-06,
|
||
|
|
"loss": 0.1558,
|
||
|
|
"mean_token_accuracy": 0.9528810689846675,
|
||
|
|
"num_tokens": 7678043.0,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1347517730496455,
|
||
|
|
"grad_norm": 5.039333815788341,
|
||
|
|
"learning_rate": 2.4226078169043554e-06,
|
||
|
|
"loss": 0.1805,
|
||
|
|
"mean_token_accuracy": 0.9472235560417175,
|
||
|
|
"num_tokens": 7755209.0,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1465721040189125,
|
||
|
|
"grad_norm": 3.7176041502221917,
|
||
|
|
"learning_rate": 2.3710496648257644e-06,
|
||
|
|
"loss": 0.1709,
|
||
|
|
"mean_token_accuracy": 0.9491322924693425,
|
||
|
|
"num_tokens": 7838175.0,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1583924349881798,
|
||
|
|
"grad_norm": 3.2115692600213652,
|
||
|
|
"learning_rate": 2.319546451335543e-06,
|
||
|
|
"loss": 0.1672,
|
||
|
|
"mean_token_accuracy": 0.949834555387497,
|
||
|
|
"num_tokens": 7922235.0,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1702127659574468,
|
||
|
|
"grad_norm": 4.136417148989969,
|
||
|
|
"learning_rate": 2.2681201190974454e-06,
|
||
|
|
"loss": 0.1771,
|
||
|
|
"mean_token_accuracy": 0.9477785180012385,
|
||
|
|
"num_tokens": 8001243.0,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1820330969267139,
|
||
|
|
"grad_norm": 3.8397139922722716,
|
||
|
|
"learning_rate": 2.2167925780203865e-06,
|
||
|
|
"loss": 0.1888,
|
||
|
|
"mean_token_accuracy": 0.9445441563924154,
|
||
|
|
"num_tokens": 8076462.0,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1820330969267139,
|
||
|
|
"eval_loss": 2.685175895690918,
|
||
|
|
"eval_mean_token_accuracy": 0.49984380817070556,
|
||
|
|
"eval_num_tokens": 8076462.0,
|
||
|
|
"eval_runtime": 112.2773,
|
||
|
|
"eval_samples_per_second": 33.435,
|
||
|
|
"eval_steps_per_second": 5.575,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1938534278959811,
|
||
|
|
"grad_norm": 3.315216919284447,
|
||
|
|
"learning_rate": 2.1655856959238452e-06,
|
||
|
|
"loss": 0.1539,
|
||
|
|
"mean_token_accuracy": 0.952604294816653,
|
||
|
|
"num_tokens": 8164712.0,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2056737588652482,
|
||
|
|
"grad_norm": 4.235323705607921,
|
||
|
|
"learning_rate": 2.114521289221198e-06,
|
||
|
|
"loss": 0.1608,
|
||
|
|
"mean_token_accuracy": 0.9499759902556737,
|
||
|
|
"num_tokens": 8248878.0,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2174940898345155,
|
||
|
|
"grad_norm": 4.608242768223314,
|
||
|
|
"learning_rate": 2.0636211136249543e-06,
|
||
|
|
"loss": 0.1672,
|
||
|
|
"mean_token_accuracy": 0.9494648416837056,
|
||
|
|
"num_tokens": 8326064.0,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2293144208037825,
|
||
|
|
"grad_norm": 3.502689021987934,
|
||
|
|
"learning_rate": 2.0129068548778572e-06,
|
||
|
|
"loss": 0.1537,
|
||
|
|
"mean_token_accuracy": 0.9527056773503622,
|
||
|
|
"num_tokens": 8414776.0,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2411347517730495,
|
||
|
|
"grad_norm": 4.235942485988551,
|
||
|
|
"learning_rate": 1.962400119513786e-06,
|
||
|
|
"loss": 0.174,
|
||
|
|
"mean_token_accuracy": 0.9458868026733398,
|
||
|
|
"num_tokens": 8491600.0,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2529550827423168,
|
||
|
|
"grad_norm": 3.2078232835897222,
|
||
|
|
"learning_rate": 1.9121224256524134e-06,
|
||
|
|
"loss": 0.172,
|
||
|
|
"mean_token_accuracy": 0.9478765934705734,
|
||
|
|
"num_tokens": 8570603.0,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2647754137115839,
|
||
|
|
"grad_norm": 3.0687990919983323,
|
||
|
|
"learning_rate": 1.862095193831527e-06,
|
||
|
|
"loss": 0.1601,
|
||
|
|
"mean_token_accuracy": 0.9530910869439443,
|
||
|
|
"num_tokens": 8654941.0,
|
||
|
|
"step": 535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2765957446808511,
|
||
|
|
"grad_norm": 3.5905336673591757,
|
||
|
|
"learning_rate": 1.8123397378809232e-06,
|
||
|
|
"loss": 0.16,
|
||
|
|
"mean_token_accuracy": 0.9501720656951268,
|
||
|
|
"num_tokens": 8737108.0,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2884160756501182,
|
||
|
|
"grad_norm": 3.4606821671127697,
|
||
|
|
"learning_rate": 1.7628772558417717e-06,
|
||
|
|
"loss": 0.1609,
|
||
|
|
"mean_token_accuracy": 0.9513311187426249,
|
||
|
|
"num_tokens": 8816132.0,
|
||
|
|
"step": 545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3002364066193852,
|
||
|
|
"grad_norm": 3.0342047963429017,
|
||
|
|
"learning_rate": 1.7137288209352994e-06,
|
||
|
|
"loss": 0.1568,
|
||
|
|
"mean_token_accuracy": 0.9533144136269888,
|
||
|
|
"num_tokens": 8899608.0,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3002364066193852,
|
||
|
|
"eval_loss": 2.695701837539673,
|
||
|
|
"eval_mean_token_accuracy": 0.5027767699747421,
|
||
|
|
"eval_num_tokens": 8899608.0,
|
||
|
|
"eval_runtime": 110.95,
|
||
|
|
"eval_samples_per_second": 33.835,
|
||
|
|
"eval_steps_per_second": 5.642,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3120567375886525,
|
||
|
|
"grad_norm": 4.541312231558626,
|
||
|
|
"learning_rate": 1.664915372584662e-06,
|
||
|
|
"loss": 0.1609,
|
||
|
|
"mean_token_accuracy": 0.950713715950648,
|
||
|
|
"num_tokens": 8981457.0,
|
||
|
|
"step": 555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3238770685579198,
|
||
|
|
"grad_norm": 2.7175384180465874,
|
||
|
|
"learning_rate": 1.6164577074938182e-06,
|
||
|
|
"loss": 0.174,
|
||
|
|
"mean_token_accuracy": 0.9483120679855347,
|
||
|
|
"num_tokens": 9060540.0,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3356973995271868,
|
||
|
|
"grad_norm": 3.366989184203709,
|
||
|
|
"learning_rate": 1.5683764707872037e-06,
|
||
|
|
"loss": 0.1704,
|
||
|
|
"mean_token_accuracy": 0.9461151162783304,
|
||
|
|
"num_tokens": 9141972.0,
|
||
|
|
"step": 565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3475177304964538,
|
||
|
|
"grad_norm": 4.153873238683359,
|
||
|
|
"learning_rate": 1.5206921472139907e-06,
|
||
|
|
"loss": 0.1783,
|
||
|
|
"mean_token_accuracy": 0.9454526672760646,
|
||
|
|
"num_tokens": 9215271.0,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3593380614657211,
|
||
|
|
"grad_norm": 2.855098111823155,
|
||
|
|
"learning_rate": 1.4734250524206727e-06,
|
||
|
|
"loss": 0.1756,
|
||
|
|
"mean_token_accuracy": 0.9473081688086192,
|
||
|
|
"num_tokens": 9293349.0,
|
||
|
|
"step": 575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3711583924349882,
|
||
|
|
"grad_norm": 3.486431999394093,
|
||
|
|
"learning_rate": 1.4265953242956914e-06,
|
||
|
|
"loss": 0.1669,
|
||
|
|
"mean_token_accuracy": 0.9501691997051239,
|
||
|
|
"num_tokens": 9372365.0,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3829787234042552,
|
||
|
|
"grad_norm": 3.118171717428614,
|
||
|
|
"learning_rate": 1.3802229143897993e-06,
|
||
|
|
"loss": 0.1705,
|
||
|
|
"mean_token_accuracy": 0.9490540792544683,
|
||
|
|
"num_tokens": 9450218.0,
|
||
|
|
"step": 585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3947990543735225,
|
||
|
|
"grad_norm": 4.284146957939775,
|
||
|
|
"learning_rate": 1.3343275794158138e-06,
|
||
|
|
"loss": 0.1656,
|
||
|
|
"mean_token_accuracy": 0.9492589155832927,
|
||
|
|
"num_tokens": 9532191.0,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4066193853427895,
|
||
|
|
"grad_norm": 4.987440979362939,
|
||
|
|
"learning_rate": 1.2889288728313732e-06,
|
||
|
|
"loss": 0.1715,
|
||
|
|
"mean_token_accuracy": 0.9477653960386913,
|
||
|
|
"num_tokens": 9607710.0,
|
||
|
|
"step": 595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4184397163120568,
|
||
|
|
"grad_norm": 3.4008167160659326,
|
||
|
|
"learning_rate": 1.2440461365082917e-06,
|
||
|
|
"loss": 0.1541,
|
||
|
|
"mean_token_accuracy": 0.9530154536167781,
|
||
|
|
"num_tokens": 9691506.0,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4184397163120568,
|
||
|
|
"eval_loss": 2.6637179851531982,
|
||
|
|
"eval_mean_token_accuracy": 0.5024550324811722,
|
||
|
|
"eval_num_tokens": 9691506.0,
|
||
|
|
"eval_runtime": 110.6668,
|
||
|
|
"eval_samples_per_second": 33.922,
|
||
|
|
"eval_steps_per_second": 5.657,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4302600472813238,
|
||
|
|
"grad_norm": 3.4244372133508523,
|
||
|
|
"learning_rate": 1.1996984924920651e-06,
|
||
|
|
"loss": 0.1739,
|
||
|
|
"mean_token_accuracy": 0.9478476305802663,
|
||
|
|
"num_tokens": 9766964.0,
|
||
|
|
"step": 605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.442080378250591,
|
||
|
|
"grad_norm": 3.8359155778997174,
|
||
|
|
"learning_rate": 1.1559048348550245e-06,
|
||
|
|
"loss": 0.1687,
|
||
|
|
"mean_token_accuracy": 0.9482156693935394,
|
||
|
|
"num_tokens": 9846839.0,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4539007092198581,
|
||
|
|
"grad_norm": 2.6947789470311263,
|
||
|
|
"learning_rate": 1.1126838216466171e-06,
|
||
|
|
"loss": 0.1585,
|
||
|
|
"mean_token_accuracy": 0.9508161584536234,
|
||
|
|
"num_tokens": 9928630.0,
|
||
|
|
"step": 615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4657210401891252,
|
||
|
|
"grad_norm": 2.4566267363432215,
|
||
|
|
"learning_rate": 1.0700538669442512e-06,
|
||
|
|
"loss": 0.162,
|
||
|
|
"mean_token_accuracy": 0.9484717577695847,
|
||
|
|
"num_tokens": 10007569.0,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4775413711583925,
|
||
|
|
"grad_norm": 2.98051235327796,
|
||
|
|
"learning_rate": 1.0280331330080756e-06,
|
||
|
|
"loss": 0.1731,
|
||
|
|
"mean_token_accuracy": 0.9485133985678355,
|
||
|
|
"num_tokens": 10082100.0,
|
||
|
|
"step": 625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4893617021276595,
|
||
|
|
"grad_norm": 2.5966978913461576,
|
||
|
|
"learning_rate": 9.866395225430455e-07,
|
||
|
|
"loss": 0.1699,
|
||
|
|
"mean_token_accuracy": 0.9498284469048183,
|
||
|
|
"num_tokens": 10160498.0,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5011820330969265,
|
||
|
|
"grad_norm": 3.0272596986145848,
|
||
|
|
"learning_rate": 9.458906710715776e-07,
|
||
|
|
"loss": 0.1442,
|
||
|
|
"mean_token_accuracy": 0.9565223922332128,
|
||
|
|
"num_tokens": 10247185.0,
|
||
|
|
"step": 635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5130023640661938,
|
||
|
|
"grad_norm": 2.902942105541407,
|
||
|
|
"learning_rate": 9.058039394200283e-07,
|
||
|
|
"loss": 0.1586,
|
||
|
|
"mean_token_accuracy": 0.9523343364397685,
|
||
|
|
"num_tokens": 10329810.0,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.524822695035461,
|
||
|
|
"grad_norm": 3.8091691785911803,
|
||
|
|
"learning_rate": 8.663964063222094e-07,
|
||
|
|
"loss": 0.1712,
|
||
|
|
"mean_token_accuracy": 0.9478918602069218,
|
||
|
|
"num_tokens": 10406342.0,
|
||
|
|
"step": 645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5366430260047281,
|
||
|
|
"grad_norm": 2.84704352427359,
|
||
|
|
"learning_rate": 8.27684861143084e-07,
|
||
|
|
"loss": 0.1591,
|
||
|
|
"mean_token_accuracy": 0.9527056852976481,
|
||
|
|
"num_tokens": 10487338.0,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5366430260047281,
|
||
|
|
"eval_loss": 2.6855952739715576,
|
||
|
|
"eval_mean_token_accuracy": 0.5021388608331497,
|
||
|
|
"eval_num_tokens": 10487338.0,
|
||
|
|
"eval_runtime": 108.7473,
|
||
|
|
"eval_samples_per_second": 34.52,
|
||
|
|
"eval_steps_per_second": 5.756,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5484633569739952,
|
||
|
|
"grad_norm": 3.395443058239169,
|
||
|
|
"learning_rate": 7.896857967257532e-07,
|
||
|
|
"loss": 0.1608,
|
||
|
|
"mean_token_accuracy": 0.9517467439174652,
|
||
|
|
"num_tokens": 10566656.0,
|
||
|
|
"step": 655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5602836879432624,
|
||
|
|
"grad_norm": 2.6081166041756334,
|
||
|
|
"learning_rate": 7.524154023647678e-07,
|
||
|
|
"loss": 0.1621,
|
||
|
|
"mean_token_accuracy": 0.9508071412642797,
|
||
|
|
"num_tokens": 10642301.0,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5721040189125297,
|
||
|
|
"grad_norm": 2.3224792159461134,
|
||
|
|
"learning_rate": 7.158895569087651e-07,
|
||
|
|
"loss": 0.1558,
|
||
|
|
"mean_token_accuracy": 0.9522350211938222,
|
||
|
|
"num_tokens": 10724129.0,
|
||
|
|
"step": 665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5839243498817965,
|
||
|
|
"grad_norm": 2.9407695481451954,
|
||
|
|
"learning_rate": 6.801238219953774e-07,
|
||
|
|
"loss": 0.1573,
|
||
|
|
"mean_token_accuracy": 0.9532469352086385,
|
||
|
|
"num_tokens": 10803308.0,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5957446808510638,
|
||
|
|
"grad_norm": 2.461257725656036,
|
||
|
|
"learning_rate": 6.451334354212765e-07,
|
||
|
|
"loss": 0.1581,
|
||
|
|
"mean_token_accuracy": 0.9496888081232707,
|
||
|
|
"num_tokens": 10882366.0,
|
||
|
|
"step": 675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.607565011820331,
|
||
|
|
"grad_norm": 2.9648253517967116,
|
||
|
|
"learning_rate": 6.109333046501942e-07,
|
||
|
|
"loss": 0.1556,
|
||
|
|
"mean_token_accuracy": 0.9546032408873241,
|
||
|
|
"num_tokens": 10962923.0,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6193853427895981,
|
||
|
|
"grad_norm": 3.8083000466171475,
|
||
|
|
"learning_rate": 5.775380004616804e-07,
|
||
|
|
"loss": 0.1683,
|
||
|
|
"mean_token_accuracy": 0.9476418197154999,
|
||
|
|
"num_tokens": 11038680.0,
|
||
|
|
"step": 685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6312056737588652,
|
||
|
|
"grad_norm": 2.3462649916503864,
|
||
|
|
"learning_rate": 5.449617507433002e-07,
|
||
|
|
"loss": 0.1526,
|
||
|
|
"mean_token_accuracy": 0.9539241482814153,
|
||
|
|
"num_tokens": 11120210.0,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6430260047281324,
|
||
|
|
"grad_norm": 3.2163417105630074,
|
||
|
|
"learning_rate": 5.132184344289187e-07,
|
||
|
|
"loss": 0.1601,
|
||
|
|
"mean_token_accuracy": 0.9499970803658168,
|
||
|
|
"num_tokens": 11198180.0,
|
||
|
|
"step": 695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6548463356973995,
|
||
|
|
"grad_norm": 2.785105592615648,
|
||
|
|
"learning_rate": 4.823215755856603e-07,
|
||
|
|
"loss": 0.1695,
|
||
|
|
"mean_token_accuracy": 0.948690946896871,
|
||
|
|
"num_tokens": 11272183.0,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6548463356973995,
|
||
|
|
"eval_loss": 2.7217941284179688,
|
||
|
|
"eval_mean_token_accuracy": 0.5019449446909725,
|
||
|
|
"eval_num_tokens": 11272183.0,
|
||
|
|
"eval_runtime": 95.876,
|
||
|
|
"eval_samples_per_second": 39.155,
|
||
|
|
"eval_steps_per_second": 6.529,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6666666666666665,
|
||
|
|
"grad_norm": 2.7877400675525514,
|
||
|
|
"learning_rate": 4.522843376520508e-07,
|
||
|
|
"loss": 0.1535,
|
||
|
|
"mean_token_accuracy": 0.9523400167624155,
|
||
|
|
"num_tokens": 11354735.0,
|
||
|
|
"step": 705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6784869976359338,
|
||
|
|
"grad_norm": 3.4767887349168283,
|
||
|
|
"learning_rate": 4.2311951782980587e-07,
|
||
|
|
"loss": 0.1579,
|
||
|
|
"mean_token_accuracy": 0.9517521331707637,
|
||
|
|
"num_tokens": 11432857.0,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.690307328605201,
|
||
|
|
"grad_norm": 3.638265461712605,
|
||
|
|
"learning_rate": 3.9483954163165363e-07,
|
||
|
|
"loss": 0.1683,
|
||
|
|
"mean_token_accuracy": 0.9472279886404673,
|
||
|
|
"num_tokens": 11506345.0,
|
||
|
|
"step": 715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.702127659574468,
|
||
|
|
"grad_norm": 3.329843084823806,
|
||
|
|
"learning_rate": 3.674564575875156e-07,
|
||
|
|
"loss": 0.1625,
|
||
|
|
"mean_token_accuracy": 0.9498989204565684,
|
||
|
|
"num_tokens": 11584102.0,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7139479905437351,
|
||
|
|
"grad_norm": 2.693105492226511,
|
||
|
|
"learning_rate": 3.4098193211128975e-07,
|
||
|
|
"loss": 0.1669,
|
||
|
|
"mean_token_accuracy": 0.94957415163517,
|
||
|
|
"num_tokens": 11662420.0,
|
||
|
|
"step": 725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7257683215130024,
|
||
|
|
"grad_norm": 2.347451562983623,
|
||
|
|
"learning_rate": 3.1542724453044323e-07,
|
||
|
|
"loss": 0.1547,
|
||
|
|
"mean_token_accuracy": 0.9542993714412054,
|
||
|
|
"num_tokens": 11745725.0,
|
||
|
|
"step": 730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7375886524822695,
|
||
|
|
"grad_norm": 3.428528200094209,
|
||
|
|
"learning_rate": 2.908032822805157e-07,
|
||
|
|
"loss": 0.1629,
|
||
|
|
"mean_token_accuracy": 0.949917741616567,
|
||
|
|
"num_tokens": 11825500.0,
|
||
|
|
"step": 735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7494089834515365,
|
||
|
|
"grad_norm": 2.5067600170197095,
|
||
|
|
"learning_rate": 2.671205362665841e-07,
|
||
|
|
"loss": 0.1581,
|
||
|
|
"mean_token_accuracy": 0.9531556775172552,
|
||
|
|
"num_tokens": 11906463.0,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7612293144208038,
|
||
|
|
"grad_norm": 2.5295945128152417,
|
||
|
|
"learning_rate": 2.4438909639367294e-07,
|
||
|
|
"loss": 0.1405,
|
||
|
|
"mean_token_accuracy": 0.9579586456219356,
|
||
|
|
"num_tokens": 11994373.0,
|
||
|
|
"step": 745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.773049645390071,
|
||
|
|
"grad_norm": 2.4227200543725185,
|
||
|
|
"learning_rate": 2.2261864726800364e-07,
|
||
|
|
"loss": 0.1454,
|
||
|
|
"mean_token_accuracy": 0.9546575715144475,
|
||
|
|
"num_tokens": 12078968.0,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.773049645390071,
|
||
|
|
"eval_loss": 2.739609479904175,
|
||
|
|
"eval_mean_token_accuracy": 0.5016750158212436,
|
||
|
|
"eval_num_tokens": 12078968.0,
|
||
|
|
"eval_runtime": 96.4627,
|
||
|
|
"eval_samples_per_second": 38.917,
|
||
|
|
"eval_steps_per_second": 6.49,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.784869976359338,
|
||
|
|
"grad_norm": 2.734440331002493,
|
||
|
|
"learning_rate": 2.0181846407092003e-07,
|
||
|
|
"loss": 0.1562,
|
||
|
|
"mean_token_accuracy": 0.953593663374583,
|
||
|
|
"num_tokens": 12157200.0,
|
||
|
|
"step": 755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7966903073286051,
|
||
|
|
"grad_norm": 3.0212485027888847,
|
||
|
|
"learning_rate": 1.8199740860724928e-07,
|
||
|
|
"loss": 0.1491,
|
||
|
|
"mean_token_accuracy": 0.9538020372390748,
|
||
|
|
"num_tokens": 12239141.0,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8085106382978724,
|
||
|
|
"grad_norm": 3.7603456536650057,
|
||
|
|
"learning_rate": 1.6316392552977732e-07,
|
||
|
|
"loss": 0.1677,
|
||
|
|
"mean_token_accuracy": 0.9494908769925435,
|
||
|
|
"num_tokens": 12319055.0,
|
||
|
|
"step": 765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8203309692671394,
|
||
|
|
"grad_norm": 3.856800815054017,
|
||
|
|
"learning_rate": 1.4532603874145068e-07,
|
||
|
|
"loss": 0.1563,
|
||
|
|
"mean_token_accuracy": 0.9524733434120815,
|
||
|
|
"num_tokens": 12399598.0,
|
||
|
|
"step": 770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8321513002364065,
|
||
|
|
"grad_norm": 2.7252151319976603,
|
||
|
|
"learning_rate": 1.2849134797683627e-07,
|
||
|
|
"loss": 0.1515,
|
||
|
|
"mean_token_accuracy": 0.9528288086255391,
|
||
|
|
"num_tokens": 12480923.0,
|
||
|
|
"step": 775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8439716312056738,
|
||
|
|
"grad_norm": 3.203424503912004,
|
||
|
|
"learning_rate": 1.1266702556429615e-07,
|
||
|
|
"loss": 0.1745,
|
||
|
|
"mean_token_accuracy": 0.9487062722444535,
|
||
|
|
"num_tokens": 12555434.0,
|
||
|
|
"step": 780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.855791962174941,
|
||
|
|
"grad_norm": 6.562777724476359,
|
||
|
|
"learning_rate": 9.785981337025602e-08,
|
||
|
|
"loss": 0.1518,
|
||
|
|
"mean_token_accuracy": 0.9551320135593414,
|
||
|
|
"num_tokens": 12635397.0,
|
||
|
|
"step": 785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.867612293144208,
|
||
|
|
"grad_norm": 2.524203238556445,
|
||
|
|
"learning_rate": 8.407601992686864e-08,
|
||
|
|
"loss": 0.1587,
|
||
|
|
"mean_token_accuracy": 0.9509722570578257,
|
||
|
|
"num_tokens": 12712118.0,
|
||
|
|
"step": 790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8794326241134751,
|
||
|
|
"grad_norm": 2.700254226198366,
|
||
|
|
"learning_rate": 7.132151774429996e-08,
|
||
|
|
"loss": 0.1488,
|
||
|
|
"mean_token_accuracy": 0.9541502008835475,
|
||
|
|
"num_tokens": 12794157.0,
|
||
|
|
"step": 795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8912529550827424,
|
||
|
|
"grad_norm": 2.528281516063428,
|
||
|
|
"learning_rate": 5.9601740808777065e-08,
|
||
|
|
"loss": 0.152,
|
||
|
|
"mean_token_accuracy": 0.956131245692571,
|
||
|
|
"num_tokens": 12876278.0,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8912529550827424,
|
||
|
|
"eval_loss": 2.7611730098724365,
|
||
|
|
"eval_mean_token_accuracy": 0.501501605247918,
|
||
|
|
"eval_num_tokens": 12876278.0,
|
||
|
|
"eval_runtime": 94.7046,
|
||
|
|
"eval_samples_per_second": 39.639,
|
||
|
|
"eval_steps_per_second": 6.61,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9030732860520094,
|
||
|
|
"grad_norm": 2.8738724345889053,
|
||
|
|
"learning_rate": 4.8921682267467075e-08,
|
||
|
|
"loss": 0.1525,
|
||
|
|
"mean_token_accuracy": 0.9533034404118855,
|
||
|
|
"num_tokens": 12958123.0,
|
||
|
|
"step": 805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9148936170212765,
|
||
|
|
"grad_norm": 2.5890563371947253,
|
||
|
|
"learning_rate": 3.9285892301175744e-08,
|
||
|
|
"loss": 0.1665,
|
||
|
|
"mean_token_accuracy": 0.9502290070056916,
|
||
|
|
"num_tokens": 13033488.0,
|
||
|
|
"step": 810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9267139479905437,
|
||
|
|
"grad_norm": 2.8437599872465507,
|
||
|
|
"learning_rate": 3.069847618576649e-08,
|
||
|
|
"loss": 0.143,
|
||
|
|
"mean_token_accuracy": 0.9558833320935567,
|
||
|
|
"num_tokens": 13118515.0,
|
||
|
|
"step": 815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.938534278959811,
|
||
|
|
"grad_norm": 3.2653301157414907,
|
||
|
|
"learning_rate": 2.3163092543130317e-08,
|
||
|
|
"loss": 0.1565,
|
||
|
|
"mean_token_accuracy": 0.9513460914293925,
|
||
|
|
"num_tokens": 13200181.0,
|
||
|
|
"step": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.950354609929078,
|
||
|
|
"grad_norm": 2.2911215598352026,
|
||
|
|
"learning_rate": 1.6682951782449887e-08,
|
||
|
|
"loss": 0.14,
|
||
|
|
"mean_token_accuracy": 0.9551817645629247,
|
||
|
|
"num_tokens": 13287198.0,
|
||
|
|
"step": 825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.962174940898345,
|
||
|
|
"grad_norm": 2.528025657021651,
|
||
|
|
"learning_rate": 1.1260814732422242e-08,
|
||
|
|
"loss": 0.1577,
|
||
|
|
"mean_token_accuracy": 0.9525305430094401,
|
||
|
|
"num_tokens": 13365313.0,
|
||
|
|
"step": 830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9739952718676124,
|
||
|
|
"grad_norm": 2.737000504326715,
|
||
|
|
"learning_rate": 6.898991465022487e-09,
|
||
|
|
"loss": 0.1457,
|
||
|
|
"mean_token_accuracy": 0.9558167507251104,
|
||
|
|
"num_tokens": 13448375.0,
|
||
|
|
"step": 835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9858156028368794,
|
||
|
|
"grad_norm": 2.851536894418566,
|
||
|
|
"learning_rate": 3.5993403113107616e-09,
|
||
|
|
"loss": 0.1493,
|
||
|
|
"mean_token_accuracy": 0.9524207770824432,
|
||
|
|
"num_tokens": 13529509.0,
|
||
|
|
"step": 840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9976359338061465,
|
||
|
|
"grad_norm": 2.5596855272672463,
|
||
|
|
"learning_rate": 1.3632670696991922e-09,
|
||
|
|
"loss": 0.1561,
|
||
|
|
"mean_token_accuracy": 0.9530758639176686,
|
||
|
|
"num_tokens": 13608446.0,
|
||
|
|
"step": 845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0,
|
||
|
|
"mean_token_accuracy": 0.958593467871348,
|
||
|
|
"num_tokens": 13625289.0,
|
||
|
|
"step": 846,
|
||
|
|
"total_flos": 38833750351872.0,
|
||
|
|
"train_loss": 0.4931214354914695,
|
||
|
|
"train_runtime": 8229.9073,
|
||
|
|
"train_samples_per_second": 7.4,
|
||
|
|
"train_steps_per_second": 0.103
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 5,
|
||
|
|
"max_steps": 846,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 2,
|
||
|
|
"save_steps": 200,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 38833750351872.0,
|
||
|
|
"train_batch_size": 3,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|