5199 lines
146 KiB
JSON
5199 lines
146 KiB
JSON
{
|
|
"best_global_step": 512,
|
|
"best_metric": 1.4060174226760864,
|
|
"best_model_checkpoint": "Dually/checkpoint-512",
|
|
"epoch": 7.426078971533517,
|
|
"eval_steps": 128,
|
|
"global_step": 512,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 1.2646127492189407,
|
|
"epoch": 0.014692378328741965,
|
|
"grad_norm": 9.75,
|
|
"learning_rate": 0.0,
|
|
"loss": 1.744248867034912,
|
|
"mean_token_accuracy": 0.6102629192173481,
|
|
"num_tokens": 7917.0,
|
|
"step": 1
|
|
},
|
|
{
|
|
"entropy": 1.378300040960312,
|
|
"epoch": 0.02938475665748393,
|
|
"grad_norm": 10.9375,
|
|
"learning_rate": 1.5625e-07,
|
|
"loss": 1.9481500387191772,
|
|
"mean_token_accuracy": 0.5874052550643682,
|
|
"num_tokens": 14736.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"entropy": 1.3521380089223385,
|
|
"epoch": 0.0440771349862259,
|
|
"grad_norm": 11.625,
|
|
"learning_rate": 3.125e-07,
|
|
"loss": 1.9126718044281006,
|
|
"mean_token_accuracy": 0.6104181408882141,
|
|
"num_tokens": 20834.0,
|
|
"step": 3
|
|
},
|
|
{
|
|
"entropy": 1.0458027385175228,
|
|
"epoch": 0.05876951331496786,
|
|
"grad_norm": 13.75,
|
|
"learning_rate": 4.6875000000000006e-07,
|
|
"loss": 1.5388509035110474,
|
|
"mean_token_accuracy": 0.6652283705770969,
|
|
"num_tokens": 26971.0,
|
|
"step": 4
|
|
},
|
|
{
|
|
"entropy": 1.4032921269536018,
|
|
"epoch": 0.07346189164370982,
|
|
"grad_norm": 11.6875,
|
|
"learning_rate": 6.25e-07,
|
|
"loss": 2.099245071411133,
|
|
"mean_token_accuracy": 0.5665754359215498,
|
|
"num_tokens": 34700.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 1.3070856295526028,
|
|
"epoch": 0.0881542699724518,
|
|
"grad_norm": 12.625,
|
|
"learning_rate": 7.8125e-07,
|
|
"loss": 1.9474432468414307,
|
|
"mean_token_accuracy": 0.5986234582960606,
|
|
"num_tokens": 41284.0,
|
|
"step": 6
|
|
},
|
|
{
|
|
"entropy": 1.246600879356265,
|
|
"epoch": 0.10284664830119375,
|
|
"grad_norm": 11.5625,
|
|
"learning_rate": 9.375000000000001e-07,
|
|
"loss": 1.7984871864318848,
|
|
"mean_token_accuracy": 0.6245174538344145,
|
|
"num_tokens": 47262.0,
|
|
"step": 7
|
|
},
|
|
{
|
|
"entropy": 1.1514122374355793,
|
|
"epoch": 0.11753902662993572,
|
|
"grad_norm": 10.8125,
|
|
"learning_rate": 1.0937500000000001e-06,
|
|
"loss": 1.658499002456665,
|
|
"mean_token_accuracy": 0.6269348040223122,
|
|
"num_tokens": 54501.0,
|
|
"step": 8
|
|
},
|
|
{
|
|
"entropy": 1.2789062187075615,
|
|
"epoch": 0.1322314049586777,
|
|
"grad_norm": 12.9375,
|
|
"learning_rate": 1.25e-06,
|
|
"loss": 1.8666932582855225,
|
|
"mean_token_accuracy": 0.6204132493585348,
|
|
"num_tokens": 59298.0,
|
|
"step": 9
|
|
},
|
|
{
|
|
"entropy": 1.4759281110018492,
|
|
"epoch": 0.14692378328741965,
|
|
"grad_norm": 11.25,
|
|
"learning_rate": 1.40625e-06,
|
|
"loss": 2.1906216144561768,
|
|
"mean_token_accuracy": 0.5752640906721354,
|
|
"num_tokens": 65096.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 1.2384399138391018,
|
|
"epoch": 0.16161616161616163,
|
|
"grad_norm": 11.625,
|
|
"learning_rate": 1.5625e-06,
|
|
"loss": 1.7301772832870483,
|
|
"mean_token_accuracy": 0.6402938701212406,
|
|
"num_tokens": 70811.0,
|
|
"step": 11
|
|
},
|
|
{
|
|
"entropy": 1.2877945825457573,
|
|
"epoch": 0.1763085399449036,
|
|
"grad_norm": 9.9375,
|
|
"learning_rate": 1.71875e-06,
|
|
"loss": 1.7703232765197754,
|
|
"mean_token_accuracy": 0.6134329959750175,
|
|
"num_tokens": 78037.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"entropy": 1.1743841245770454,
|
|
"epoch": 0.19100091827364554,
|
|
"grad_norm": 12.6875,
|
|
"learning_rate": 1.8750000000000003e-06,
|
|
"loss": 1.7103441953659058,
|
|
"mean_token_accuracy": 0.6294799540191889,
|
|
"num_tokens": 84881.0,
|
|
"step": 13
|
|
},
|
|
{
|
|
"entropy": 1.1487970873713493,
|
|
"epoch": 0.2056932966023875,
|
|
"grad_norm": 11.625,
|
|
"learning_rate": 2.0312500000000002e-06,
|
|
"loss": 1.724797010421753,
|
|
"mean_token_accuracy": 0.637124864384532,
|
|
"num_tokens": 91378.0,
|
|
"step": 14
|
|
},
|
|
{
|
|
"entropy": 1.2112453859299421,
|
|
"epoch": 0.22038567493112948,
|
|
"grad_norm": 10.6875,
|
|
"learning_rate": 2.1875000000000002e-06,
|
|
"loss": 1.6525938510894775,
|
|
"mean_token_accuracy": 0.6341305579990149,
|
|
"num_tokens": 97461.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 1.2508288510143757,
|
|
"epoch": 0.23507805325987144,
|
|
"grad_norm": 10.0625,
|
|
"learning_rate": 2.3437500000000002e-06,
|
|
"loss": 1.7525631189346313,
|
|
"mean_token_accuracy": 0.6133801508694887,
|
|
"num_tokens": 103802.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"entropy": 1.1752750612795353,
|
|
"epoch": 0.2497704315886134,
|
|
"grad_norm": 10.8125,
|
|
"learning_rate": 2.5e-06,
|
|
"loss": 1.7630412578582764,
|
|
"mean_token_accuracy": 0.6321723479777575,
|
|
"num_tokens": 111377.0,
|
|
"step": 17
|
|
},
|
|
{
|
|
"entropy": 1.3553732447326183,
|
|
"epoch": 0.2644628099173554,
|
|
"grad_norm": 11.4375,
|
|
"learning_rate": 2.65625e-06,
|
|
"loss": 1.9398893117904663,
|
|
"mean_token_accuracy": 0.5853390581905842,
|
|
"num_tokens": 118146.0,
|
|
"step": 18
|
|
},
|
|
{
|
|
"entropy": 1.1929924674332142,
|
|
"epoch": 0.27915518824609736,
|
|
"grad_norm": 8.6875,
|
|
"learning_rate": 2.8125e-06,
|
|
"loss": 1.6914823055267334,
|
|
"mean_token_accuracy": 0.6384953130036592,
|
|
"num_tokens": 126437.0,
|
|
"step": 19
|
|
},
|
|
{
|
|
"entropy": 1.2564715202897787,
|
|
"epoch": 0.2938475665748393,
|
|
"grad_norm": 9.125,
|
|
"learning_rate": 2.96875e-06,
|
|
"loss": 1.7073726654052734,
|
|
"mean_token_accuracy": 0.6299608834087849,
|
|
"num_tokens": 133562.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 1.1569741740822792,
|
|
"epoch": 0.3085399449035813,
|
|
"grad_norm": 9.0,
|
|
"learning_rate": 3.125e-06,
|
|
"loss": 1.7045152187347412,
|
|
"mean_token_accuracy": 0.6431939471513033,
|
|
"num_tokens": 141389.0,
|
|
"step": 21
|
|
},
|
|
{
|
|
"entropy": 1.2686172276735306,
|
|
"epoch": 0.32323232323232326,
|
|
"grad_norm": 11.4375,
|
|
"learning_rate": 3.28125e-06,
|
|
"loss": 1.9201271533966064,
|
|
"mean_token_accuracy": 0.6149793621152639,
|
|
"num_tokens": 149659.0,
|
|
"step": 22
|
|
},
|
|
{
|
|
"entropy": 1.3068278022110462,
|
|
"epoch": 0.3379247015610652,
|
|
"grad_norm": 9.75,
|
|
"learning_rate": 3.4375e-06,
|
|
"loss": 1.757298231124878,
|
|
"mean_token_accuracy": 0.6182622388005257,
|
|
"num_tokens": 156962.0,
|
|
"step": 23
|
|
},
|
|
{
|
|
"entropy": 1.1734640449285507,
|
|
"epoch": 0.3526170798898072,
|
|
"grad_norm": 9.625,
|
|
"learning_rate": 3.59375e-06,
|
|
"loss": 1.7031861543655396,
|
|
"mean_token_accuracy": 0.6426031272858381,
|
|
"num_tokens": 164117.0,
|
|
"step": 24
|
|
},
|
|
{
|
|
"entropy": 1.161806859076023,
|
|
"epoch": 0.3673094582185491,
|
|
"grad_norm": 8.25,
|
|
"learning_rate": 3.7500000000000005e-06,
|
|
"loss": 1.7877576351165771,
|
|
"mean_token_accuracy": 0.6392325963824987,
|
|
"num_tokens": 171972.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 1.1784982681274414,
|
|
"epoch": 0.3820018365472911,
|
|
"grad_norm": 8.9375,
|
|
"learning_rate": 3.90625e-06,
|
|
"loss": 1.5683488845825195,
|
|
"mean_token_accuracy": 0.6583592146635056,
|
|
"num_tokens": 178684.0,
|
|
"step": 26
|
|
},
|
|
{
|
|
"entropy": 1.2632846124470234,
|
|
"epoch": 0.39669421487603307,
|
|
"grad_norm": 8.8125,
|
|
"learning_rate": 4.0625000000000005e-06,
|
|
"loss": 1.7520173788070679,
|
|
"mean_token_accuracy": 0.633663909509778,
|
|
"num_tokens": 186596.0,
|
|
"step": 27
|
|
},
|
|
{
|
|
"entropy": 1.3619473539292812,
|
|
"epoch": 0.411386593204775,
|
|
"grad_norm": 8.8125,
|
|
"learning_rate": 4.21875e-06,
|
|
"loss": 1.6584488153457642,
|
|
"mean_token_accuracy": 0.6089041493833065,
|
|
"num_tokens": 193256.0,
|
|
"step": 28
|
|
},
|
|
{
|
|
"entropy": 1.5768938176333904,
|
|
"epoch": 0.426078971533517,
|
|
"grad_norm": 10.0,
|
|
"learning_rate": 4.3750000000000005e-06,
|
|
"loss": 2.0062549114227295,
|
|
"mean_token_accuracy": 0.5824067778885365,
|
|
"num_tokens": 199459.0,
|
|
"step": 29
|
|
},
|
|
{
|
|
"entropy": 1.1113787479698658,
|
|
"epoch": 0.44077134986225897,
|
|
"grad_norm": 7.46875,
|
|
"learning_rate": 4.53125e-06,
|
|
"loss": 1.6244635581970215,
|
|
"mean_token_accuracy": 0.6660626344382763,
|
|
"num_tokens": 208347.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 1.3061846159398556,
|
|
"epoch": 0.4554637281910009,
|
|
"grad_norm": 8.75,
|
|
"learning_rate": 4.6875000000000004e-06,
|
|
"loss": 1.8602163791656494,
|
|
"mean_token_accuracy": 0.6074783802032471,
|
|
"num_tokens": 215643.0,
|
|
"step": 31
|
|
},
|
|
{
|
|
"entropy": 1.3427062667906284,
|
|
"epoch": 0.4701561065197429,
|
|
"grad_norm": 9.25,
|
|
"learning_rate": 4.84375e-06,
|
|
"loss": 1.8641481399536133,
|
|
"mean_token_accuracy": 0.6179640628397465,
|
|
"num_tokens": 221856.0,
|
|
"step": 32
|
|
},
|
|
{
|
|
"entropy": 1.5843740738928318,
|
|
"epoch": 0.48484848484848486,
|
|
"grad_norm": 9.3125,
|
|
"learning_rate": 5e-06,
|
|
"loss": 1.9042216539382935,
|
|
"mean_token_accuracy": 0.5840670578181744,
|
|
"num_tokens": 228502.0,
|
|
"step": 33
|
|
},
|
|
{
|
|
"entropy": 1.2740740850567818,
|
|
"epoch": 0.4995408631772268,
|
|
"grad_norm": 8.6875,
|
|
"learning_rate": 4.999946454160323e-06,
|
|
"loss": 1.657892107963562,
|
|
"mean_token_accuracy": 0.650751706212759,
|
|
"num_tokens": 235856.0,
|
|
"step": 34
|
|
},
|
|
{
|
|
"entropy": 1.2723774798214436,
|
|
"epoch": 0.5142332415059688,
|
|
"grad_norm": 6.6875,
|
|
"learning_rate": 4.999785818935018e-06,
|
|
"loss": 1.7274625301361084,
|
|
"mean_token_accuracy": 0.6495076902210712,
|
|
"num_tokens": 243386.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 1.2608134560286999,
|
|
"epoch": 0.5289256198347108,
|
|
"grad_norm": 6.96875,
|
|
"learning_rate": 4.999518101205162e-06,
|
|
"loss": 1.5035489797592163,
|
|
"mean_token_accuracy": 0.6534168235957623,
|
|
"num_tokens": 250842.0,
|
|
"step": 36
|
|
},
|
|
{
|
|
"entropy": 1.3372750878334045,
|
|
"epoch": 0.5436179981634527,
|
|
"grad_norm": 7.1875,
|
|
"learning_rate": 4.999143312438893e-06,
|
|
"loss": 1.5423352718353271,
|
|
"mean_token_accuracy": 0.635912710800767,
|
|
"num_tokens": 257542.0,
|
|
"step": 37
|
|
},
|
|
{
|
|
"entropy": 1.3917848393321037,
|
|
"epoch": 0.5583103764921947,
|
|
"grad_norm": 6.53125,
|
|
"learning_rate": 4.998661468690914e-06,
|
|
"loss": 1.7300214767456055,
|
|
"mean_token_accuracy": 0.6232790667563677,
|
|
"num_tokens": 265642.0,
|
|
"step": 38
|
|
},
|
|
{
|
|
"entropy": 1.391078781336546,
|
|
"epoch": 0.5730027548209367,
|
|
"grad_norm": 7.6875,
|
|
"learning_rate": 4.998072590601808e-06,
|
|
"loss": 1.776171326637268,
|
|
"mean_token_accuracy": 0.6446701735258102,
|
|
"num_tokens": 271325.0,
|
|
"step": 39
|
|
},
|
|
{
|
|
"entropy": 1.419153816998005,
|
|
"epoch": 0.5876951331496786,
|
|
"grad_norm": 9.375,
|
|
"learning_rate": 4.997376703397151e-06,
|
|
"loss": 1.8279757499694824,
|
|
"mean_token_accuracy": 0.622003948315978,
|
|
"num_tokens": 276627.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 1.32320836186409,
|
|
"epoch": 0.6023875114784206,
|
|
"grad_norm": 6.53125,
|
|
"learning_rate": 4.9965738368864345e-06,
|
|
"loss": 1.5522156953811646,
|
|
"mean_token_accuracy": 0.647944763302803,
|
|
"num_tokens": 283056.0,
|
|
"step": 41
|
|
},
|
|
{
|
|
"entropy": 1.4761433601379395,
|
|
"epoch": 0.6170798898071626,
|
|
"grad_norm": 6.09375,
|
|
"learning_rate": 4.99566402546179e-06,
|
|
"loss": 1.8013895750045776,
|
|
"mean_token_accuracy": 0.625993836671114,
|
|
"num_tokens": 290032.0,
|
|
"step": 42
|
|
},
|
|
{
|
|
"entropy": 1.309795543551445,
|
|
"epoch": 0.6317722681359045,
|
|
"grad_norm": 5.84375,
|
|
"learning_rate": 4.994647308096509e-06,
|
|
"loss": 1.5540151596069336,
|
|
"mean_token_accuracy": 0.6547523811459541,
|
|
"num_tokens": 297226.0,
|
|
"step": 43
|
|
},
|
|
{
|
|
"entropy": 1.356013897806406,
|
|
"epoch": 0.6464646464646465,
|
|
"grad_norm": 7.375,
|
|
"learning_rate": 4.99352372834338e-06,
|
|
"loss": 1.6234813928604126,
|
|
"mean_token_accuracy": 0.643750274553895,
|
|
"num_tokens": 302919.0,
|
|
"step": 44
|
|
},
|
|
{
|
|
"entropy": 1.3376823589205742,
|
|
"epoch": 0.6611570247933884,
|
|
"grad_norm": 5.71875,
|
|
"learning_rate": 4.992293334332821e-06,
|
|
"loss": 1.8485695123672485,
|
|
"mean_token_accuracy": 0.6466748863458633,
|
|
"num_tokens": 312466.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 1.4561820216476917,
|
|
"epoch": 0.6758494031221304,
|
|
"grad_norm": 5.71875,
|
|
"learning_rate": 4.990956178770814e-06,
|
|
"loss": 1.6671806573867798,
|
|
"mean_token_accuracy": 0.6320093534886837,
|
|
"num_tokens": 319919.0,
|
|
"step": 46
|
|
},
|
|
{
|
|
"entropy": 1.3384652398526669,
|
|
"epoch": 0.6905417814508723,
|
|
"grad_norm": 5.25,
|
|
"learning_rate": 4.989512318936654e-06,
|
|
"loss": 1.5594499111175537,
|
|
"mean_token_accuracy": 0.6558941937983036,
|
|
"num_tokens": 327790.0,
|
|
"step": 47
|
|
},
|
|
{
|
|
"entropy": 1.6121552139520645,
|
|
"epoch": 0.7052341597796143,
|
|
"grad_norm": 6.4375,
|
|
"learning_rate": 4.987961816680493e-06,
|
|
"loss": 1.823395848274231,
|
|
"mean_token_accuracy": 0.5975735988467932,
|
|
"num_tokens": 334440.0,
|
|
"step": 48
|
|
},
|
|
{
|
|
"entropy": 1.473178207874298,
|
|
"epoch": 0.7199265381083563,
|
|
"grad_norm": 6.4375,
|
|
"learning_rate": 4.986304738420684e-06,
|
|
"loss": 1.7415186166763306,
|
|
"mean_token_accuracy": 0.6342501733452082,
|
|
"num_tokens": 340524.0,
|
|
"step": 49
|
|
},
|
|
{
|
|
"entropy": 1.472764991223812,
|
|
"epoch": 0.7346189164370982,
|
|
"grad_norm": 5.59375,
|
|
"learning_rate": 4.984541155140945e-06,
|
|
"loss": 1.6061500310897827,
|
|
"mean_token_accuracy": 0.6294202730059624,
|
|
"num_tokens": 347745.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 1.453477706760168,
|
|
"epoch": 0.7493112947658402,
|
|
"grad_norm": 6.0,
|
|
"learning_rate": 4.982671142387316e-06,
|
|
"loss": 1.656001091003418,
|
|
"mean_token_accuracy": 0.6334934048354626,
|
|
"num_tokens": 354277.0,
|
|
"step": 51
|
|
},
|
|
{
|
|
"entropy": 1.5710610300302505,
|
|
"epoch": 0.7640036730945822,
|
|
"grad_norm": 7.8125,
|
|
"learning_rate": 4.980694780264918e-06,
|
|
"loss": 1.879157543182373,
|
|
"mean_token_accuracy": 0.6145644318312407,
|
|
"num_tokens": 361212.0,
|
|
"step": 52
|
|
},
|
|
{
|
|
"entropy": 1.5348852053284645,
|
|
"epoch": 0.7786960514233241,
|
|
"grad_norm": 5.46875,
|
|
"learning_rate": 4.978612153434527e-06,
|
|
"loss": 1.7147362232208252,
|
|
"mean_token_accuracy": 0.6237640716135502,
|
|
"num_tokens": 368602.0,
|
|
"step": 53
|
|
},
|
|
{
|
|
"entropy": 1.553167935460806,
|
|
"epoch": 0.7933884297520661,
|
|
"grad_norm": 5.40625,
|
|
"learning_rate": 4.976423351108943e-06,
|
|
"loss": 1.5548394918441772,
|
|
"mean_token_accuracy": 0.6278982330113649,
|
|
"num_tokens": 375209.0,
|
|
"step": 54
|
|
},
|
|
{
|
|
"entropy": 1.4203950092196465,
|
|
"epoch": 0.8080808080808081,
|
|
"grad_norm": 6.0,
|
|
"learning_rate": 4.974128467049177e-06,
|
|
"loss": 1.5801376104354858,
|
|
"mean_token_accuracy": 0.6398332640528679,
|
|
"num_tokens": 381525.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 1.5313997939229012,
|
|
"epoch": 0.82277318640955,
|
|
"grad_norm": 6.0,
|
|
"learning_rate": 4.971727599560418e-06,
|
|
"loss": 1.455580472946167,
|
|
"mean_token_accuracy": 0.6429965775460005,
|
|
"num_tokens": 386924.0,
|
|
"step": 56
|
|
},
|
|
{
|
|
"entropy": 1.2961117215454578,
|
|
"epoch": 0.837465564738292,
|
|
"grad_norm": 5.125,
|
|
"learning_rate": 4.9692208514878445e-06,
|
|
"loss": 1.2825771570205688,
|
|
"mean_token_accuracy": 0.6828609891235828,
|
|
"num_tokens": 393737.0,
|
|
"step": 57
|
|
},
|
|
{
|
|
"entropy": 1.4601662941277027,
|
|
"epoch": 0.852157943067034,
|
|
"grad_norm": 5.34375,
|
|
"learning_rate": 4.966608330212198e-06,
|
|
"loss": 1.4792706966400146,
|
|
"mean_token_accuracy": 0.6504263170063496,
|
|
"num_tokens": 400614.0,
|
|
"step": 58
|
|
},
|
|
{
|
|
"entropy": 1.5047795996069908,
|
|
"epoch": 0.8668503213957759,
|
|
"grad_norm": 5.3125,
|
|
"learning_rate": 4.963890147645195e-06,
|
|
"loss": 1.6770135164260864,
|
|
"mean_token_accuracy": 0.6391430255025625,
|
|
"num_tokens": 408341.0,
|
|
"step": 59
|
|
},
|
|
{
|
|
"entropy": 1.6163291186094284,
|
|
"epoch": 0.8815426997245179,
|
|
"grad_norm": 6.53125,
|
|
"learning_rate": 4.961066420224729e-06,
|
|
"loss": 1.6832696199417114,
|
|
"mean_token_accuracy": 0.6109438072890043,
|
|
"num_tokens": 413972.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 1.3349718637764454,
|
|
"epoch": 0.8962350780532599,
|
|
"grad_norm": 5.1875,
|
|
"learning_rate": 4.958137268909887e-06,
|
|
"loss": 1.410157322883606,
|
|
"mean_token_accuracy": 0.6598032023757696,
|
|
"num_tokens": 420932.0,
|
|
"step": 61
|
|
},
|
|
{
|
|
"entropy": 1.623923797160387,
|
|
"epoch": 0.9109274563820018,
|
|
"grad_norm": 6.6875,
|
|
"learning_rate": 4.95510281917576e-06,
|
|
"loss": 1.634577751159668,
|
|
"mean_token_accuracy": 0.6326909828931093,
|
|
"num_tokens": 425816.0,
|
|
"step": 62
|
|
},
|
|
{
|
|
"entropy": 1.7040501534938812,
|
|
"epoch": 0.9256198347107438,
|
|
"grad_norm": 5.5,
|
|
"learning_rate": 4.9519632010080765e-06,
|
|
"loss": 1.8456324338912964,
|
|
"mean_token_accuracy": 0.6114997789263725,
|
|
"num_tokens": 432371.0,
|
|
"step": 63
|
|
},
|
|
{
|
|
"entropy": 1.3237117007374763,
|
|
"epoch": 0.9403122130394858,
|
|
"grad_norm": 5.03125,
|
|
"learning_rate": 4.9487185488976284e-06,
|
|
"loss": 1.4221746921539307,
|
|
"mean_token_accuracy": 0.6745892986655235,
|
|
"num_tokens": 439631.0,
|
|
"step": 64
|
|
},
|
|
{
|
|
"entropy": 1.5274745747447014,
|
|
"epoch": 0.9550045913682277,
|
|
"grad_norm": 4.78125,
|
|
"learning_rate": 4.9453690018345144e-06,
|
|
"loss": 1.7518843412399292,
|
|
"mean_token_accuracy": 0.6415467616170645,
|
|
"num_tokens": 447294.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 1.6217340305447578,
|
|
"epoch": 0.9696969696969697,
|
|
"grad_norm": 5.90625,
|
|
"learning_rate": 4.941914703302181e-06,
|
|
"loss": 1.9085001945495605,
|
|
"mean_token_accuracy": 0.602933943271637,
|
|
"num_tokens": 453188.0,
|
|
"step": 66
|
|
},
|
|
{
|
|
"entropy": 1.620997928082943,
|
|
"epoch": 0.9843893480257117,
|
|
"grad_norm": 6.1875,
|
|
"learning_rate": 4.938355801271282e-06,
|
|
"loss": 1.7774909734725952,
|
|
"mean_token_accuracy": 0.6129048503935337,
|
|
"num_tokens": 458849.0,
|
|
"step": 67
|
|
},
|
|
{
|
|
"entropy": 1.656703669577837,
|
|
"epoch": 0.9990817263544536,
|
|
"grad_norm": 4.78125,
|
|
"learning_rate": 4.9346924481933345e-06,
|
|
"loss": 1.8143261671066284,
|
|
"mean_token_accuracy": 0.6097159385681152,
|
|
"num_tokens": 467166.0,
|
|
"step": 68
|
|
},
|
|
{
|
|
"entropy": 1.6068611145019531,
|
|
"epoch": 1.0,
|
|
"grad_norm": 12.75,
|
|
"learning_rate": 4.930924800994192e-06,
|
|
"loss": 1.6701020002365112,
|
|
"mean_token_accuracy": 0.673130214214325,
|
|
"num_tokens": 467528.0,
|
|
"step": 69
|
|
},
|
|
{
|
|
"entropy": 1.3061356097459793,
|
|
"epoch": 1.014692378328742,
|
|
"grad_norm": 4.6875,
|
|
"learning_rate": 4.927053021067321e-06,
|
|
"loss": 1.3863725662231445,
|
|
"mean_token_accuracy": 0.6834770254790783,
|
|
"num_tokens": 474488.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 1.3934976756572723,
|
|
"epoch": 1.0293847566574839,
|
|
"grad_norm": 5.1875,
|
|
"learning_rate": 4.923077274266886e-06,
|
|
"loss": 1.356377124786377,
|
|
"mean_token_accuracy": 0.6749065890908241,
|
|
"num_tokens": 480487.0,
|
|
"step": 71
|
|
},
|
|
{
|
|
"entropy": 1.2166254296898842,
|
|
"epoch": 1.044077134986226,
|
|
"grad_norm": 5.125,
|
|
"learning_rate": 4.91899773090065e-06,
|
|
"loss": 1.2729101181030273,
|
|
"mean_token_accuracy": 0.7036809120327234,
|
|
"num_tokens": 486565.0,
|
|
"step": 72
|
|
},
|
|
{
|
|
"entropy": 1.3788570798933506,
|
|
"epoch": 1.058769513314968,
|
|
"grad_norm": 4.46875,
|
|
"learning_rate": 4.914814565722671e-06,
|
|
"loss": 1.3389383554458618,
|
|
"mean_token_accuracy": 0.66689308360219,
|
|
"num_tokens": 494464.0,
|
|
"step": 73
|
|
},
|
|
{
|
|
"entropy": 1.4210151471197605,
|
|
"epoch": 1.0734618916437098,
|
|
"grad_norm": 5.21875,
|
|
"learning_rate": 4.9105279579258234e-06,
|
|
"loss": 1.5810019969940186,
|
|
"mean_token_accuracy": 0.6520100049674511,
|
|
"num_tokens": 502023.0,
|
|
"step": 74
|
|
},
|
|
{
|
|
"entropy": 1.3476647343486547,
|
|
"epoch": 1.0881542699724518,
|
|
"grad_norm": 5.125,
|
|
"learning_rate": 4.906138091134118e-06,
|
|
"loss": 1.4704662561416626,
|
|
"mean_token_accuracy": 0.6666250489652157,
|
|
"num_tokens": 508991.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 1.715055987238884,
|
|
"epoch": 1.1028466483011938,
|
|
"grad_norm": 5.53125,
|
|
"learning_rate": 4.901645153394838e-06,
|
|
"loss": 1.7850326299667358,
|
|
"mean_token_accuracy": 0.6200868934392929,
|
|
"num_tokens": 515213.0,
|
|
"step": 76
|
|
},
|
|
{
|
|
"entropy": 1.33231370896101,
|
|
"epoch": 1.1175390266299357,
|
|
"grad_norm": 5.03125,
|
|
"learning_rate": 4.897049337170483e-06,
|
|
"loss": 1.4512275457382202,
|
|
"mean_token_accuracy": 0.6738582514226437,
|
|
"num_tokens": 522712.0,
|
|
"step": 77
|
|
},
|
|
{
|
|
"entropy": 1.430236928164959,
|
|
"epoch": 1.1322314049586777,
|
|
"grad_norm": 5.0625,
|
|
"learning_rate": 4.8923508393305224e-06,
|
|
"loss": 1.4064946174621582,
|
|
"mean_token_accuracy": 0.668969176709652,
|
|
"num_tokens": 529191.0,
|
|
"step": 78
|
|
},
|
|
{
|
|
"entropy": 1.3905731923878193,
|
|
"epoch": 1.1469237832874197,
|
|
"grad_norm": 4.59375,
|
|
"learning_rate": 4.887549861142967e-06,
|
|
"loss": 1.507306694984436,
|
|
"mean_token_accuracy": 0.6702582351863384,
|
|
"num_tokens": 538221.0,
|
|
"step": 79
|
|
},
|
|
{
|
|
"entropy": 1.6136963441967964,
|
|
"epoch": 1.1616161616161615,
|
|
"grad_norm": 5.875,
|
|
"learning_rate": 4.882646608265743e-06,
|
|
"loss": 1.8179521560668945,
|
|
"mean_token_accuracy": 0.6243273708969355,
|
|
"num_tokens": 544126.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 1.3634940460324287,
|
|
"epoch": 1.1763085399449036,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 4.8776412907378845e-06,
|
|
"loss": 1.4865293502807617,
|
|
"mean_token_accuracy": 0.6671657040715218,
|
|
"num_tokens": 550605.0,
|
|
"step": 81
|
|
},
|
|
{
|
|
"entropy": 1.4086507372558117,
|
|
"epoch": 1.1910009182736456,
|
|
"grad_norm": 5.59375,
|
|
"learning_rate": 4.872534122970536e-06,
|
|
"loss": 1.4881434440612793,
|
|
"mean_token_accuracy": 0.6586938947439194,
|
|
"num_tokens": 557623.0,
|
|
"step": 82
|
|
},
|
|
{
|
|
"entropy": 1.5293696448206902,
|
|
"epoch": 1.2056932966023874,
|
|
"grad_norm": 4.875,
|
|
"learning_rate": 4.867325323737765e-06,
|
|
"loss": 1.5523103475570679,
|
|
"mean_token_accuracy": 0.6492738723754883,
|
|
"num_tokens": 564872.0,
|
|
"step": 83
|
|
},
|
|
{
|
|
"entropy": 1.5812500715255737,
|
|
"epoch": 1.2203856749311295,
|
|
"grad_norm": 5.78125,
|
|
"learning_rate": 4.862015116167195e-06,
|
|
"loss": 1.5752160549163818,
|
|
"mean_token_accuracy": 0.6451857574284077,
|
|
"num_tokens": 570930.0,
|
|
"step": 84
|
|
},
|
|
{
|
|
"entropy": 1.5821347422897816,
|
|
"epoch": 1.2350780532598715,
|
|
"grad_norm": 5.78125,
|
|
"learning_rate": 4.856603727730446e-06,
|
|
"loss": 1.615751028060913,
|
|
"mean_token_accuracy": 0.6259971559047699,
|
|
"num_tokens": 577356.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 1.2754014991223812,
|
|
"epoch": 1.2497704315886133,
|
|
"grad_norm": 5.59375,
|
|
"learning_rate": 4.8510913902333876e-06,
|
|
"loss": 1.2676167488098145,
|
|
"mean_token_accuracy": 0.6968142054975033,
|
|
"num_tokens": 584015.0,
|
|
"step": 86
|
|
},
|
|
{
|
|
"entropy": 1.4495088569819927,
|
|
"epoch": 1.2644628099173554,
|
|
"grad_norm": 6.78125,
|
|
"learning_rate": 4.845478339806211e-06,
|
|
"loss": 1.69454026222229,
|
|
"mean_token_accuracy": 0.6445063762366772,
|
|
"num_tokens": 590581.0,
|
|
"step": 87
|
|
},
|
|
{
|
|
"entropy": 1.3460834063589573,
|
|
"epoch": 1.2791551882460974,
|
|
"grad_norm": 5.4375,
|
|
"learning_rate": 4.839764816893315e-06,
|
|
"loss": 1.3651117086410522,
|
|
"mean_token_accuracy": 0.6775182671844959,
|
|
"num_tokens": 596816.0,
|
|
"step": 88
|
|
},
|
|
{
|
|
"entropy": 1.2347406335175037,
|
|
"epoch": 1.2938475665748392,
|
|
"grad_norm": 4.84375,
|
|
"learning_rate": 4.833951066243004e-06,
|
|
"loss": 1.2977527379989624,
|
|
"mean_token_accuracy": 0.6960192620754242,
|
|
"num_tokens": 604695.0,
|
|
"step": 89
|
|
},
|
|
{
|
|
"entropy": 1.3695692755281925,
|
|
"epoch": 1.3085399449035813,
|
|
"grad_norm": 5.75,
|
|
"learning_rate": 4.828037336897009e-06,
|
|
"loss": 1.640378475189209,
|
|
"mean_token_accuracy": 0.6644081249833107,
|
|
"num_tokens": 611293.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 1.4080765470862389,
|
|
"epoch": 1.3232323232323233,
|
|
"grad_norm": 5.53125,
|
|
"learning_rate": 4.822023882179811e-06,
|
|
"loss": 1.437829852104187,
|
|
"mean_token_accuracy": 0.6620082408189774,
|
|
"num_tokens": 618241.0,
|
|
"step": 91
|
|
},
|
|
{
|
|
"entropy": 1.4261119589209557,
|
|
"epoch": 1.3379247015610651,
|
|
"grad_norm": 6.75,
|
|
"learning_rate": 4.815910959687795e-06,
|
|
"loss": 1.401389479637146,
|
|
"mean_token_accuracy": 0.6730383820831776,
|
|
"num_tokens": 623399.0,
|
|
"step": 92
|
|
},
|
|
{
|
|
"entropy": 1.444845873862505,
|
|
"epoch": 1.3526170798898072,
|
|
"grad_norm": 5.5,
|
|
"learning_rate": 4.809698831278217e-06,
|
|
"loss": 1.4253414869308472,
|
|
"mean_token_accuracy": 0.6722631379961967,
|
|
"num_tokens": 630189.0,
|
|
"step": 93
|
|
},
|
|
{
|
|
"entropy": 1.5948089621961117,
|
|
"epoch": 1.367309458218549,
|
|
"grad_norm": 5.53125,
|
|
"learning_rate": 4.803387763057981e-06,
|
|
"loss": 1.7228381633758545,
|
|
"mean_token_accuracy": 0.624824620783329,
|
|
"num_tokens": 637677.0,
|
|
"step": 94
|
|
},
|
|
{
|
|
"entropy": 1.4382336772978306,
|
|
"epoch": 1.382001836547291,
|
|
"grad_norm": 5.65625,
|
|
"learning_rate": 4.796978025372247e-06,
|
|
"loss": 1.484128475189209,
|
|
"mean_token_accuracy": 0.6511365957558155,
|
|
"num_tokens": 644602.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 1.4599979110062122,
|
|
"epoch": 1.396694214876033,
|
|
"grad_norm": 5.65625,
|
|
"learning_rate": 4.79046989279284e-06,
|
|
"loss": 1.4486807584762573,
|
|
"mean_token_accuracy": 0.6637902185320854,
|
|
"num_tokens": 651044.0,
|
|
"step": 96
|
|
},
|
|
{
|
|
"entropy": 1.4183036126196384,
|
|
"epoch": 1.4113865932047749,
|
|
"grad_norm": 5.75,
|
|
"learning_rate": 4.783863644106502e-06,
|
|
"loss": 1.427507996559143,
|
|
"mean_token_accuracy": 0.6631320789456367,
|
|
"num_tokens": 657609.0,
|
|
"step": 97
|
|
},
|
|
{
|
|
"entropy": 1.36999873816967,
|
|
"epoch": 1.426078971533517,
|
|
"grad_norm": 5.21875,
|
|
"learning_rate": 4.77715956230294e-06,
|
|
"loss": 1.3143718242645264,
|
|
"mean_token_accuracy": 0.6901484504342079,
|
|
"num_tokens": 664907.0,
|
|
"step": 98
|
|
},
|
|
{
|
|
"entropy": 1.4660401456058025,
|
|
"epoch": 1.440771349862259,
|
|
"grad_norm": 5.0625,
|
|
"learning_rate": 4.770357934562704e-06,
|
|
"loss": 1.449824333190918,
|
|
"mean_token_accuracy": 0.6540784798562527,
|
|
"num_tokens": 672931.0,
|
|
"step": 99
|
|
},
|
|
{
|
|
"entropy": 1.4816335625946522,
|
|
"epoch": 1.4554637281910008,
|
|
"grad_norm": 5.6875,
|
|
"learning_rate": 4.7634590522448886e-06,
|
|
"loss": 1.5329768657684326,
|
|
"mean_token_accuracy": 0.6551914289593697,
|
|
"num_tokens": 679477.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 1.7057891301810741,
|
|
"epoch": 1.4701561065197428,
|
|
"grad_norm": 5.28125,
|
|
"learning_rate": 4.7564632108746524e-06,
|
|
"loss": 1.9816747903823853,
|
|
"mean_token_accuracy": 0.5978529918938875,
|
|
"num_tokens": 687941.0,
|
|
"step": 101
|
|
},
|
|
{
|
|
"entropy": 1.5502509884536266,
|
|
"epoch": 1.4848484848484849,
|
|
"grad_norm": 4.78125,
|
|
"learning_rate": 4.7493707101305545e-06,
|
|
"loss": 1.7086496353149414,
|
|
"mean_token_accuracy": 0.6390546467155218,
|
|
"num_tokens": 696935.0,
|
|
"step": 102
|
|
},
|
|
{
|
|
"entropy": 1.5186870731413364,
|
|
"epoch": 1.4995408631772267,
|
|
"grad_norm": 5.5,
|
|
"learning_rate": 4.742181853831721e-06,
|
|
"loss": 1.626766324043274,
|
|
"mean_token_accuracy": 0.6448409371078014,
|
|
"num_tokens": 703847.0,
|
|
"step": 103
|
|
},
|
|
{
|
|
"entropy": 1.652857031673193,
|
|
"epoch": 1.514233241505969,
|
|
"grad_norm": 5.9375,
|
|
"learning_rate": 4.734896949924831e-06,
|
|
"loss": 1.8684097528457642,
|
|
"mean_token_accuracy": 0.6228403430432081,
|
|
"num_tokens": 709915.0,
|
|
"step": 104
|
|
},
|
|
{
|
|
"entropy": 1.3462006263434887,
|
|
"epoch": 1.5289256198347108,
|
|
"grad_norm": 6.5,
|
|
"learning_rate": 4.72751631047092e-06,
|
|
"loss": 1.3426733016967773,
|
|
"mean_token_accuracy": 0.696257971227169,
|
|
"num_tokens": 714957.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 1.5189800672233105,
|
|
"epoch": 1.5436179981634526,
|
|
"grad_norm": 6.03125,
|
|
"learning_rate": 4.720040251632019e-06,
|
|
"loss": 1.5830191373825073,
|
|
"mean_token_accuracy": 0.6607260629534721,
|
|
"num_tokens": 720412.0,
|
|
"step": 106
|
|
},
|
|
{
|
|
"entropy": 1.3767579533159733,
|
|
"epoch": 1.5583103764921948,
|
|
"grad_norm": 4.65625,
|
|
"learning_rate": 4.712469093657605e-06,
|
|
"loss": 1.5013158321380615,
|
|
"mean_token_accuracy": 0.6755428463220596,
|
|
"num_tokens": 728047.0,
|
|
"step": 107
|
|
},
|
|
{
|
|
"entropy": 1.2227111794054508,
|
|
"epoch": 1.5730027548209367,
|
|
"grad_norm": 4.71875,
|
|
"learning_rate": 4.704803160870888e-06,
|
|
"loss": 1.4295673370361328,
|
|
"mean_token_accuracy": 0.6972721088677645,
|
|
"num_tokens": 735231.0,
|
|
"step": 108
|
|
},
|
|
{
|
|
"entropy": 1.4339848309755325,
|
|
"epoch": 1.5876951331496785,
|
|
"grad_norm": 4.84375,
|
|
"learning_rate": 4.697042781654913e-06,
|
|
"loss": 1.5147660970687866,
|
|
"mean_token_accuracy": 0.6610862240195274,
|
|
"num_tokens": 742143.0,
|
|
"step": 109
|
|
},
|
|
{
|
|
"entropy": 1.7120601199567318,
|
|
"epoch": 1.6023875114784207,
|
|
"grad_norm": 4.875,
|
|
"learning_rate": 4.6891882884384994e-06,
|
|
"loss": 1.8835252523422241,
|
|
"mean_token_accuracy": 0.6148366816341877,
|
|
"num_tokens": 748878.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 1.366846838966012,
|
|
"epoch": 1.6170798898071626,
|
|
"grad_norm": 4.75,
|
|
"learning_rate": 4.681240017681994e-06,
|
|
"loss": 1.314517617225647,
|
|
"mean_token_accuracy": 0.6879813298583031,
|
|
"num_tokens": 755068.0,
|
|
"step": 111
|
|
},
|
|
{
|
|
"entropy": 1.5042930357158184,
|
|
"epoch": 1.6317722681359044,
|
|
"grad_norm": 4.375,
|
|
"learning_rate": 4.67319830986286e-06,
|
|
"loss": 1.4018523693084717,
|
|
"mean_token_accuracy": 0.6578907147049904,
|
|
"num_tokens": 762233.0,
|
|
"step": 112
|
|
},
|
|
{
|
|
"entropy": 1.2661687284708023,
|
|
"epoch": 1.6464646464646466,
|
|
"grad_norm": 4.65625,
|
|
"learning_rate": 4.665063509461098e-06,
|
|
"loss": 1.2542436122894287,
|
|
"mean_token_accuracy": 0.698942206799984,
|
|
"num_tokens": 768426.0,
|
|
"step": 113
|
|
},
|
|
{
|
|
"entropy": 1.402594517916441,
|
|
"epoch": 1.6611570247933884,
|
|
"grad_norm": 4.71875,
|
|
"learning_rate": 4.65683596494448e-06,
|
|
"loss": 1.4597134590148926,
|
|
"mean_token_accuracy": 0.6789544485509396,
|
|
"num_tokens": 775863.0,
|
|
"step": 114
|
|
},
|
|
{
|
|
"entropy": 1.3753325566649437,
|
|
"epoch": 1.6758494031221303,
|
|
"grad_norm": 4.4375,
|
|
"learning_rate": 4.648516028753632e-06,
|
|
"loss": 1.4206632375717163,
|
|
"mean_token_accuracy": 0.6716446243226528,
|
|
"num_tokens": 782179.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 1.3194672428071499,
|
|
"epoch": 1.6905417814508723,
|
|
"grad_norm": 3.765625,
|
|
"learning_rate": 4.6401040572869295e-06,
|
|
"loss": 1.3911067247390747,
|
|
"mean_token_accuracy": 0.6902661826461554,
|
|
"num_tokens": 789999.0,
|
|
"step": 116
|
|
},
|
|
{
|
|
"entropy": 1.1185118220746517,
|
|
"epoch": 1.7052341597796143,
|
|
"grad_norm": 3.375,
|
|
"learning_rate": 4.631600410885231e-06,
|
|
"loss": 1.1200969219207764,
|
|
"mean_token_accuracy": 0.7202032878994942,
|
|
"num_tokens": 797816.0,
|
|
"step": 117
|
|
},
|
|
{
|
|
"entropy": 1.6263831928372383,
|
|
"epoch": 1.7199265381083562,
|
|
"grad_norm": 4.6875,
|
|
"learning_rate": 4.623005453816447e-06,
|
|
"loss": 1.7719974517822266,
|
|
"mean_token_accuracy": 0.6271817404776812,
|
|
"num_tokens": 803655.0,
|
|
"step": 118
|
|
},
|
|
{
|
|
"entropy": 1.5183000564575195,
|
|
"epoch": 1.7346189164370982,
|
|
"grad_norm": 4.875,
|
|
"learning_rate": 4.614319554259934e-06,
|
|
"loss": 1.5544443130493164,
|
|
"mean_token_accuracy": 0.657878614962101,
|
|
"num_tokens": 809724.0,
|
|
"step": 119
|
|
},
|
|
{
|
|
"entropy": 1.6561345160007477,
|
|
"epoch": 1.7493112947658402,
|
|
"grad_norm": 4.09375,
|
|
"learning_rate": 4.605543084290716e-06,
|
|
"loss": 1.8985499143600464,
|
|
"mean_token_accuracy": 0.6286845244467258,
|
|
"num_tokens": 815942.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 1.3539353236556053,
|
|
"epoch": 1.764003673094582,
|
|
"grad_norm": 3.703125,
|
|
"learning_rate": 4.596676419863561e-06,
|
|
"loss": 1.453987956047058,
|
|
"mean_token_accuracy": 0.6819523572921753,
|
|
"num_tokens": 823392.0,
|
|
"step": 121
|
|
},
|
|
{
|
|
"entropy": 1.341381138190627,
|
|
"epoch": 1.778696051423324,
|
|
"grad_norm": 3.609375,
|
|
"learning_rate": 4.587719940796858e-06,
|
|
"loss": 1.3191108703613281,
|
|
"mean_token_accuracy": 0.6868677549064159,
|
|
"num_tokens": 830069.0,
|
|
"step": 122
|
|
},
|
|
{
|
|
"entropy": 1.5383460223674774,
|
|
"epoch": 1.7933884297520661,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 4.578674030756364e-06,
|
|
"loss": 1.5670809745788574,
|
|
"mean_token_accuracy": 0.6535362396389246,
|
|
"num_tokens": 836019.0,
|
|
"step": 123
|
|
},
|
|
{
|
|
"entropy": 1.517351869493723,
|
|
"epoch": 1.808080808080808,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 4.569539077238756e-06,
|
|
"loss": 1.5154145956039429,
|
|
"mean_token_accuracy": 0.6488186921924353,
|
|
"num_tokens": 842399.0,
|
|
"step": 124
|
|
},
|
|
{
|
|
"entropy": 1.4582207016646862,
|
|
"epoch": 1.82277318640955,
|
|
"grad_norm": 4.28125,
|
|
"learning_rate": 4.560315471555039e-06,
|
|
"loss": 1.5142159461975098,
|
|
"mean_token_accuracy": 0.6643350049853325,
|
|
"num_tokens": 848509.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 1.8213122673332691,
|
|
"epoch": 1.837465564738292,
|
|
"grad_norm": 4.375,
|
|
"learning_rate": 4.551003608813784e-06,
|
|
"loss": 1.9438310861587524,
|
|
"mean_token_accuracy": 0.6077302508056164,
|
|
"num_tokens": 854624.0,
|
|
"step": 126
|
|
},
|
|
{
|
|
"entropy": 1.2630420215427876,
|
|
"epoch": 1.8521579430670339,
|
|
"grad_norm": 3.484375,
|
|
"learning_rate": 4.541603887904198e-06,
|
|
"loss": 1.3397103548049927,
|
|
"mean_token_accuracy": 0.6940340362489223,
|
|
"num_tokens": 862970.0,
|
|
"step": 127
|
|
},
|
|
{
|
|
"entropy": 1.521993912756443,
|
|
"epoch": 1.866850321395776,
|
|
"grad_norm": 3.734375,
|
|
"learning_rate": 4.532116711479039e-06,
|
|
"loss": 1.649178147315979,
|
|
"mean_token_accuracy": 0.6439413316547871,
|
|
"num_tokens": 870524.0,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 1.866850321395776,
|
|
"eval_entropy": 1.3699656426906586,
|
|
"eval_loss": 1.4730713367462158,
|
|
"eval_mean_token_accuracy": 0.6691670566797256,
|
|
"eval_num_tokens": 870524.0,
|
|
"eval_runtime": 1.6753,
|
|
"eval_samples_per_second": 34.621,
|
|
"eval_steps_per_second": 4.775,
|
|
"step": 128
|
|
},
|
|
{
|
|
"entropy": 1.1558082550764084,
|
|
"epoch": 1.881542699724518,
|
|
"grad_norm": 3.921875,
|
|
"learning_rate": 4.522542485937369e-06,
|
|
"loss": 1.233929991722107,
|
|
"mean_token_accuracy": 0.7151609733700752,
|
|
"num_tokens": 878419.0,
|
|
"step": 129
|
|
},
|
|
{
|
|
"entropy": 1.4429849721491337,
|
|
"epoch": 1.8962350780532597,
|
|
"grad_norm": 3.625,
|
|
"learning_rate": 4.512881621407146e-06,
|
|
"loss": 1.3959054946899414,
|
|
"mean_token_accuracy": 0.6707526985555887,
|
|
"num_tokens": 885787.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 1.315425992012024,
|
|
"epoch": 1.9109274563820018,
|
|
"grad_norm": 3.546875,
|
|
"learning_rate": 4.503134531727652e-06,
|
|
"loss": 1.2897546291351318,
|
|
"mean_token_accuracy": 0.6948176473379135,
|
|
"num_tokens": 891849.0,
|
|
"step": 131
|
|
},
|
|
{
|
|
"entropy": 1.3664759285748005,
|
|
"epoch": 1.9256198347107438,
|
|
"grad_norm": 4.03125,
|
|
"learning_rate": 4.493301634431768e-06,
|
|
"loss": 1.2926387786865234,
|
|
"mean_token_accuracy": 0.6821446903049946,
|
|
"num_tokens": 898855.0,
|
|
"step": 132
|
|
},
|
|
{
|
|
"entropy": 1.4670868627727032,
|
|
"epoch": 1.9403122130394856,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 4.4833833507280884e-06,
|
|
"loss": 1.7085148096084595,
|
|
"mean_token_accuracy": 0.6683255881071091,
|
|
"num_tokens": 906434.0,
|
|
"step": 133
|
|
},
|
|
{
|
|
"entropy": 1.323768761008978,
|
|
"epoch": 1.9550045913682277,
|
|
"grad_norm": 4.0,
|
|
"learning_rate": 4.473380105482875e-06,
|
|
"loss": 1.2290809154510498,
|
|
"mean_token_accuracy": 0.695992011576891,
|
|
"num_tokens": 911789.0,
|
|
"step": 134
|
|
},
|
|
{
|
|
"entropy": 1.228843528777361,
|
|
"epoch": 1.9696969696969697,
|
|
"grad_norm": 3.6875,
|
|
"learning_rate": 4.463292327201862e-06,
|
|
"loss": 1.2656970024108887,
|
|
"mean_token_accuracy": 0.7151234671473503,
|
|
"num_tokens": 918845.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 1.5310376584529877,
|
|
"epoch": 1.9843893480257115,
|
|
"grad_norm": 3.703125,
|
|
"learning_rate": 4.453120448011897e-06,
|
|
"loss": 1.666202425956726,
|
|
"mean_token_accuracy": 0.6378280259668827,
|
|
"num_tokens": 926751.0,
|
|
"step": 136
|
|
},
|
|
{
|
|
"entropy": 1.4586737379431725,
|
|
"epoch": 1.9990817263544536,
|
|
"grad_norm": 3.4375,
|
|
"learning_rate": 4.442864903642428e-06,
|
|
"loss": 1.6007022857666016,
|
|
"mean_token_accuracy": 0.6548678297549486,
|
|
"num_tokens": 934942.0,
|
|
"step": 137
|
|
},
|
|
{
|
|
"entropy": 1.197638750076294,
|
|
"epoch": 2.0,
|
|
"grad_norm": 24.625,
|
|
"learning_rate": 4.432526133406843e-06,
|
|
"loss": 1.1470692157745361,
|
|
"mean_token_accuracy": 0.7256637215614319,
|
|
"num_tokens": 935056.0,
|
|
"step": 138
|
|
},
|
|
{
|
|
"entropy": 1.3461281508207321,
|
|
"epoch": 2.014692378328742,
|
|
"grad_norm": 3.875,
|
|
"learning_rate": 4.422104580183649e-06,
|
|
"loss": 1.4025999307632446,
|
|
"mean_token_accuracy": 0.6926173456013203,
|
|
"num_tokens": 941229.0,
|
|
"step": 139
|
|
},
|
|
{
|
|
"entropy": 1.6541437543928623,
|
|
"epoch": 2.029384756657484,
|
|
"grad_norm": 4.09375,
|
|
"learning_rate": 4.4116006903975015e-06,
|
|
"loss": 1.6707371473312378,
|
|
"mean_token_accuracy": 0.6277699284255505,
|
|
"num_tokens": 948566.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 1.2840190175920725,
|
|
"epoch": 2.044077134986226,
|
|
"grad_norm": 3.46875,
|
|
"learning_rate": 4.401014914000078e-06,
|
|
"loss": 1.2021766901016235,
|
|
"mean_token_accuracy": 0.6986711807549,
|
|
"num_tokens": 954773.0,
|
|
"step": 141
|
|
},
|
|
{
|
|
"entropy": 1.4697693847119808,
|
|
"epoch": 2.0587695133149677,
|
|
"grad_norm": 3.8125,
|
|
"learning_rate": 4.3903477044508066e-06,
|
|
"loss": 1.5293481349945068,
|
|
"mean_token_accuracy": 0.6505694799125195,
|
|
"num_tokens": 961834.0,
|
|
"step": 142
|
|
},
|
|
{
|
|
"entropy": 1.5127912238240242,
|
|
"epoch": 2.07346189164371,
|
|
"grad_norm": 3.8125,
|
|
"learning_rate": 4.379599518697444e-06,
|
|
"loss": 1.735708236694336,
|
|
"mean_token_accuracy": 0.6535333096981049,
|
|
"num_tokens": 968542.0,
|
|
"step": 143
|
|
},
|
|
{
|
|
"entropy": 1.3300835229456425,
|
|
"epoch": 2.088154269972452,
|
|
"grad_norm": 3.40625,
|
|
"learning_rate": 4.368770817156493e-06,
|
|
"loss": 1.3301336765289307,
|
|
"mean_token_accuracy": 0.6858880035579205,
|
|
"num_tokens": 976389.0,
|
|
"step": 144
|
|
},
|
|
{
|
|
"entropy": 1.2104117833077908,
|
|
"epoch": 2.1028466483011936,
|
|
"grad_norm": 3.296875,
|
|
"learning_rate": 4.357862063693486e-06,
|
|
"loss": 1.1409953832626343,
|
|
"mean_token_accuracy": 0.7150565646588802,
|
|
"num_tokens": 983299.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 1.3603767342865467,
|
|
"epoch": 2.117539026629936,
|
|
"grad_norm": 3.5,
|
|
"learning_rate": 4.3468737256031155e-06,
|
|
"loss": 1.3795615434646606,
|
|
"mean_token_accuracy": 0.6860510632395744,
|
|
"num_tokens": 989539.0,
|
|
"step": 146
|
|
},
|
|
{
|
|
"entropy": 1.4871582835912704,
|
|
"epoch": 2.1322314049586777,
|
|
"grad_norm": 3.359375,
|
|
"learning_rate": 4.335806273589214e-06,
|
|
"loss": 1.573998212814331,
|
|
"mean_token_accuracy": 0.651427399367094,
|
|
"num_tokens": 997548.0,
|
|
"step": 147
|
|
},
|
|
{
|
|
"entropy": 1.4681177996098995,
|
|
"epoch": 2.1469237832874195,
|
|
"grad_norm": 4.125,
|
|
"learning_rate": 4.324660181744589e-06,
|
|
"loss": 1.5126701593399048,
|
|
"mean_token_accuracy": 0.6612453535199165,
|
|
"num_tokens": 1002910.0,
|
|
"step": 148
|
|
},
|
|
{
|
|
"entropy": 1.3668258637189865,
|
|
"epoch": 2.1616161616161618,
|
|
"grad_norm": 3.9375,
|
|
"learning_rate": 4.313435927530719e-06,
|
|
"loss": 1.3531981706619263,
|
|
"mean_token_accuracy": 0.6994708813726902,
|
|
"num_tokens": 1008967.0,
|
|
"step": 149
|
|
},
|
|
{
|
|
"entropy": 1.478852679952979,
|
|
"epoch": 2.1763085399449036,
|
|
"grad_norm": 3.59375,
|
|
"learning_rate": 4.3021339917572975e-06,
|
|
"loss": 1.4089787006378174,
|
|
"mean_token_accuracy": 0.6642785873264074,
|
|
"num_tokens": 1015107.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 1.364537600427866,
|
|
"epoch": 2.1910009182736454,
|
|
"grad_norm": 3.734375,
|
|
"learning_rate": 4.290754858561636e-06,
|
|
"loss": 1.4817034006118774,
|
|
"mean_token_accuracy": 0.6840209178626537,
|
|
"num_tokens": 1022757.0,
|
|
"step": 151
|
|
},
|
|
{
|
|
"entropy": 1.5241572260856628,
|
|
"epoch": 2.2056932966023877,
|
|
"grad_norm": 3.9375,
|
|
"learning_rate": 4.2792990153879286e-06,
|
|
"loss": 1.6508989334106445,
|
|
"mean_token_accuracy": 0.6462781187146902,
|
|
"num_tokens": 1030034.0,
|
|
"step": 152
|
|
},
|
|
{
|
|
"entropy": 1.4099011793732643,
|
|
"epoch": 2.2203856749311295,
|
|
"grad_norm": 3.484375,
|
|
"learning_rate": 4.267766952966369e-06,
|
|
"loss": 1.361659288406372,
|
|
"mean_token_accuracy": 0.6699612885713577,
|
|
"num_tokens": 1036957.0,
|
|
"step": 153
|
|
},
|
|
{
|
|
"entropy": 1.5446589030325413,
|
|
"epoch": 2.2350780532598713,
|
|
"grad_norm": 3.671875,
|
|
"learning_rate": 4.25615916529213e-06,
|
|
"loss": 1.724553108215332,
|
|
"mean_token_accuracy": 0.6526541039347649,
|
|
"num_tokens": 1043955.0,
|
|
"step": 154
|
|
},
|
|
{
|
|
"entropy": 1.2551699057221413,
|
|
"epoch": 2.2497704315886136,
|
|
"grad_norm": 3.71875,
|
|
"learning_rate": 4.244476149604201e-06,
|
|
"loss": 1.2087091207504272,
|
|
"mean_token_accuracy": 0.6985193602740765,
|
|
"num_tokens": 1050087.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 1.414844986051321,
|
|
"epoch": 2.2644628099173554,
|
|
"grad_norm": 3.890625,
|
|
"learning_rate": 4.2327184063640905e-06,
|
|
"loss": 1.4239919185638428,
|
|
"mean_token_accuracy": 0.6687978152185678,
|
|
"num_tokens": 1056578.0,
|
|
"step": 156
|
|
},
|
|
{
|
|
"entropy": 1.3670090530067682,
|
|
"epoch": 2.279155188246097,
|
|
"grad_norm": 3.34375,
|
|
"learning_rate": 4.220886439234385e-06,
|
|
"loss": 1.4656591415405273,
|
|
"mean_token_accuracy": 0.6766840294003487,
|
|
"num_tokens": 1063625.0,
|
|
"step": 157
|
|
},
|
|
{
|
|
"entropy": 1.3662168271839619,
|
|
"epoch": 2.2938475665748395,
|
|
"grad_norm": 3.625,
|
|
"learning_rate": 4.2089807550571786e-06,
|
|
"loss": 1.3282774686813354,
|
|
"mean_token_accuracy": 0.6898504607379436,
|
|
"num_tokens": 1070304.0,
|
|
"step": 158
|
|
},
|
|
{
|
|
"entropy": 1.39240912348032,
|
|
"epoch": 2.3085399449035813,
|
|
"grad_norm": 3.6875,
|
|
"learning_rate": 4.197001863832355e-06,
|
|
"loss": 1.341300368309021,
|
|
"mean_token_accuracy": 0.6747097820043564,
|
|
"num_tokens": 1076203.0,
|
|
"step": 159
|
|
},
|
|
{
|
|
"entropy": 1.2145243491977453,
|
|
"epoch": 2.323232323232323,
|
|
"grad_norm": 3.03125,
|
|
"learning_rate": 4.184950278695745e-06,
|
|
"loss": 1.2274876832962036,
|
|
"mean_token_accuracy": 0.7132319957017899,
|
|
"num_tokens": 1084492.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 1.4162307903170586,
|
|
"epoch": 2.3379247015610654,
|
|
"grad_norm": 3.90625,
|
|
"learning_rate": 4.172826515897146e-06,
|
|
"loss": 1.4108020067214966,
|
|
"mean_token_accuracy": 0.6850063428282738,
|
|
"num_tokens": 1090620.0,
|
|
"step": 161
|
|
},
|
|
{
|
|
"entropy": 1.5210106298327446,
|
|
"epoch": 2.352617079889807,
|
|
"grad_norm": 3.84375,
|
|
"learning_rate": 4.160631094778205e-06,
|
|
"loss": 1.6292668581008911,
|
|
"mean_token_accuracy": 0.6409319471567869,
|
|
"num_tokens": 1096968.0,
|
|
"step": 162
|
|
},
|
|
{
|
|
"entropy": 1.3625940009951591,
|
|
"epoch": 2.367309458218549,
|
|
"grad_norm": 3.484375,
|
|
"learning_rate": 4.1483645377501726e-06,
|
|
"loss": 1.4572113752365112,
|
|
"mean_token_accuracy": 0.6829789131879807,
|
|
"num_tokens": 1104565.0,
|
|
"step": 163
|
|
},
|
|
{
|
|
"entropy": 1.3159149996936321,
|
|
"epoch": 2.3820018365472913,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 4.136027370271526e-06,
|
|
"loss": 1.4974924325942993,
|
|
"mean_token_accuracy": 0.688617680221796,
|
|
"num_tokens": 1111651.0,
|
|
"step": 164
|
|
},
|
|
{
|
|
"entropy": 1.2680894508957863,
|
|
"epoch": 2.396694214876033,
|
|
"grad_norm": 3.265625,
|
|
"learning_rate": 4.123620120825459e-06,
|
|
"loss": 1.19207763671875,
|
|
"mean_token_accuracy": 0.7037702575325966,
|
|
"num_tokens": 1118713.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 1.471781674772501,
|
|
"epoch": 2.411386593204775,
|
|
"grad_norm": 4.84375,
|
|
"learning_rate": 4.111143320897244e-06,
|
|
"loss": 1.400710105895996,
|
|
"mean_token_accuracy": 0.6655568517744541,
|
|
"num_tokens": 1123776.0,
|
|
"step": 166
|
|
},
|
|
{
|
|
"entropy": 1.3692159168422222,
|
|
"epoch": 2.426078971533517,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 4.098597504951462e-06,
|
|
"loss": 1.358292579650879,
|
|
"mean_token_accuracy": 0.6901015266776085,
|
|
"num_tokens": 1130383.0,
|
|
"step": 167
|
|
},
|
|
{
|
|
"entropy": 1.4371467269957066,
|
|
"epoch": 2.440771349862259,
|
|
"grad_norm": 4.40625,
|
|
"learning_rate": 4.085983210409114e-06,
|
|
"loss": 1.5146667957305908,
|
|
"mean_token_accuracy": 0.6645361706614494,
|
|
"num_tokens": 1137520.0,
|
|
"step": 168
|
|
},
|
|
{
|
|
"entropy": 1.2199561521410942,
|
|
"epoch": 2.455463728191001,
|
|
"grad_norm": 3.515625,
|
|
"learning_rate": 4.073300977624594e-06,
|
|
"loss": 1.229698657989502,
|
|
"mean_token_accuracy": 0.701645290479064,
|
|
"num_tokens": 1144374.0,
|
|
"step": 169
|
|
},
|
|
{
|
|
"entropy": 1.3611398451030254,
|
|
"epoch": 2.470156106519743,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 4.060551349862545e-06,
|
|
"loss": 1.5089250802993774,
|
|
"mean_token_accuracy": 0.6758165024220943,
|
|
"num_tokens": 1152178.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 1.3143669590353966,
|
|
"epoch": 2.484848484848485,
|
|
"grad_norm": 3.875,
|
|
"learning_rate": 4.047734873274586e-06,
|
|
"loss": 1.3569583892822266,
|
|
"mean_token_accuracy": 0.6815139781683683,
|
|
"num_tokens": 1158215.0,
|
|
"step": 171
|
|
},
|
|
{
|
|
"entropy": 1.4629556462168694,
|
|
"epoch": 2.4995408631772267,
|
|
"grad_norm": 3.03125,
|
|
"learning_rate": 4.034852096875917e-06,
|
|
"loss": 1.4647691249847412,
|
|
"mean_token_accuracy": 0.6731207054108381,
|
|
"num_tokens": 1165732.0,
|
|
"step": 172
|
|
},
|
|
{
|
|
"entropy": 1.2865781262516975,
|
|
"epoch": 2.514233241505969,
|
|
"grad_norm": 3.140625,
|
|
"learning_rate": 4.021903572521802e-06,
|
|
"loss": 1.2602851390838623,
|
|
"mean_token_accuracy": 0.6870401427149773,
|
|
"num_tokens": 1173656.0,
|
|
"step": 173
|
|
},
|
|
{
|
|
"entropy": 1.2481586299836636,
|
|
"epoch": 2.5289256198347108,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 4.0088898548839285e-06,
|
|
"loss": 1.4004302024841309,
|
|
"mean_token_accuracy": 0.7000849805772305,
|
|
"num_tokens": 1180446.0,
|
|
"step": 174
|
|
},
|
|
{
|
|
"entropy": 1.4398693591356277,
|
|
"epoch": 2.5436179981634526,
|
|
"grad_norm": 3.953125,
|
|
"learning_rate": 3.995811501426648e-06,
|
|
"loss": 1.4066277742385864,
|
|
"mean_token_accuracy": 0.6720283292233944,
|
|
"num_tokens": 1186431.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 1.5063546746969223,
|
|
"epoch": 2.558310376492195,
|
|
"grad_norm": 4.15625,
|
|
"learning_rate": 3.982669072383093e-06,
|
|
"loss": 1.59934663772583,
|
|
"mean_token_accuracy": 0.6534310914576054,
|
|
"num_tokens": 1193242.0,
|
|
"step": 176
|
|
},
|
|
{
|
|
"entropy": 1.3044538162648678,
|
|
"epoch": 2.5730027548209367,
|
|
"grad_norm": 3.4375,
|
|
"learning_rate": 3.969463130731183e-06,
|
|
"loss": 1.319208025932312,
|
|
"mean_token_accuracy": 0.6941639743745327,
|
|
"num_tokens": 1200529.0,
|
|
"step": 177
|
|
},
|
|
{
|
|
"entropy": 1.3389928713440895,
|
|
"epoch": 2.5876951331496785,
|
|
"grad_norm": 3.53125,
|
|
"learning_rate": 3.956194242169506e-06,
|
|
"loss": 1.21915864944458,
|
|
"mean_token_accuracy": 0.696018248796463,
|
|
"num_tokens": 1206836.0,
|
|
"step": 178
|
|
},
|
|
{
|
|
"entropy": 1.424593310803175,
|
|
"epoch": 2.6023875114784207,
|
|
"grad_norm": 3.796875,
|
|
"learning_rate": 3.942862975093085e-06,
|
|
"loss": 1.4455430507659912,
|
|
"mean_token_accuracy": 0.6746221072971821,
|
|
"num_tokens": 1212825.0,
|
|
"step": 179
|
|
},
|
|
{
|
|
"entropy": 1.3970806077122688,
|
|
"epoch": 2.6170798898071626,
|
|
"grad_norm": 3.8125,
|
|
"learning_rate": 3.929469900569031e-06,
|
|
"loss": 1.4156700372695923,
|
|
"mean_token_accuracy": 0.6885317526757717,
|
|
"num_tokens": 1218891.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 1.4365531019866467,
|
|
"epoch": 2.6317722681359044,
|
|
"grad_norm": 3.65625,
|
|
"learning_rate": 3.916015592312083e-06,
|
|
"loss": 1.4302277565002441,
|
|
"mean_token_accuracy": 0.677625959739089,
|
|
"num_tokens": 1225720.0,
|
|
"step": 181
|
|
},
|
|
{
|
|
"entropy": 1.4176970534026623,
|
|
"epoch": 2.6464646464646466,
|
|
"grad_norm": 3.75,
|
|
"learning_rate": 3.902500626660025e-06,
|
|
"loss": 1.4448661804199219,
|
|
"mean_token_accuracy": 0.6666108258068562,
|
|
"num_tokens": 1232834.0,
|
|
"step": 182
|
|
},
|
|
{
|
|
"entropy": 1.481460090726614,
|
|
"epoch": 2.6611570247933884,
|
|
"grad_norm": 3.390625,
|
|
"learning_rate": 3.888925582549006e-06,
|
|
"loss": 1.395293951034546,
|
|
"mean_token_accuracy": 0.6644096374511719,
|
|
"num_tokens": 1240900.0,
|
|
"step": 183
|
|
},
|
|
{
|
|
"entropy": 1.2771461643278599,
|
|
"epoch": 2.6758494031221303,
|
|
"grad_norm": 3.765625,
|
|
"learning_rate": 3.875291041488734e-06,
|
|
"loss": 1.2739803791046143,
|
|
"mean_token_accuracy": 0.6979184821248055,
|
|
"num_tokens": 1247930.0,
|
|
"step": 184
|
|
},
|
|
{
|
|
"entropy": 1.484820794314146,
|
|
"epoch": 2.6905417814508725,
|
|
"grad_norm": 3.921875,
|
|
"learning_rate": 3.861597587537568e-06,
|
|
"loss": 1.5094577074050903,
|
|
"mean_token_accuracy": 0.6664229705929756,
|
|
"num_tokens": 1254535.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 1.3980318270623684,
|
|
"epoch": 2.7052341597796143,
|
|
"grad_norm": 3.21875,
|
|
"learning_rate": 3.847845807277501e-06,
|
|
"loss": 1.3485972881317139,
|
|
"mean_token_accuracy": 0.6803864054381847,
|
|
"num_tokens": 1261710.0,
|
|
"step": 186
|
|
},
|
|
{
|
|
"entropy": 1.429235778748989,
|
|
"epoch": 2.719926538108356,
|
|
"grad_norm": 3.625,
|
|
"learning_rate": 3.83403628978903e-06,
|
|
"loss": 1.463432788848877,
|
|
"mean_token_accuracy": 0.6720171179622412,
|
|
"num_tokens": 1268676.0,
|
|
"step": 187
|
|
},
|
|
{
|
|
"entropy": 1.2592237815260887,
|
|
"epoch": 2.734618916437098,
|
|
"grad_norm": 3.1875,
|
|
"learning_rate": 3.82016962662592e-06,
|
|
"loss": 1.3745115995407104,
|
|
"mean_token_accuracy": 0.7008189521729946,
|
|
"num_tokens": 1276737.0,
|
|
"step": 188
|
|
},
|
|
{
|
|
"entropy": 1.5628399066627026,
|
|
"epoch": 2.7493112947658402,
|
|
"grad_norm": 3.71875,
|
|
"learning_rate": 3.806246411789872e-06,
|
|
"loss": 1.67648446559906,
|
|
"mean_token_accuracy": 0.6497368421405554,
|
|
"num_tokens": 1283918.0,
|
|
"step": 189
|
|
},
|
|
{
|
|
"entropy": 1.4043805077672005,
|
|
"epoch": 2.764003673094582,
|
|
"grad_norm": 3.453125,
|
|
"learning_rate": 3.7922672417050687e-06,
|
|
"loss": 1.4935755729675293,
|
|
"mean_token_accuracy": 0.6647081095725298,
|
|
"num_tokens": 1291464.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 1.4052232280373573,
|
|
"epoch": 2.7786960514233243,
|
|
"grad_norm": 3.671875,
|
|
"learning_rate": 3.77823271519263e-06,
|
|
"loss": 1.4137238264083862,
|
|
"mean_token_accuracy": 0.6810254417359829,
|
|
"num_tokens": 1298358.0,
|
|
"step": 191
|
|
},
|
|
{
|
|
"entropy": 1.2509154602885246,
|
|
"epoch": 2.793388429752066,
|
|
"grad_norm": 3.125,
|
|
"learning_rate": 3.764143433444962e-06,
|
|
"loss": 1.3093594312667847,
|
|
"mean_token_accuracy": 0.7068472765386105,
|
|
"num_tokens": 1306536.0,
|
|
"step": 192
|
|
},
|
|
{
|
|
"entropy": 1.424871776252985,
|
|
"epoch": 2.808080808080808,
|
|
"grad_norm": 4.21875,
|
|
"learning_rate": 3.7500000000000005e-06,
|
|
"loss": 1.388112187385559,
|
|
"mean_token_accuracy": 0.675894346088171,
|
|
"num_tokens": 1312053.0,
|
|
"step": 193
|
|
},
|
|
{
|
|
"entropy": 1.4783972389996052,
|
|
"epoch": 2.8227731864095498,
|
|
"grad_norm": 3.953125,
|
|
"learning_rate": 3.735803020715362e-06,
|
|
"loss": 1.4788545370101929,
|
|
"mean_token_accuracy": 0.6643631141632795,
|
|
"num_tokens": 1317701.0,
|
|
"step": 194
|
|
},
|
|
{
|
|
"entropy": 1.5499090813100338,
|
|
"epoch": 2.837465564738292,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 3.721553103742388e-06,
|
|
"loss": 1.5269482135772705,
|
|
"mean_token_accuracy": 0.6374738682061434,
|
|
"num_tokens": 1324429.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 1.4945064820349216,
|
|
"epoch": 2.852157943067034,
|
|
"grad_norm": 4.0,
|
|
"learning_rate": 3.7072508595000935e-06,
|
|
"loss": 1.5595450401306152,
|
|
"mean_token_accuracy": 0.6467197947204113,
|
|
"num_tokens": 1330791.0,
|
|
"step": 196
|
|
},
|
|
{
|
|
"entropy": 1.3186984993517399,
|
|
"epoch": 2.866850321395776,
|
|
"grad_norm": 3.375,
|
|
"learning_rate": 3.6928969006490212e-06,
|
|
"loss": 1.2567209005355835,
|
|
"mean_token_accuracy": 0.6920228451490402,
|
|
"num_tokens": 1338599.0,
|
|
"step": 197
|
|
},
|
|
{
|
|
"entropy": 1.6448290199041367,
|
|
"epoch": 2.881542699724518,
|
|
"grad_norm": 3.65625,
|
|
"learning_rate": 3.6784918420649952e-06,
|
|
"loss": 1.6368999481201172,
|
|
"mean_token_accuracy": 0.6448632068932056,
|
|
"num_tokens": 1345432.0,
|
|
"step": 198
|
|
},
|
|
{
|
|
"entropy": 1.4667476452887058,
|
|
"epoch": 2.8962350780532597,
|
|
"grad_norm": 3.796875,
|
|
"learning_rate": 3.664036300812779e-06,
|
|
"loss": 1.43418288230896,
|
|
"mean_token_accuracy": 0.6738253943622112,
|
|
"num_tokens": 1351852.0,
|
|
"step": 199
|
|
},
|
|
{
|
|
"entropy": 1.4334681946784258,
|
|
"epoch": 2.9109274563820016,
|
|
"grad_norm": 3.390625,
|
|
"learning_rate": 3.64953089611965e-06,
|
|
"loss": 1.4027811288833618,
|
|
"mean_token_accuracy": 0.6735595650970936,
|
|
"num_tokens": 1358988.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 1.4100973345339298,
|
|
"epoch": 2.925619834710744,
|
|
"grad_norm": 3.484375,
|
|
"learning_rate": 3.634976249348867e-06,
|
|
"loss": 1.6405863761901855,
|
|
"mean_token_accuracy": 0.6570570301264524,
|
|
"num_tokens": 1367775.0,
|
|
"step": 201
|
|
},
|
|
{
|
|
"entropy": 1.3591697476804256,
|
|
"epoch": 2.9403122130394856,
|
|
"grad_norm": 3.375,
|
|
"learning_rate": 3.6203729839730567e-06,
|
|
"loss": 1.395234227180481,
|
|
"mean_token_accuracy": 0.6774147320538759,
|
|
"num_tokens": 1374916.0,
|
|
"step": 202
|
|
},
|
|
{
|
|
"entropy": 1.3890802599489689,
|
|
"epoch": 2.955004591368228,
|
|
"grad_norm": 3.484375,
|
|
"learning_rate": 3.6057217255475034e-06,
|
|
"loss": 1.4430360794067383,
|
|
"mean_token_accuracy": 0.6852261833846569,
|
|
"num_tokens": 1381503.0,
|
|
"step": 203
|
|
},
|
|
{
|
|
"entropy": 1.1807276085019112,
|
|
"epoch": 2.9696969696969697,
|
|
"grad_norm": 3.890625,
|
|
"learning_rate": 3.591023101683355e-06,
|
|
"loss": 1.140221118927002,
|
|
"mean_token_accuracy": 0.7134524993598461,
|
|
"num_tokens": 1387530.0,
|
|
"step": 204
|
|
},
|
|
{
|
|
"entropy": 1.5116482749581337,
|
|
"epoch": 2.9843893480257115,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 3.5762777420207382e-06,
|
|
"loss": 1.4948053359985352,
|
|
"mean_token_accuracy": 0.6597633976489305,
|
|
"num_tokens": 1394436.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 1.4017915055155754,
|
|
"epoch": 2.9990817263544534,
|
|
"grad_norm": 3.53125,
|
|
"learning_rate": 3.5614862782017833e-06,
|
|
"loss": 1.3626996278762817,
|
|
"mean_token_accuracy": 0.6668695509433746,
|
|
"num_tokens": 1402172.0,
|
|
"step": 206
|
|
},
|
|
{
|
|
"entropy": 2.1258840560913086,
|
|
"epoch": 3.0,
|
|
"grad_norm": 14.5,
|
|
"learning_rate": 3.5466493438435707e-06,
|
|
"loss": 2.277696132659912,
|
|
"mean_token_accuracy": 0.540145993232727,
|
|
"num_tokens": 1402584.0,
|
|
"step": 207
|
|
},
|
|
{
|
|
"entropy": 1.4028206542134285,
|
|
"epoch": 3.014692378328742,
|
|
"grad_norm": 3.609375,
|
|
"learning_rate": 3.531767574510987e-06,
|
|
"loss": 1.37511146068573,
|
|
"mean_token_accuracy": 0.6790164671838284,
|
|
"num_tokens": 1409137.0,
|
|
"step": 208
|
|
},
|
|
{
|
|
"entropy": 1.1828167587518692,
|
|
"epoch": 3.029384756657484,
|
|
"grad_norm": 3.421875,
|
|
"learning_rate": 3.516841607689501e-06,
|
|
"loss": 1.20148766040802,
|
|
"mean_token_accuracy": 0.700589882209897,
|
|
"num_tokens": 1416171.0,
|
|
"step": 209
|
|
},
|
|
{
|
|
"entropy": 1.3335931710898876,
|
|
"epoch": 3.044077134986226,
|
|
"grad_norm": 3.34375,
|
|
"learning_rate": 3.5018720827578523e-06,
|
|
"loss": 1.328529715538025,
|
|
"mean_token_accuracy": 0.6999878343194723,
|
|
"num_tokens": 1423155.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 1.6100288890302181,
|
|
"epoch": 3.0587695133149677,
|
|
"grad_norm": 3.84375,
|
|
"learning_rate": 3.486859640960668e-06,
|
|
"loss": 1.5763328075408936,
|
|
"mean_token_accuracy": 0.6437154449522495,
|
|
"num_tokens": 1429836.0,
|
|
"step": 211
|
|
},
|
|
{
|
|
"entropy": 1.2661379724740982,
|
|
"epoch": 3.07346189164371,
|
|
"grad_norm": 3.40625,
|
|
"learning_rate": 3.4718049253809894e-06,
|
|
"loss": 1.2684996128082275,
|
|
"mean_token_accuracy": 0.7078960034996271,
|
|
"num_tokens": 1436023.0,
|
|
"step": 212
|
|
},
|
|
{
|
|
"entropy": 1.5840991362929344,
|
|
"epoch": 3.088154269972452,
|
|
"grad_norm": 3.65625,
|
|
"learning_rate": 3.4567085809127247e-06,
|
|
"loss": 1.6588152647018433,
|
|
"mean_token_accuracy": 0.6349230799823999,
|
|
"num_tokens": 1444044.0,
|
|
"step": 213
|
|
},
|
|
{
|
|
"entropy": 1.4915673546493053,
|
|
"epoch": 3.1028466483011936,
|
|
"grad_norm": 4.21875,
|
|
"learning_rate": 3.441571254233027e-06,
|
|
"loss": 1.5922610759735107,
|
|
"mean_token_accuracy": 0.6678136102855206,
|
|
"num_tokens": 1450155.0,
|
|
"step": 214
|
|
},
|
|
{
|
|
"entropy": 1.3006605990231037,
|
|
"epoch": 3.117539026629936,
|
|
"grad_norm": 4.28125,
|
|
"learning_rate": 3.426393593774591e-06,
|
|
"loss": 1.3100212812423706,
|
|
"mean_token_accuracy": 0.6970670148730278,
|
|
"num_tokens": 1455544.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 1.5377335771918297,
|
|
"epoch": 3.1322314049586777,
|
|
"grad_norm": 3.859375,
|
|
"learning_rate": 3.4111762496978753e-06,
|
|
"loss": 1.5166088342666626,
|
|
"mean_token_accuracy": 0.6497631967067719,
|
|
"num_tokens": 1462550.0,
|
|
"step": 216
|
|
},
|
|
{
|
|
"entropy": 1.4202686659991741,
|
|
"epoch": 3.1469237832874195,
|
|
"grad_norm": 3.484375,
|
|
"learning_rate": 3.39591987386325e-06,
|
|
"loss": 1.4353243112564087,
|
|
"mean_token_accuracy": 0.6745712738484144,
|
|
"num_tokens": 1469996.0,
|
|
"step": 217
|
|
},
|
|
{
|
|
"entropy": 1.298032023012638,
|
|
"epoch": 3.1616161616161618,
|
|
"grad_norm": 3.71875,
|
|
"learning_rate": 3.3806251198030843e-06,
|
|
"loss": 1.3633654117584229,
|
|
"mean_token_accuracy": 0.7036402598023415,
|
|
"num_tokens": 1476262.0,
|
|
"step": 218
|
|
},
|
|
{
|
|
"entropy": 1.3053624369204044,
|
|
"epoch": 3.1763085399449036,
|
|
"grad_norm": 3.4375,
|
|
"learning_rate": 3.3652926426937327e-06,
|
|
"loss": 1.2723848819732666,
|
|
"mean_token_accuracy": 0.6963967196643353,
|
|
"num_tokens": 1482676.0,
|
|
"step": 219
|
|
},
|
|
{
|
|
"entropy": 1.1979273930191994,
|
|
"epoch": 3.1910009182736454,
|
|
"grad_norm": 3.25,
|
|
"learning_rate": 3.3499230993274857e-06,
|
|
"loss": 1.3043560981750488,
|
|
"mean_token_accuracy": 0.707458607852459,
|
|
"num_tokens": 1490483.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 1.27983458340168,
|
|
"epoch": 3.2056932966023877,
|
|
"grad_norm": 3.375,
|
|
"learning_rate": 3.3345171480844275e-06,
|
|
"loss": 1.2262349128723145,
|
|
"mean_token_accuracy": 0.7017333172261715,
|
|
"num_tokens": 1497544.0,
|
|
"step": 221
|
|
},
|
|
{
|
|
"entropy": 1.4334750287234783,
|
|
"epoch": 3.2203856749311295,
|
|
"grad_norm": 3.234375,
|
|
"learning_rate": 3.3190754489042343e-06,
|
|
"loss": 1.401545524597168,
|
|
"mean_token_accuracy": 0.676033478230238,
|
|
"num_tokens": 1505305.0,
|
|
"step": 222
|
|
},
|
|
{
|
|
"entropy": 1.3747035190463066,
|
|
"epoch": 3.2350780532598713,
|
|
"grad_norm": 3.53125,
|
|
"learning_rate": 3.303598663257904e-06,
|
|
"loss": 1.3610343933105469,
|
|
"mean_token_accuracy": 0.689595028758049,
|
|
"num_tokens": 1511573.0,
|
|
"step": 223
|
|
},
|
|
{
|
|
"entropy": 1.3864431343972683,
|
|
"epoch": 3.2497704315886136,
|
|
"grad_norm": 4.25,
|
|
"learning_rate": 3.288087454119425e-06,
|
|
"loss": 1.275547981262207,
|
|
"mean_token_accuracy": 0.6855261363089085,
|
|
"num_tokens": 1517044.0,
|
|
"step": 224
|
|
},
|
|
{
|
|
"entropy": 1.2548606544733047,
|
|
"epoch": 3.2644628099173554,
|
|
"grad_norm": 3.375,
|
|
"learning_rate": 3.272542485937369e-06,
|
|
"loss": 1.2298287153244019,
|
|
"mean_token_accuracy": 0.7007413618266582,
|
|
"num_tokens": 1523433.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 1.4298927076160908,
|
|
"epoch": 3.279155188246097,
|
|
"grad_norm": 4.09375,
|
|
"learning_rate": 3.256964424606437e-06,
|
|
"loss": 1.5319254398345947,
|
|
"mean_token_accuracy": 0.6646219603717327,
|
|
"num_tokens": 1529915.0,
|
|
"step": 226
|
|
},
|
|
{
|
|
"entropy": 1.2698330879211426,
|
|
"epoch": 3.2938475665748395,
|
|
"grad_norm": 3.4375,
|
|
"learning_rate": 3.2413539374389275e-06,
|
|
"loss": 1.1953558921813965,
|
|
"mean_token_accuracy": 0.7178251221776009,
|
|
"num_tokens": 1536954.0,
|
|
"step": 227
|
|
},
|
|
{
|
|
"entropy": 1.2876740600913763,
|
|
"epoch": 3.3085399449035813,
|
|
"grad_norm": 3.5625,
|
|
"learning_rate": 3.225711693136156e-06,
|
|
"loss": 1.283000111579895,
|
|
"mean_token_accuracy": 0.7038575522601604,
|
|
"num_tokens": 1543584.0,
|
|
"step": 228
|
|
},
|
|
{
|
|
"entropy": 1.370558850467205,
|
|
"epoch": 3.323232323232323,
|
|
"grad_norm": 3.78125,
|
|
"learning_rate": 3.2100383617598075e-06,
|
|
"loss": 1.3564451932907104,
|
|
"mean_token_accuracy": 0.6855217441916466,
|
|
"num_tokens": 1550381.0,
|
|
"step": 229
|
|
},
|
|
{
|
|
"entropy": 1.4030266143381596,
|
|
"epoch": 3.3379247015610654,
|
|
"grad_norm": 3.9375,
|
|
"learning_rate": 3.194334614703231e-06,
|
|
"loss": 1.4133769273757935,
|
|
"mean_token_accuracy": 0.6617723945528269,
|
|
"num_tokens": 1558228.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 1.3796805329620838,
|
|
"epoch": 3.352617079889807,
|
|
"grad_norm": 4.03125,
|
|
"learning_rate": 3.1786011246626858e-06,
|
|
"loss": 1.3643141984939575,
|
|
"mean_token_accuracy": 0.6847334876656532,
|
|
"num_tokens": 1564241.0,
|
|
"step": 231
|
|
},
|
|
{
|
|
"entropy": 1.366750942543149,
|
|
"epoch": 3.367309458218549,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 3.1628385656085204e-06,
|
|
"loss": 1.2604291439056396,
|
|
"mean_token_accuracy": 0.6975630149245262,
|
|
"num_tokens": 1569807.0,
|
|
"step": 232
|
|
},
|
|
{
|
|
"entropy": 1.5372787863016129,
|
|
"epoch": 3.3820018365472913,
|
|
"grad_norm": 3.625,
|
|
"learning_rate": 3.147047612756302e-06,
|
|
"loss": 1.6237632036209106,
|
|
"mean_token_accuracy": 0.648850180208683,
|
|
"num_tokens": 1576906.0,
|
|
"step": 233
|
|
},
|
|
{
|
|
"entropy": 1.3469244949519634,
|
|
"epoch": 3.396694214876033,
|
|
"grad_norm": 3.59375,
|
|
"learning_rate": 3.131228942537895e-06,
|
|
"loss": 1.4804552793502808,
|
|
"mean_token_accuracy": 0.6760262958705425,
|
|
"num_tokens": 1584543.0,
|
|
"step": 234
|
|
},
|
|
{
|
|
"entropy": 1.327893067151308,
|
|
"epoch": 3.411386593204775,
|
|
"grad_norm": 3.84375,
|
|
"learning_rate": 3.115383232572483e-06,
|
|
"loss": 1.3390090465545654,
|
|
"mean_token_accuracy": 0.6867328248918056,
|
|
"num_tokens": 1590633.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 1.5125273801386356,
|
|
"epoch": 3.426078971533517,
|
|
"grad_norm": 3.703125,
|
|
"learning_rate": 3.0995111616375417e-06,
|
|
"loss": 1.5971745252609253,
|
|
"mean_token_accuracy": 0.6557733975350857,
|
|
"num_tokens": 1598163.0,
|
|
"step": 236
|
|
},
|
|
{
|
|
"entropy": 1.2399644292891026,
|
|
"epoch": 3.440771349862259,
|
|
"grad_norm": 3.359375,
|
|
"learning_rate": 3.0836134096397642e-06,
|
|
"loss": 1.1880427598953247,
|
|
"mean_token_accuracy": 0.7046547196805477,
|
|
"num_tokens": 1604955.0,
|
|
"step": 237
|
|
},
|
|
{
|
|
"entropy": 1.4055788703262806,
|
|
"epoch": 3.455463728191001,
|
|
"grad_norm": 3.640625,
|
|
"learning_rate": 3.0676906575859335e-06,
|
|
"loss": 1.4325823783874512,
|
|
"mean_token_accuracy": 0.6649295091629028,
|
|
"num_tokens": 1612441.0,
|
|
"step": 238
|
|
},
|
|
{
|
|
"entropy": 1.3476755023002625,
|
|
"epoch": 3.470156106519743,
|
|
"grad_norm": 3.921875,
|
|
"learning_rate": 3.051743587553754e-06,
|
|
"loss": 1.304101586341858,
|
|
"mean_token_accuracy": 0.6877517551183701,
|
|
"num_tokens": 1618692.0,
|
|
"step": 239
|
|
},
|
|
{
|
|
"entropy": 1.470371063798666,
|
|
"epoch": 3.484848484848485,
|
|
"grad_norm": 4.15625,
|
|
"learning_rate": 3.035772882662627e-06,
|
|
"loss": 1.3992159366607666,
|
|
"mean_token_accuracy": 0.6673696786165237,
|
|
"num_tokens": 1624467.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 1.363200306892395,
|
|
"epoch": 3.4995408631772267,
|
|
"grad_norm": 3.859375,
|
|
"learning_rate": 3.019779227044398e-06,
|
|
"loss": 1.4320815801620483,
|
|
"mean_token_accuracy": 0.693215861916542,
|
|
"num_tokens": 1630855.0,
|
|
"step": 241
|
|
},
|
|
{
|
|
"entropy": 1.499861165881157,
|
|
"epoch": 3.514233241505969,
|
|
"grad_norm": 3.734375,
|
|
"learning_rate": 3.0037633058140433e-06,
|
|
"loss": 1.635284662246704,
|
|
"mean_token_accuracy": 0.6594692952930927,
|
|
"num_tokens": 1638233.0,
|
|
"step": 242
|
|
},
|
|
{
|
|
"entropy": 1.4187380149960518,
|
|
"epoch": 3.5289256198347108,
|
|
"grad_norm": 3.71875,
|
|
"learning_rate": 2.9877258050403214e-06,
|
|
"loss": 1.4408353567123413,
|
|
"mean_token_accuracy": 0.6815416235476732,
|
|
"num_tokens": 1644307.0,
|
|
"step": 243
|
|
},
|
|
{
|
|
"entropy": 1.2909758538007736,
|
|
"epoch": 3.5436179981634526,
|
|
"grad_norm": 3.65625,
|
|
"learning_rate": 2.9716674117163886e-06,
|
|
"loss": 1.1600507497787476,
|
|
"mean_token_accuracy": 0.7066163346171379,
|
|
"num_tokens": 1650324.0,
|
|
"step": 244
|
|
},
|
|
{
|
|
"entropy": 1.1622727885842323,
|
|
"epoch": 3.558310376492195,
|
|
"grad_norm": 3.890625,
|
|
"learning_rate": 2.9555888137303695e-06,
|
|
"loss": 1.1322875022888184,
|
|
"mean_token_accuracy": 0.7267099879682064,
|
|
"num_tokens": 1655841.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 1.499529305845499,
|
|
"epoch": 3.5730027548209367,
|
|
"grad_norm": 4.03125,
|
|
"learning_rate": 2.939490699835887e-06,
|
|
"loss": 1.5371626615524292,
|
|
"mean_token_accuracy": 0.6663133464753628,
|
|
"num_tokens": 1661462.0,
|
|
"step": 246
|
|
},
|
|
{
|
|
"entropy": 1.4221386834979057,
|
|
"epoch": 3.5876951331496785,
|
|
"grad_norm": 3.859375,
|
|
"learning_rate": 2.9233737596225616e-06,
|
|
"loss": 1.4404072761535645,
|
|
"mean_token_accuracy": 0.6666534543037415,
|
|
"num_tokens": 1668939.0,
|
|
"step": 247
|
|
},
|
|
{
|
|
"entropy": 1.2013383097946644,
|
|
"epoch": 3.6023875114784207,
|
|
"grad_norm": 3.40625,
|
|
"learning_rate": 2.9072386834864723e-06,
|
|
"loss": 1.2697336673736572,
|
|
"mean_token_accuracy": 0.7121818475425243,
|
|
"num_tokens": 1677284.0,
|
|
"step": 248
|
|
},
|
|
{
|
|
"entropy": 1.370607167482376,
|
|
"epoch": 3.6170798898071626,
|
|
"grad_norm": 4.0,
|
|
"learning_rate": 2.8910861626005774e-06,
|
|
"loss": 1.4362889528274536,
|
|
"mean_token_accuracy": 0.6732826940715313,
|
|
"num_tokens": 1684406.0,
|
|
"step": 249
|
|
},
|
|
{
|
|
"entropy": 1.5012065656483173,
|
|
"epoch": 3.6317722681359044,
|
|
"grad_norm": 3.390625,
|
|
"learning_rate": 2.8749168888851126e-06,
|
|
"loss": 1.5818341970443726,
|
|
"mean_token_accuracy": 0.6568798068910837,
|
|
"num_tokens": 1692633.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 1.1994779370725155,
|
|
"epoch": 3.6464646464646466,
|
|
"grad_norm": 3.515625,
|
|
"learning_rate": 2.858731554977948e-06,
|
|
"loss": 1.1643537282943726,
|
|
"mean_token_accuracy": 0.716903805732727,
|
|
"num_tokens": 1699505.0,
|
|
"step": 251
|
|
},
|
|
{
|
|
"entropy": 1.4587336257100105,
|
|
"epoch": 3.6611570247933884,
|
|
"grad_norm": 3.71875,
|
|
"learning_rate": 2.8425308542049208e-06,
|
|
"loss": 1.6454309225082397,
|
|
"mean_token_accuracy": 0.6632649935781956,
|
|
"num_tokens": 1705828.0,
|
|
"step": 252
|
|
},
|
|
{
|
|
"entropy": 1.20838226005435,
|
|
"epoch": 3.6758494031221303,
|
|
"grad_norm": 2.921875,
|
|
"learning_rate": 2.82631548055013e-06,
|
|
"loss": 1.144212007522583,
|
|
"mean_token_accuracy": 0.7113795578479767,
|
|
"num_tokens": 1713697.0,
|
|
"step": 253
|
|
},
|
|
{
|
|
"entropy": 1.4037099555134773,
|
|
"epoch": 3.6905417814508725,
|
|
"grad_norm": 4.03125,
|
|
"learning_rate": 2.8100861286262137e-06,
|
|
"loss": 1.4409031867980957,
|
|
"mean_token_accuracy": 0.6661250051110983,
|
|
"num_tokens": 1721124.0,
|
|
"step": 254
|
|
},
|
|
{
|
|
"entropy": 1.344285275787115,
|
|
"epoch": 3.7052341597796143,
|
|
"grad_norm": 3.703125,
|
|
"learning_rate": 2.7938434936445946e-06,
|
|
"loss": 1.3361552953720093,
|
|
"mean_token_accuracy": 0.6808187067508698,
|
|
"num_tokens": 1727536.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 1.53640878200531,
|
|
"epoch": 3.719926538108356,
|
|
"grad_norm": 4.25,
|
|
"learning_rate": 2.7775882713856946e-06,
|
|
"loss": 1.504386067390442,
|
|
"mean_token_accuracy": 0.6495453417301178,
|
|
"num_tokens": 1733126.0,
|
|
"step": 256
|
|
},
|
|
{
|
|
"epoch": 3.719926538108356,
|
|
"eval_entropy": 1.3394816517829895,
|
|
"eval_loss": 1.415405035018921,
|
|
"eval_mean_token_accuracy": 0.676714651286602,
|
|
"eval_num_tokens": 1733126.0,
|
|
"eval_runtime": 1.677,
|
|
"eval_samples_per_second": 34.585,
|
|
"eval_steps_per_second": 4.77,
|
|
"step": 256
|
|
},
|
|
{
|
|
"entropy": 1.3805672824382782,
|
|
"epoch": 3.734618916437098,
|
|
"grad_norm": 3.265625,
|
|
"learning_rate": 2.761321158169134e-06,
|
|
"loss": 1.276710033416748,
|
|
"mean_token_accuracy": 0.6919525004923344,
|
|
"num_tokens": 1740583.0,
|
|
"step": 257
|
|
},
|
|
{
|
|
"entropy": 1.4145719185471535,
|
|
"epoch": 3.7493112947658402,
|
|
"grad_norm": 3.25,
|
|
"learning_rate": 2.7450428508239024e-06,
|
|
"loss": 1.492904543876648,
|
|
"mean_token_accuracy": 0.6696864552795887,
|
|
"num_tokens": 1749615.0,
|
|
"step": 258
|
|
},
|
|
{
|
|
"entropy": 1.3122917041182518,
|
|
"epoch": 3.764003673094582,
|
|
"grad_norm": 4.5,
|
|
"learning_rate": 2.7287540466585067e-06,
|
|
"loss": 1.3468831777572632,
|
|
"mean_token_accuracy": 0.6847816966474056,
|
|
"num_tokens": 1755286.0,
|
|
"step": 259
|
|
},
|
|
{
|
|
"entropy": 1.2696636728942394,
|
|
"epoch": 3.7786960514233243,
|
|
"grad_norm": 3.328125,
|
|
"learning_rate": 2.7124554434311047e-06,
|
|
"loss": 1.251458764076233,
|
|
"mean_token_accuracy": 0.695728961378336,
|
|
"num_tokens": 1763474.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 1.477790392935276,
|
|
"epoch": 3.793388429752066,
|
|
"grad_norm": 3.859375,
|
|
"learning_rate": 2.696147739319613e-06,
|
|
"loss": 1.4353973865509033,
|
|
"mean_token_accuracy": 0.6688569448888302,
|
|
"num_tokens": 1770417.0,
|
|
"step": 261
|
|
},
|
|
{
|
|
"entropy": 1.2040468826889992,
|
|
"epoch": 3.808080808080808,
|
|
"grad_norm": 3.796875,
|
|
"learning_rate": 2.6798316328917988e-06,
|
|
"loss": 1.1171551942825317,
|
|
"mean_token_accuracy": 0.7124636992812157,
|
|
"num_tokens": 1776268.0,
|
|
"step": 262
|
|
},
|
|
{
|
|
"entropy": 1.334158930927515,
|
|
"epoch": 3.8227731864095498,
|
|
"grad_norm": 3.671875,
|
|
"learning_rate": 2.663507823075358e-06,
|
|
"loss": 1.442462682723999,
|
|
"mean_token_accuracy": 0.6851038224995136,
|
|
"num_tokens": 1784097.0,
|
|
"step": 263
|
|
},
|
|
{
|
|
"entropy": 1.2884401306509972,
|
|
"epoch": 3.837465564738292,
|
|
"grad_norm": 3.703125,
|
|
"learning_rate": 2.6471770091279725e-06,
|
|
"loss": 1.2929211854934692,
|
|
"mean_token_accuracy": 0.6950427368283272,
|
|
"num_tokens": 1790700.0,
|
|
"step": 264
|
|
},
|
|
{
|
|
"entropy": 1.455569889396429,
|
|
"epoch": 3.852157943067034,
|
|
"grad_norm": 3.703125,
|
|
"learning_rate": 2.6308398906073603e-06,
|
|
"loss": 1.5066760778427124,
|
|
"mean_token_accuracy": 0.6729711573570967,
|
|
"num_tokens": 1797581.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 1.3149556033313274,
|
|
"epoch": 3.866850321395776,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 2.6144971673413023e-06,
|
|
"loss": 1.345811128616333,
|
|
"mean_token_accuracy": 0.6892800442874432,
|
|
"num_tokens": 1804984.0,
|
|
"step": 266
|
|
},
|
|
{
|
|
"entropy": 1.5259748809039593,
|
|
"epoch": 3.881542699724518,
|
|
"grad_norm": 4.09375,
|
|
"learning_rate": 2.5981495393976718e-06,
|
|
"loss": 1.560139775276184,
|
|
"mean_token_accuracy": 0.6488614473491907,
|
|
"num_tokens": 1810652.0,
|
|
"step": 267
|
|
},
|
|
{
|
|
"entropy": 1.6216607764363289,
|
|
"epoch": 3.8962350780532597,
|
|
"grad_norm": 4.09375,
|
|
"learning_rate": 2.5817977070544408e-06,
|
|
"loss": 1.6535224914550781,
|
|
"mean_token_accuracy": 0.6346077732741833,
|
|
"num_tokens": 1817391.0,
|
|
"step": 268
|
|
},
|
|
{
|
|
"entropy": 1.465709399431944,
|
|
"epoch": 3.9109274563820016,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 2.5654423707696834e-06,
|
|
"loss": 1.5264501571655273,
|
|
"mean_token_accuracy": 0.6553380750119686,
|
|
"num_tokens": 1825473.0,
|
|
"step": 269
|
|
},
|
|
{
|
|
"entropy": 1.3719651140272617,
|
|
"epoch": 3.925619834710744,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 2.5490842311515706e-06,
|
|
"loss": 1.405755639076233,
|
|
"mean_token_accuracy": 0.6818547490984201,
|
|
"num_tokens": 1830992.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 1.30729578435421,
|
|
"epoch": 3.9403122130394856,
|
|
"grad_norm": 3.125,
|
|
"learning_rate": 2.5327239889283613e-06,
|
|
"loss": 1.2894923686981201,
|
|
"mean_token_accuracy": 0.6817844286561012,
|
|
"num_tokens": 1838915.0,
|
|
"step": 271
|
|
},
|
|
{
|
|
"entropy": 1.6357484422624111,
|
|
"epoch": 3.955004591368228,
|
|
"grad_norm": 3.515625,
|
|
"learning_rate": 2.5163623449183797e-06,
|
|
"loss": 1.7700730562210083,
|
|
"mean_token_accuracy": 0.6349711399525404,
|
|
"num_tokens": 1847192.0,
|
|
"step": 272
|
|
},
|
|
{
|
|
"entropy": 1.42452397570014,
|
|
"epoch": 3.9696969696969697,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 2.5e-06,
|
|
"loss": 1.4052257537841797,
|
|
"mean_token_accuracy": 0.670428641140461,
|
|
"num_tokens": 1853853.0,
|
|
"step": 273
|
|
},
|
|
{
|
|
"entropy": 1.2537009306252003,
|
|
"epoch": 3.9843893480257115,
|
|
"grad_norm": 3.390625,
|
|
"learning_rate": 2.4836376550816207e-06,
|
|
"loss": 1.320694088935852,
|
|
"mean_token_accuracy": 0.708381250500679,
|
|
"num_tokens": 1860810.0,
|
|
"step": 274
|
|
},
|
|
{
|
|
"entropy": 1.3919994123280048,
|
|
"epoch": 3.9990817263544534,
|
|
"grad_norm": 3.265625,
|
|
"learning_rate": 2.4672760110716395e-06,
|
|
"loss": 1.4379889965057373,
|
|
"mean_token_accuracy": 0.6738540474325418,
|
|
"num_tokens": 1869708.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 1.3144437074661255,
|
|
"epoch": 4.0,
|
|
"grad_norm": 17.25,
|
|
"learning_rate": 2.45091576884843e-06,
|
|
"loss": 1.3155449628829956,
|
|
"mean_token_accuracy": 0.6774193644523621,
|
|
"num_tokens": 1870112.0,
|
|
"step": 276
|
|
},
|
|
{
|
|
"entropy": 1.3445695489645004,
|
|
"epoch": 4.014692378328742,
|
|
"grad_norm": 3.515625,
|
|
"learning_rate": 2.434557629230318e-06,
|
|
"loss": 1.3874691724777222,
|
|
"mean_token_accuracy": 0.689882904291153,
|
|
"num_tokens": 1877614.0,
|
|
"step": 277
|
|
},
|
|
{
|
|
"entropy": 1.5155527740716934,
|
|
"epoch": 4.029384756657484,
|
|
"grad_norm": 4.75,
|
|
"learning_rate": 2.41820229294556e-06,
|
|
"loss": 1.4299671649932861,
|
|
"mean_token_accuracy": 0.6567830629646778,
|
|
"num_tokens": 1882254.0,
|
|
"step": 278
|
|
},
|
|
{
|
|
"entropy": 1.2426442056894302,
|
|
"epoch": 4.044077134986226,
|
|
"grad_norm": 3.875,
|
|
"learning_rate": 2.4018504606023295e-06,
|
|
"loss": 1.2982856035232544,
|
|
"mean_token_accuracy": 0.6901493407785892,
|
|
"num_tokens": 1888736.0,
|
|
"step": 279
|
|
},
|
|
{
|
|
"entropy": 1.4402744472026825,
|
|
"epoch": 4.058769513314968,
|
|
"grad_norm": 3.59375,
|
|
"learning_rate": 2.385502832658699e-06,
|
|
"loss": 1.526076078414917,
|
|
"mean_token_accuracy": 0.6596730649471283,
|
|
"num_tokens": 1896071.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 1.3624507710337639,
|
|
"epoch": 4.07346189164371,
|
|
"grad_norm": 3.46875,
|
|
"learning_rate": 2.3691601093926406e-06,
|
|
"loss": 1.5298748016357422,
|
|
"mean_token_accuracy": 0.672961063683033,
|
|
"num_tokens": 1904222.0,
|
|
"step": 281
|
|
},
|
|
{
|
|
"entropy": 1.518877875059843,
|
|
"epoch": 4.088154269972452,
|
|
"grad_norm": 3.5625,
|
|
"learning_rate": 2.3528229908720275e-06,
|
|
"loss": 1.5621310472488403,
|
|
"mean_token_accuracy": 0.6666549891233444,
|
|
"num_tokens": 1911418.0,
|
|
"step": 282
|
|
},
|
|
{
|
|
"entropy": 1.408459410071373,
|
|
"epoch": 4.102846648301194,
|
|
"grad_norm": 3.796875,
|
|
"learning_rate": 2.3364921769246423e-06,
|
|
"loss": 1.5553926229476929,
|
|
"mean_token_accuracy": 0.6725407186895609,
|
|
"num_tokens": 1919869.0,
|
|
"step": 283
|
|
},
|
|
{
|
|
"entropy": 1.5305853299796581,
|
|
"epoch": 4.117539026629935,
|
|
"grad_norm": 4.28125,
|
|
"learning_rate": 2.3201683671082016e-06,
|
|
"loss": 1.5349313020706177,
|
|
"mean_token_accuracy": 0.6633393950760365,
|
|
"num_tokens": 1924769.0,
|
|
"step": 284
|
|
},
|
|
{
|
|
"entropy": 1.3491091206669807,
|
|
"epoch": 4.132231404958677,
|
|
"grad_norm": 3.703125,
|
|
"learning_rate": 2.3038522606803882e-06,
|
|
"loss": 1.3523074388504028,
|
|
"mean_token_accuracy": 0.6839871145784855,
|
|
"num_tokens": 1931557.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 1.4780425243079662,
|
|
"epoch": 4.14692378328742,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 2.287544556568896e-06,
|
|
"loss": 1.565530776977539,
|
|
"mean_token_accuracy": 0.6532882675528526,
|
|
"num_tokens": 1938144.0,
|
|
"step": 286
|
|
},
|
|
{
|
|
"entropy": 1.440908256918192,
|
|
"epoch": 4.161616161616162,
|
|
"grad_norm": 3.453125,
|
|
"learning_rate": 2.271245953341494e-06,
|
|
"loss": 1.4530143737792969,
|
|
"mean_token_accuracy": 0.6627631783485413,
|
|
"num_tokens": 1945396.0,
|
|
"step": 287
|
|
},
|
|
{
|
|
"entropy": 1.297002412378788,
|
|
"epoch": 4.176308539944904,
|
|
"grad_norm": 3.765625,
|
|
"learning_rate": 2.2549571491760985e-06,
|
|
"loss": 1.3622161149978638,
|
|
"mean_token_accuracy": 0.6911368295550346,
|
|
"num_tokens": 1952257.0,
|
|
"step": 288
|
|
},
|
|
{
|
|
"entropy": 1.1849941164255142,
|
|
"epoch": 4.191000918273645,
|
|
"grad_norm": 3.203125,
|
|
"learning_rate": 2.238678841830867e-06,
|
|
"loss": 1.1791870594024658,
|
|
"mean_token_accuracy": 0.7209084387868643,
|
|
"num_tokens": 1960488.0,
|
|
"step": 289
|
|
},
|
|
{
|
|
"entropy": 1.609047919511795,
|
|
"epoch": 4.205693296602387,
|
|
"grad_norm": 4.4375,
|
|
"learning_rate": 2.2224117286143063e-06,
|
|
"loss": 1.7078322172164917,
|
|
"mean_token_accuracy": 0.639599371701479,
|
|
"num_tokens": 1966355.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 1.2323498874902725,
|
|
"epoch": 4.22038567493113,
|
|
"grad_norm": 3.734375,
|
|
"learning_rate": 2.2061565063554063e-06,
|
|
"loss": 1.1928443908691406,
|
|
"mean_token_accuracy": 0.7171883173286915,
|
|
"num_tokens": 1973373.0,
|
|
"step": 291
|
|
},
|
|
{
|
|
"entropy": 1.3962544910609722,
|
|
"epoch": 4.235078053259872,
|
|
"grad_norm": 3.78125,
|
|
"learning_rate": 2.1899138713737876e-06,
|
|
"loss": 1.3652687072753906,
|
|
"mean_token_accuracy": 0.673550434410572,
|
|
"num_tokens": 1980499.0,
|
|
"step": 292
|
|
},
|
|
{
|
|
"entropy": 1.2281092554330826,
|
|
"epoch": 4.249770431588614,
|
|
"grad_norm": 3.359375,
|
|
"learning_rate": 2.173684519449872e-06,
|
|
"loss": 1.2023086547851562,
|
|
"mean_token_accuracy": 0.7110824286937714,
|
|
"num_tokens": 1987656.0,
|
|
"step": 293
|
|
},
|
|
{
|
|
"entropy": 1.4712859019637108,
|
|
"epoch": 4.264462809917355,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 2.1574691457950805e-06,
|
|
"loss": 1.5222008228302002,
|
|
"mean_token_accuracy": 0.6687300186604261,
|
|
"num_tokens": 1993905.0,
|
|
"step": 294
|
|
},
|
|
{
|
|
"entropy": 1.006372582167387,
|
|
"epoch": 4.279155188246097,
|
|
"grad_norm": 2.84375,
|
|
"learning_rate": 2.1412684450220524e-06,
|
|
"loss": 0.9414258003234863,
|
|
"mean_token_accuracy": 0.7512795105576515,
|
|
"num_tokens": 2002852.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 1.3876865170896053,
|
|
"epoch": 4.293847566574839,
|
|
"grad_norm": 3.828125,
|
|
"learning_rate": 2.1250831111148873e-06,
|
|
"loss": 1.3618916273117065,
|
|
"mean_token_accuracy": 0.68668382614851,
|
|
"num_tokens": 2009321.0,
|
|
"step": 296
|
|
},
|
|
{
|
|
"entropy": 1.117498192936182,
|
|
"epoch": 4.308539944903581,
|
|
"grad_norm": 3.625,
|
|
"learning_rate": 2.1089138373994226e-06,
|
|
"loss": 1.0988891124725342,
|
|
"mean_token_accuracy": 0.7218944355845451,
|
|
"num_tokens": 2015800.0,
|
|
"step": 297
|
|
},
|
|
{
|
|
"entropy": 1.300987858325243,
|
|
"epoch": 4.3232323232323235,
|
|
"grad_norm": 3.734375,
|
|
"learning_rate": 2.0927613165135285e-06,
|
|
"loss": 1.2009758949279785,
|
|
"mean_token_accuracy": 0.692545972764492,
|
|
"num_tokens": 2022915.0,
|
|
"step": 298
|
|
},
|
|
{
|
|
"entropy": 1.3463373892009258,
|
|
"epoch": 4.337924701561065,
|
|
"grad_norm": 3.8125,
|
|
"learning_rate": 2.0766262403774388e-06,
|
|
"loss": 1.3616676330566406,
|
|
"mean_token_accuracy": 0.6792471520602703,
|
|
"num_tokens": 2029521.0,
|
|
"step": 299
|
|
},
|
|
{
|
|
"entropy": 1.3613272085785866,
|
|
"epoch": 4.352617079889807,
|
|
"grad_norm": 3.34375,
|
|
"learning_rate": 2.0605093001641138e-06,
|
|
"loss": 1.2829806804656982,
|
|
"mean_token_accuracy": 0.682598739862442,
|
|
"num_tokens": 2037788.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 1.3312906958162785,
|
|
"epoch": 4.367309458218549,
|
|
"grad_norm": 4.15625,
|
|
"learning_rate": 2.0444111862696313e-06,
|
|
"loss": 1.2278118133544922,
|
|
"mean_token_accuracy": 0.6921750158071518,
|
|
"num_tokens": 2044247.0,
|
|
"step": 301
|
|
},
|
|
{
|
|
"entropy": 1.3146127797663212,
|
|
"epoch": 4.382001836547291,
|
|
"grad_norm": 3.609375,
|
|
"learning_rate": 2.0283325882836126e-06,
|
|
"loss": 1.3366804122924805,
|
|
"mean_token_accuracy": 0.7049176283180714,
|
|
"num_tokens": 2051380.0,
|
|
"step": 302
|
|
},
|
|
{
|
|
"entropy": 1.7008662782609463,
|
|
"epoch": 4.3966942148760335,
|
|
"grad_norm": 4.46875,
|
|
"learning_rate": 2.01227419495968e-06,
|
|
"loss": 1.6468967199325562,
|
|
"mean_token_accuracy": 0.6412950064986944,
|
|
"num_tokens": 2056909.0,
|
|
"step": 303
|
|
},
|
|
{
|
|
"entropy": 1.3109636045992374,
|
|
"epoch": 4.411386593204775,
|
|
"grad_norm": 4.0,
|
|
"learning_rate": 1.996236694185957e-06,
|
|
"loss": 1.3502867221832275,
|
|
"mean_token_accuracy": 0.6835893988609314,
|
|
"num_tokens": 2063308.0,
|
|
"step": 304
|
|
},
|
|
{
|
|
"entropy": 1.3513622879981995,
|
|
"epoch": 4.426078971533517,
|
|
"grad_norm": 3.40625,
|
|
"learning_rate": 1.9802207729556023e-06,
|
|
"loss": 1.4691942930221558,
|
|
"mean_token_accuracy": 0.688535138964653,
|
|
"num_tokens": 2071132.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 1.1963273957371712,
|
|
"epoch": 4.440771349862259,
|
|
"grad_norm": 2.9375,
|
|
"learning_rate": 1.964227117337374e-06,
|
|
"loss": 1.2708735466003418,
|
|
"mean_token_accuracy": 0.707404674962163,
|
|
"num_tokens": 2080159.0,
|
|
"step": 306
|
|
},
|
|
{
|
|
"entropy": 1.4165263511240482,
|
|
"epoch": 4.455463728191001,
|
|
"grad_norm": 4.5,
|
|
"learning_rate": 1.9482564124462478e-06,
|
|
"loss": 1.335919976234436,
|
|
"mean_token_accuracy": 0.6746666543185711,
|
|
"num_tokens": 2085158.0,
|
|
"step": 307
|
|
},
|
|
{
|
|
"entropy": 1.2119375206530094,
|
|
"epoch": 4.470156106519743,
|
|
"grad_norm": 3.40625,
|
|
"learning_rate": 1.9323093424140673e-06,
|
|
"loss": 1.1746174097061157,
|
|
"mean_token_accuracy": 0.717445420101285,
|
|
"num_tokens": 2091050.0,
|
|
"step": 308
|
|
},
|
|
{
|
|
"entropy": 1.4248721897602081,
|
|
"epoch": 4.484848484848484,
|
|
"grad_norm": 3.40625,
|
|
"learning_rate": 1.9163865903602374e-06,
|
|
"loss": 1.3420960903167725,
|
|
"mean_token_accuracy": 0.6748971827328205,
|
|
"num_tokens": 2098572.0,
|
|
"step": 309
|
|
},
|
|
{
|
|
"entropy": 1.3115894440561533,
|
|
"epoch": 4.499540863177227,
|
|
"grad_norm": 4.3125,
|
|
"learning_rate": 1.9004888383624596e-06,
|
|
"loss": 1.261577844619751,
|
|
"mean_token_accuracy": 0.6892781518399715,
|
|
"num_tokens": 2104436.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 1.2737204805016518,
|
|
"epoch": 4.514233241505969,
|
|
"grad_norm": 3.25,
|
|
"learning_rate": 1.8846167674275175e-06,
|
|
"loss": 1.2144405841827393,
|
|
"mean_token_accuracy": 0.7029564455151558,
|
|
"num_tokens": 2111721.0,
|
|
"step": 311
|
|
},
|
|
{
|
|
"entropy": 1.451767385005951,
|
|
"epoch": 4.528925619834711,
|
|
"grad_norm": 3.625,
|
|
"learning_rate": 1.8687710574621051e-06,
|
|
"loss": 1.5465962886810303,
|
|
"mean_token_accuracy": 0.6663695089519024,
|
|
"num_tokens": 2118973.0,
|
|
"step": 312
|
|
},
|
|
{
|
|
"entropy": 1.4293602593243122,
|
|
"epoch": 4.543617998163453,
|
|
"grad_norm": 3.6875,
|
|
"learning_rate": 1.852952387243698e-06,
|
|
"loss": 1.3797839879989624,
|
|
"mean_token_accuracy": 0.6705848015844822,
|
|
"num_tokens": 2125146.0,
|
|
"step": 313
|
|
},
|
|
{
|
|
"entropy": 1.237271387130022,
|
|
"epoch": 4.558310376492194,
|
|
"grad_norm": 4.0,
|
|
"learning_rate": 1.8371614343914798e-06,
|
|
"loss": 1.2865500450134277,
|
|
"mean_token_accuracy": 0.7063010260462761,
|
|
"num_tokens": 2131187.0,
|
|
"step": 314
|
|
},
|
|
{
|
|
"entropy": 1.3471387848258018,
|
|
"epoch": 4.573002754820937,
|
|
"grad_norm": 4.03125,
|
|
"learning_rate": 1.8213988753373147e-06,
|
|
"loss": 1.395220398902893,
|
|
"mean_token_accuracy": 0.6758437845855951,
|
|
"num_tokens": 2137515.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 1.3820691257715225,
|
|
"epoch": 4.587695133149679,
|
|
"grad_norm": 3.640625,
|
|
"learning_rate": 1.8056653852967699e-06,
|
|
"loss": 1.4266245365142822,
|
|
"mean_token_accuracy": 0.6850110292434692,
|
|
"num_tokens": 2144669.0,
|
|
"step": 316
|
|
},
|
|
{
|
|
"entropy": 1.318991456180811,
|
|
"epoch": 4.602387511478421,
|
|
"grad_norm": 3.8125,
|
|
"learning_rate": 1.7899616382401935e-06,
|
|
"loss": 1.3007704019546509,
|
|
"mean_token_accuracy": 0.6927743554115295,
|
|
"num_tokens": 2150840.0,
|
|
"step": 317
|
|
},
|
|
{
|
|
"entropy": 1.2886290848255157,
|
|
"epoch": 4.6170798898071626,
|
|
"grad_norm": 3.328125,
|
|
"learning_rate": 1.7742883068638447e-06,
|
|
"loss": 1.2548828125,
|
|
"mean_token_accuracy": 0.708765309303999,
|
|
"num_tokens": 2158229.0,
|
|
"step": 318
|
|
},
|
|
{
|
|
"entropy": 1.4580038003623486,
|
|
"epoch": 4.631772268135904,
|
|
"grad_norm": 4.15625,
|
|
"learning_rate": 1.758646062561073e-06,
|
|
"loss": 1.4200549125671387,
|
|
"mean_token_accuracy": 0.6693456768989563,
|
|
"num_tokens": 2164006.0,
|
|
"step": 319
|
|
},
|
|
{
|
|
"entropy": 1.2778888307511806,
|
|
"epoch": 4.646464646464646,
|
|
"grad_norm": 3.859375,
|
|
"learning_rate": 1.743035575393564e-06,
|
|
"loss": 1.2976222038269043,
|
|
"mean_token_accuracy": 0.7035369873046875,
|
|
"num_tokens": 2170612.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 1.2509517259895802,
|
|
"epoch": 4.661157024793388,
|
|
"grad_norm": 3.671875,
|
|
"learning_rate": 1.7274575140626318e-06,
|
|
"loss": 1.2400519847869873,
|
|
"mean_token_accuracy": 0.7016804441809654,
|
|
"num_tokens": 2176823.0,
|
|
"step": 321
|
|
},
|
|
{
|
|
"entropy": 1.322471171617508,
|
|
"epoch": 4.675849403122131,
|
|
"grad_norm": 3.609375,
|
|
"learning_rate": 1.7119125458805767e-06,
|
|
"loss": 1.3159892559051514,
|
|
"mean_token_accuracy": 0.689349815249443,
|
|
"num_tokens": 2183067.0,
|
|
"step": 322
|
|
},
|
|
{
|
|
"entropy": 1.4905264489352703,
|
|
"epoch": 4.6905417814508725,
|
|
"grad_norm": 3.296875,
|
|
"learning_rate": 1.6964013367420967e-06,
|
|
"loss": 1.5832772254943848,
|
|
"mean_token_accuracy": 0.6548038627952337,
|
|
"num_tokens": 2190961.0,
|
|
"step": 323
|
|
},
|
|
{
|
|
"entropy": 1.3516268730163574,
|
|
"epoch": 4.705234159779614,
|
|
"grad_norm": 3.640625,
|
|
"learning_rate": 1.6809245510957667e-06,
|
|
"loss": 1.2710171937942505,
|
|
"mean_token_accuracy": 0.6953945681452751,
|
|
"num_tokens": 2197269.0,
|
|
"step": 324
|
|
},
|
|
{
|
|
"entropy": 1.3766049407422543,
|
|
"epoch": 4.719926538108356,
|
|
"grad_norm": 3.890625,
|
|
"learning_rate": 1.665482851915573e-06,
|
|
"loss": 1.3786622285842896,
|
|
"mean_token_accuracy": 0.685791440308094,
|
|
"num_tokens": 2203978.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 1.3528023660182953,
|
|
"epoch": 4.734618916437098,
|
|
"grad_norm": 3.34375,
|
|
"learning_rate": 1.6500769006725142e-06,
|
|
"loss": 1.392067313194275,
|
|
"mean_token_accuracy": 0.6816291362047195,
|
|
"num_tokens": 2210550.0,
|
|
"step": 326
|
|
},
|
|
{
|
|
"entropy": 1.547594841569662,
|
|
"epoch": 4.749311294765841,
|
|
"grad_norm": 4.59375,
|
|
"learning_rate": 1.634707357306267e-06,
|
|
"loss": 1.6562820672988892,
|
|
"mean_token_accuracy": 0.6610909849405289,
|
|
"num_tokens": 2216651.0,
|
|
"step": 327
|
|
},
|
|
{
|
|
"entropy": 1.4075745232403278,
|
|
"epoch": 4.7640036730945825,
|
|
"grad_norm": 3.625,
|
|
"learning_rate": 1.6193748801969164e-06,
|
|
"loss": 1.3409862518310547,
|
|
"mean_token_accuracy": 0.6761545985937119,
|
|
"num_tokens": 2223859.0,
|
|
"step": 328
|
|
},
|
|
{
|
|
"entropy": 1.4365106411278248,
|
|
"epoch": 4.778696051423324,
|
|
"grad_norm": 3.828125,
|
|
"learning_rate": 1.6040801261367494e-06,
|
|
"loss": 1.4949394464492798,
|
|
"mean_token_accuracy": 0.6674479302018881,
|
|
"num_tokens": 2230698.0,
|
|
"step": 329
|
|
},
|
|
{
|
|
"entropy": 1.39594067633152,
|
|
"epoch": 4.793388429752066,
|
|
"grad_norm": 3.546875,
|
|
"learning_rate": 1.588823750302126e-06,
|
|
"loss": 1.4408518075942993,
|
|
"mean_token_accuracy": 0.6747472062706947,
|
|
"num_tokens": 2237816.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 1.5729894042015076,
|
|
"epoch": 4.808080808080808,
|
|
"grad_norm": 3.421875,
|
|
"learning_rate": 1.5736064062254094e-06,
|
|
"loss": 1.6204023361206055,
|
|
"mean_token_accuracy": 0.6384808495640755,
|
|
"num_tokens": 2246004.0,
|
|
"step": 331
|
|
},
|
|
{
|
|
"entropy": 1.3464174792170525,
|
|
"epoch": 4.82277318640955,
|
|
"grad_norm": 3.78125,
|
|
"learning_rate": 1.5584287457669733e-06,
|
|
"loss": 1.3943066596984863,
|
|
"mean_token_accuracy": 0.6859343573451042,
|
|
"num_tokens": 2252413.0,
|
|
"step": 332
|
|
},
|
|
{
|
|
"entropy": 1.6637616902589798,
|
|
"epoch": 4.837465564738292,
|
|
"grad_norm": 3.96875,
|
|
"learning_rate": 1.5432914190872757e-06,
|
|
"loss": 1.8307788372039795,
|
|
"mean_token_accuracy": 0.6110464110970497,
|
|
"num_tokens": 2259455.0,
|
|
"step": 333
|
|
},
|
|
{
|
|
"entropy": 1.4676074236631393,
|
|
"epoch": 4.852157943067034,
|
|
"grad_norm": 3.875,
|
|
"learning_rate": 1.528195074619011e-06,
|
|
"loss": 1.4894895553588867,
|
|
"mean_token_accuracy": 0.6655828766524792,
|
|
"num_tokens": 2266752.0,
|
|
"step": 334
|
|
},
|
|
{
|
|
"entropy": 1.3864082805812359,
|
|
"epoch": 4.866850321395776,
|
|
"grad_norm": 3.515625,
|
|
"learning_rate": 1.5131403590393323e-06,
|
|
"loss": 1.432395577430725,
|
|
"mean_token_accuracy": 0.6768216788768768,
|
|
"num_tokens": 2274152.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 1.3860022686421871,
|
|
"epoch": 4.881542699724518,
|
|
"grad_norm": 3.65625,
|
|
"learning_rate": 1.4981279172421481e-06,
|
|
"loss": 1.3834329843521118,
|
|
"mean_token_accuracy": 0.677151620388031,
|
|
"num_tokens": 2281472.0,
|
|
"step": 336
|
|
},
|
|
{
|
|
"entropy": 1.437876883894205,
|
|
"epoch": 4.89623507805326,
|
|
"grad_norm": 3.703125,
|
|
"learning_rate": 1.4831583923105e-06,
|
|
"loss": 1.584201455116272,
|
|
"mean_token_accuracy": 0.6708448305726051,
|
|
"num_tokens": 2289551.0,
|
|
"step": 337
|
|
},
|
|
{
|
|
"entropy": 1.330020684748888,
|
|
"epoch": 4.910927456382002,
|
|
"grad_norm": 3.59375,
|
|
"learning_rate": 1.4682324254890135e-06,
|
|
"loss": 1.3487207889556885,
|
|
"mean_token_accuracy": 0.6990271396934986,
|
|
"num_tokens": 2296413.0,
|
|
"step": 338
|
|
},
|
|
{
|
|
"entropy": 1.408544309437275,
|
|
"epoch": 4.925619834710744,
|
|
"grad_norm": 3.65625,
|
|
"learning_rate": 1.4533506561564305e-06,
|
|
"loss": 1.437635898590088,
|
|
"mean_token_accuracy": 0.673307441174984,
|
|
"num_tokens": 2303769.0,
|
|
"step": 339
|
|
},
|
|
{
|
|
"entropy": 1.2854021713137627,
|
|
"epoch": 4.940312213039486,
|
|
"grad_norm": 3.875,
|
|
"learning_rate": 1.4385137217982178e-06,
|
|
"loss": 1.3022122383117676,
|
|
"mean_token_accuracy": 0.7127466425299644,
|
|
"num_tokens": 2310004.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 1.3864372000098228,
|
|
"epoch": 4.955004591368228,
|
|
"grad_norm": 3.203125,
|
|
"learning_rate": 1.4237222579792618e-06,
|
|
"loss": 1.4041612148284912,
|
|
"mean_token_accuracy": 0.6763649247586727,
|
|
"num_tokens": 2318788.0,
|
|
"step": 341
|
|
},
|
|
{
|
|
"entropy": 1.2509639486670494,
|
|
"epoch": 4.96969696969697,
|
|
"grad_norm": 3.90625,
|
|
"learning_rate": 1.4089768983166445e-06,
|
|
"loss": 1.2501071691513062,
|
|
"mean_token_accuracy": 0.7078825943171978,
|
|
"num_tokens": 2325084.0,
|
|
"step": 342
|
|
},
|
|
{
|
|
"entropy": 1.454335866495967,
|
|
"epoch": 4.9843893480257115,
|
|
"grad_norm": 4.28125,
|
|
"learning_rate": 1.3942782744524974e-06,
|
|
"loss": 1.449426293373108,
|
|
"mean_token_accuracy": 0.6640381850302219,
|
|
"num_tokens": 2331645.0,
|
|
"step": 343
|
|
},
|
|
{
|
|
"entropy": 1.2466024421155453,
|
|
"epoch": 4.999081726354453,
|
|
"grad_norm": 4.0,
|
|
"learning_rate": 1.379627016026944e-06,
|
|
"loss": 1.1815191507339478,
|
|
"mean_token_accuracy": 0.7297266945242882,
|
|
"num_tokens": 2337128.0,
|
|
"step": 344
|
|
},
|
|
{
|
|
"entropy": 1.5639240741729736,
|
|
"epoch": 5.0,
|
|
"grad_norm": 14.75,
|
|
"learning_rate": 1.3650237506511333e-06,
|
|
"loss": 1.6862883567810059,
|
|
"mean_token_accuracy": 0.6340509057044983,
|
|
"num_tokens": 2337640.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 1.3247104361653328,
|
|
"epoch": 5.014692378328742,
|
|
"grad_norm": 3.875,
|
|
"learning_rate": 1.3504691038803504e-06,
|
|
"loss": 1.336502194404602,
|
|
"mean_token_accuracy": 0.6947694420814514,
|
|
"num_tokens": 2343507.0,
|
|
"step": 346
|
|
},
|
|
{
|
|
"entropy": 1.248255368322134,
|
|
"epoch": 5.029384756657484,
|
|
"grad_norm": 3.640625,
|
|
"learning_rate": 1.3359636991872215e-06,
|
|
"loss": 1.2202980518341064,
|
|
"mean_token_accuracy": 0.7036343440413475,
|
|
"num_tokens": 2349761.0,
|
|
"step": 347
|
|
},
|
|
{
|
|
"entropy": 1.4622275196015835,
|
|
"epoch": 5.044077134986226,
|
|
"grad_norm": 3.984375,
|
|
"learning_rate": 1.3215081579350058e-06,
|
|
"loss": 1.478945016860962,
|
|
"mean_token_accuracy": 0.6600709166377783,
|
|
"num_tokens": 2356336.0,
|
|
"step": 348
|
|
},
|
|
{
|
|
"entropy": 1.344462625682354,
|
|
"epoch": 5.058769513314968,
|
|
"grad_norm": 3.546875,
|
|
"learning_rate": 1.307103099350979e-06,
|
|
"loss": 1.4435844421386719,
|
|
"mean_token_accuracy": 0.6860466375946999,
|
|
"num_tokens": 2363847.0,
|
|
"step": 349
|
|
},
|
|
{
|
|
"entropy": 1.5077877081930637,
|
|
"epoch": 5.07346189164371,
|
|
"grad_norm": 4.25,
|
|
"learning_rate": 1.2927491404999077e-06,
|
|
"loss": 1.546884298324585,
|
|
"mean_token_accuracy": 0.6479404345154762,
|
|
"num_tokens": 2369738.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 1.298233974725008,
|
|
"epoch": 5.088154269972452,
|
|
"grad_norm": 3.4375,
|
|
"learning_rate": 1.2784468962576136e-06,
|
|
"loss": 1.274640440940857,
|
|
"mean_token_accuracy": 0.7008918151259422,
|
|
"num_tokens": 2376780.0,
|
|
"step": 351
|
|
},
|
|
{
|
|
"entropy": 1.3465782329440117,
|
|
"epoch": 5.102846648301194,
|
|
"grad_norm": 3.34375,
|
|
"learning_rate": 1.2641969792846393e-06,
|
|
"loss": 1.298094391822815,
|
|
"mean_token_accuracy": 0.6762865297496319,
|
|
"num_tokens": 2384800.0,
|
|
"step": 352
|
|
},
|
|
{
|
|
"entropy": 1.164300974458456,
|
|
"epoch": 5.117539026629935,
|
|
"grad_norm": 3.921875,
|
|
"learning_rate": 1.2500000000000007e-06,
|
|
"loss": 1.0800127983093262,
|
|
"mean_token_accuracy": 0.7225227616727352,
|
|
"num_tokens": 2391130.0,
|
|
"step": 353
|
|
},
|
|
{
|
|
"entropy": 1.3427915126085281,
|
|
"epoch": 5.132231404958677,
|
|
"grad_norm": 3.546875,
|
|
"learning_rate": 1.235856566555039e-06,
|
|
"loss": 1.2384849786758423,
|
|
"mean_token_accuracy": 0.6905228663235903,
|
|
"num_tokens": 2398399.0,
|
|
"step": 354
|
|
},
|
|
{
|
|
"entropy": 1.3961762860417366,
|
|
"epoch": 5.14692378328742,
|
|
"grad_norm": 3.75,
|
|
"learning_rate": 1.2217672848073702e-06,
|
|
"loss": 1.442419171333313,
|
|
"mean_token_accuracy": 0.6752587780356407,
|
|
"num_tokens": 2405354.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 1.2891596406698227,
|
|
"epoch": 5.161616161616162,
|
|
"grad_norm": 3.375,
|
|
"learning_rate": 1.2077327582949313e-06,
|
|
"loss": 1.1359935998916626,
|
|
"mean_token_accuracy": 0.7036025710403919,
|
|
"num_tokens": 2412665.0,
|
|
"step": 356
|
|
},
|
|
{
|
|
"entropy": 1.3552672527730465,
|
|
"epoch": 5.176308539944904,
|
|
"grad_norm": 4.09375,
|
|
"learning_rate": 1.193753588210128e-06,
|
|
"loss": 1.3831486701965332,
|
|
"mean_token_accuracy": 0.6862701997160912,
|
|
"num_tokens": 2418974.0,
|
|
"step": 357
|
|
},
|
|
{
|
|
"entropy": 1.4431088119745255,
|
|
"epoch": 5.191000918273645,
|
|
"grad_norm": 3.90625,
|
|
"learning_rate": 1.1798303733740801e-06,
|
|
"loss": 1.3804062604904175,
|
|
"mean_token_accuracy": 0.6673993915319443,
|
|
"num_tokens": 2425258.0,
|
|
"step": 358
|
|
},
|
|
{
|
|
"entropy": 1.3874380216002464,
|
|
"epoch": 5.205693296602387,
|
|
"grad_norm": 4.125,
|
|
"learning_rate": 1.1659637102109713e-06,
|
|
"loss": 1.4015424251556396,
|
|
"mean_token_accuracy": 0.6745005883276463,
|
|
"num_tokens": 2430975.0,
|
|
"step": 359
|
|
},
|
|
{
|
|
"entropy": 1.4490499272942543,
|
|
"epoch": 5.22038567493113,
|
|
"grad_norm": 3.96875,
|
|
"learning_rate": 1.1521541927224994e-06,
|
|
"loss": 1.3985379934310913,
|
|
"mean_token_accuracy": 0.6676309891045094,
|
|
"num_tokens": 2437146.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 1.2855382412672043,
|
|
"epoch": 5.235078053259872,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 1.1384024124624324e-06,
|
|
"loss": 1.2529319524765015,
|
|
"mean_token_accuracy": 0.6951777450740337,
|
|
"num_tokens": 2443983.0,
|
|
"step": 361
|
|
},
|
|
{
|
|
"entropy": 1.354172457009554,
|
|
"epoch": 5.249770431588614,
|
|
"grad_norm": 3.5,
|
|
"learning_rate": 1.1247089585112666e-06,
|
|
"loss": 1.4322967529296875,
|
|
"mean_token_accuracy": 0.6889848560094833,
|
|
"num_tokens": 2450684.0,
|
|
"step": 362
|
|
},
|
|
{
|
|
"entropy": 1.3293256983160973,
|
|
"epoch": 5.264462809917355,
|
|
"grad_norm": 3.671875,
|
|
"learning_rate": 1.1110744174509952e-06,
|
|
"loss": 1.3209728002548218,
|
|
"mean_token_accuracy": 0.687582079321146,
|
|
"num_tokens": 2457888.0,
|
|
"step": 363
|
|
},
|
|
{
|
|
"entropy": 1.3070398084819317,
|
|
"epoch": 5.279155188246097,
|
|
"grad_norm": 4.0,
|
|
"learning_rate": 1.0974993733399762e-06,
|
|
"loss": 1.3550182580947876,
|
|
"mean_token_accuracy": 0.6897185444831848,
|
|
"num_tokens": 2464596.0,
|
|
"step": 364
|
|
},
|
|
{
|
|
"entropy": 1.4290886037051678,
|
|
"epoch": 5.293847566574839,
|
|
"grad_norm": 3.671875,
|
|
"learning_rate": 1.0839844076879186e-06,
|
|
"loss": 1.3897254467010498,
|
|
"mean_token_accuracy": 0.6664892844855785,
|
|
"num_tokens": 2470842.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 1.2070421613752842,
|
|
"epoch": 5.308539944903581,
|
|
"grad_norm": 3.546875,
|
|
"learning_rate": 1.0705300994309697e-06,
|
|
"loss": 1.1175053119659424,
|
|
"mean_token_accuracy": 0.7257428281009197,
|
|
"num_tokens": 2477103.0,
|
|
"step": 366
|
|
},
|
|
{
|
|
"entropy": 1.3485257625579834,
|
|
"epoch": 5.3232323232323235,
|
|
"grad_norm": 3.84375,
|
|
"learning_rate": 1.0571370249069163e-06,
|
|
"loss": 1.29289710521698,
|
|
"mean_token_accuracy": 0.692170076072216,
|
|
"num_tokens": 2483582.0,
|
|
"step": 367
|
|
},
|
|
{
|
|
"entropy": 1.3372049815952778,
|
|
"epoch": 5.337924701561065,
|
|
"grad_norm": 3.609375,
|
|
"learning_rate": 1.043805757830495e-06,
|
|
"loss": 1.2823678255081177,
|
|
"mean_token_accuracy": 0.6924175024032593,
|
|
"num_tokens": 2489571.0,
|
|
"step": 368
|
|
},
|
|
{
|
|
"entropy": 1.453590054064989,
|
|
"epoch": 5.352617079889807,
|
|
"grad_norm": 3.65625,
|
|
"learning_rate": 1.0305368692688175e-06,
|
|
"loss": 1.435807466506958,
|
|
"mean_token_accuracy": 0.6810374148190022,
|
|
"num_tokens": 2495721.0,
|
|
"step": 369
|
|
},
|
|
{
|
|
"entropy": 1.1400656588375568,
|
|
"epoch": 5.367309458218549,
|
|
"grad_norm": 3.171875,
|
|
"learning_rate": 1.0173309276169075e-06,
|
|
"loss": 1.142919898033142,
|
|
"mean_token_accuracy": 0.7174917720258236,
|
|
"num_tokens": 2503949.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 1.367987047880888,
|
|
"epoch": 5.382001836547291,
|
|
"grad_norm": 3.9375,
|
|
"learning_rate": 1.0041884985733524e-06,
|
|
"loss": 1.4378776550292969,
|
|
"mean_token_accuracy": 0.6763719841837883,
|
|
"num_tokens": 2510208.0,
|
|
"step": 371
|
|
},
|
|
{
|
|
"entropy": 1.4379977211356163,
|
|
"epoch": 5.3966942148760335,
|
|
"grad_norm": 3.921875,
|
|
"learning_rate": 9.911101451160714e-07,
|
|
"loss": 1.4669700860977173,
|
|
"mean_token_accuracy": 0.676942465826869,
|
|
"num_tokens": 2516100.0,
|
|
"step": 372
|
|
},
|
|
{
|
|
"entropy": 1.3764347173273563,
|
|
"epoch": 5.411386593204775,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 9.780964274781984e-07,
|
|
"loss": 1.5235940217971802,
|
|
"mean_token_accuracy": 0.6685472317039967,
|
|
"num_tokens": 2524555.0,
|
|
"step": 373
|
|
},
|
|
{
|
|
"entropy": 1.5450992733240128,
|
|
"epoch": 5.426078971533517,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 9.651479031240837e-07,
|
|
"loss": 1.5131289958953857,
|
|
"mean_token_accuracy": 0.6463721804320812,
|
|
"num_tokens": 2531282.0,
|
|
"step": 374
|
|
},
|
|
{
|
|
"entropy": 1.457864124327898,
|
|
"epoch": 5.440771349862259,
|
|
"grad_norm": 3.59375,
|
|
"learning_rate": 9.522651267254149e-07,
|
|
"loss": 1.5437613725662231,
|
|
"mean_token_accuracy": 0.6583030465990305,
|
|
"num_tokens": 2539541.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 1.3948032334446907,
|
|
"epoch": 5.455463728191001,
|
|
"grad_norm": 3.546875,
|
|
"learning_rate": 9.394486501374556e-07,
|
|
"loss": 1.4869863986968994,
|
|
"mean_token_accuracy": 0.6875165402889252,
|
|
"num_tokens": 2546176.0,
|
|
"step": 376
|
|
},
|
|
{
|
|
"entropy": 1.4013639837503433,
|
|
"epoch": 5.470156106519743,
|
|
"grad_norm": 3.890625,
|
|
"learning_rate": 9.266990223754069e-07,
|
|
"loss": 1.4661489725112915,
|
|
"mean_token_accuracy": 0.6756766103208065,
|
|
"num_tokens": 2553070.0,
|
|
"step": 377
|
|
},
|
|
{
|
|
"entropy": 1.2801647149026394,
|
|
"epoch": 5.484848484848484,
|
|
"grad_norm": 3.328125,
|
|
"learning_rate": 9.140167895908867e-07,
|
|
"loss": 1.312286138534546,
|
|
"mean_token_accuracy": 0.6866127587854862,
|
|
"num_tokens": 2561136.0,
|
|
"step": 378
|
|
},
|
|
{
|
|
"entropy": 1.4677649438381195,
|
|
"epoch": 5.499540863177227,
|
|
"grad_norm": 4.25,
|
|
"learning_rate": 9.014024950485384e-07,
|
|
"loss": 1.3887375593185425,
|
|
"mean_token_accuracy": 0.6793632172048092,
|
|
"num_tokens": 2567304.0,
|
|
"step": 379
|
|
},
|
|
{
|
|
"entropy": 1.4178661219775677,
|
|
"epoch": 5.514233241505969,
|
|
"grad_norm": 3.921875,
|
|
"learning_rate": 8.88856679102757e-07,
|
|
"loss": 1.37828528881073,
|
|
"mean_token_accuracy": 0.6739732995629311,
|
|
"num_tokens": 2574401.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 1.379222609102726,
|
|
"epoch": 5.528925619834711,
|
|
"grad_norm": 4.125,
|
|
"learning_rate": 8.763798791745413e-07,
|
|
"loss": 1.3763408660888672,
|
|
"mean_token_accuracy": 0.6974320486187935,
|
|
"num_tokens": 2580639.0,
|
|
"step": 381
|
|
},
|
|
{
|
|
"entropy": 1.2704015038907528,
|
|
"epoch": 5.543617998163453,
|
|
"grad_norm": 3.734375,
|
|
"learning_rate": 8.639726297284742e-07,
|
|
"loss": 1.1959011554718018,
|
|
"mean_token_accuracy": 0.700306411832571,
|
|
"num_tokens": 2587418.0,
|
|
"step": 382
|
|
},
|
|
{
|
|
"entropy": 1.4778965413570404,
|
|
"epoch": 5.558310376492194,
|
|
"grad_norm": 3.640625,
|
|
"learning_rate": 8.516354622498279e-07,
|
|
"loss": 1.3851994276046753,
|
|
"mean_token_accuracy": 0.666837640106678,
|
|
"num_tokens": 2594190.0,
|
|
"step": 383
|
|
},
|
|
{
|
|
"entropy": 1.1784359328448772,
|
|
"epoch": 5.573002754820937,
|
|
"grad_norm": 3.09375,
|
|
"learning_rate": 8.393689052217966e-07,
|
|
"loss": 1.236546277999878,
|
|
"mean_token_accuracy": 0.7079413570463657,
|
|
"num_tokens": 2602487.0,
|
|
"step": 384
|
|
},
|
|
{
|
|
"epoch": 5.573002754820937,
|
|
"eval_entropy": 1.3389692306518555,
|
|
"eval_loss": 1.406298041343689,
|
|
"eval_mean_token_accuracy": 0.6783231347799301,
|
|
"eval_num_tokens": 2602487.0,
|
|
"eval_runtime": 1.6707,
|
|
"eval_samples_per_second": 34.715,
|
|
"eval_steps_per_second": 4.788,
|
|
"step": 384
|
|
},
|
|
{
|
|
"entropy": 1.588452558964491,
|
|
"epoch": 5.587695133149679,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 8.271734841028553e-07,
|
|
"loss": 1.7794235944747925,
|
|
"mean_token_accuracy": 0.6403734050691128,
|
|
"num_tokens": 2610026.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 1.407985232770443,
|
|
"epoch": 5.602387511478421,
|
|
"grad_norm": 3.609375,
|
|
"learning_rate": 8.150497213042552e-07,
|
|
"loss": 1.4693222045898438,
|
|
"mean_token_accuracy": 0.6730717644095421,
|
|
"num_tokens": 2616763.0,
|
|
"step": 386
|
|
},
|
|
{
|
|
"entropy": 1.554512519389391,
|
|
"epoch": 5.6170798898071626,
|
|
"grad_norm": 4.125,
|
|
"learning_rate": 8.029981361676456e-07,
|
|
"loss": 1.7840030193328857,
|
|
"mean_token_accuracy": 0.6635391432791948,
|
|
"num_tokens": 2623765.0,
|
|
"step": 387
|
|
},
|
|
{
|
|
"entropy": 1.3049566857516766,
|
|
"epoch": 5.631772268135904,
|
|
"grad_norm": 3.390625,
|
|
"learning_rate": 7.910192449428216e-07,
|
|
"loss": 1.3470871448516846,
|
|
"mean_token_accuracy": 0.6958029642701149,
|
|
"num_tokens": 2630843.0,
|
|
"step": 388
|
|
},
|
|
{
|
|
"entropy": 1.496280875056982,
|
|
"epoch": 5.646464646464646,
|
|
"grad_norm": 3.625,
|
|
"learning_rate": 7.791135607656147e-07,
|
|
"loss": 1.5659466981887817,
|
|
"mean_token_accuracy": 0.6670752931386232,
|
|
"num_tokens": 2638100.0,
|
|
"step": 389
|
|
},
|
|
{
|
|
"entropy": 1.448220781981945,
|
|
"epoch": 5.661157024793388,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 7.672815936359107e-07,
|
|
"loss": 1.6060407161712646,
|
|
"mean_token_accuracy": 0.6671166494488716,
|
|
"num_tokens": 2645333.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 1.3865485899150372,
|
|
"epoch": 5.675849403122131,
|
|
"grad_norm": 3.890625,
|
|
"learning_rate": 7.555238503958001e-07,
|
|
"loss": 1.4103912115097046,
|
|
"mean_token_accuracy": 0.6776862740516663,
|
|
"num_tokens": 2651746.0,
|
|
"step": 391
|
|
},
|
|
{
|
|
"entropy": 1.4491654373705387,
|
|
"epoch": 5.6905417814508725,
|
|
"grad_norm": 3.859375,
|
|
"learning_rate": 7.43840834707871e-07,
|
|
"loss": 1.5049588680267334,
|
|
"mean_token_accuracy": 0.6596195660531521,
|
|
"num_tokens": 2658321.0,
|
|
"step": 392
|
|
},
|
|
{
|
|
"entropy": 1.396248023957014,
|
|
"epoch": 5.705234159779614,
|
|
"grad_norm": 3.703125,
|
|
"learning_rate": 7.322330470336314e-07,
|
|
"loss": 1.44000244140625,
|
|
"mean_token_accuracy": 0.6730465441942215,
|
|
"num_tokens": 2665932.0,
|
|
"step": 393
|
|
},
|
|
{
|
|
"entropy": 1.2807521969079971,
|
|
"epoch": 5.719926538108356,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 7.207009846120718e-07,
|
|
"loss": 1.3392530679702759,
|
|
"mean_token_accuracy": 0.694486953318119,
|
|
"num_tokens": 2672679.0,
|
|
"step": 394
|
|
},
|
|
{
|
|
"entropy": 1.3551440499722958,
|
|
"epoch": 5.734618916437098,
|
|
"grad_norm": 3.078125,
|
|
"learning_rate": 7.092451414383644e-07,
|
|
"loss": 1.317352056503296,
|
|
"mean_token_accuracy": 0.6877163723111153,
|
|
"num_tokens": 2681371.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 1.3415511585772038,
|
|
"epoch": 5.749311294765841,
|
|
"grad_norm": 3.84375,
|
|
"learning_rate": 6.97866008242703e-07,
|
|
"loss": 1.4178882837295532,
|
|
"mean_token_accuracy": 0.6842059157788754,
|
|
"num_tokens": 2688476.0,
|
|
"step": 396
|
|
},
|
|
{
|
|
"entropy": 1.1355127394199371,
|
|
"epoch": 5.7640036730945825,
|
|
"grad_norm": 3.25,
|
|
"learning_rate": 6.865640724692815e-07,
|
|
"loss": 1.0911461114883423,
|
|
"mean_token_accuracy": 0.7424787282943726,
|
|
"num_tokens": 2695575.0,
|
|
"step": 397
|
|
},
|
|
{
|
|
"entropy": 1.2852298319339752,
|
|
"epoch": 5.778696051423324,
|
|
"grad_norm": 3.703125,
|
|
"learning_rate": 6.753398182554116e-07,
|
|
"loss": 1.2322055101394653,
|
|
"mean_token_accuracy": 0.7083672620356083,
|
|
"num_tokens": 2702192.0,
|
|
"step": 398
|
|
},
|
|
{
|
|
"entropy": 1.5159233435988426,
|
|
"epoch": 5.793388429752066,
|
|
"grad_norm": 4.75,
|
|
"learning_rate": 6.641937264107868e-07,
|
|
"loss": 1.5141452550888062,
|
|
"mean_token_accuracy": 0.6346602737903595,
|
|
"num_tokens": 2708132.0,
|
|
"step": 399
|
|
},
|
|
{
|
|
"entropy": 1.3606117404997349,
|
|
"epoch": 5.808080808080808,
|
|
"grad_norm": 3.375,
|
|
"learning_rate": 6.53126274396885e-07,
|
|
"loss": 1.4291459321975708,
|
|
"mean_token_accuracy": 0.6642967071384192,
|
|
"num_tokens": 2715821.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 1.2682021632790565,
|
|
"epoch": 5.82277318640955,
|
|
"grad_norm": 3.5,
|
|
"learning_rate": 6.421379363065142e-07,
|
|
"loss": 1.293297529220581,
|
|
"mean_token_accuracy": 0.6939828936010599,
|
|
"num_tokens": 2722727.0,
|
|
"step": 401
|
|
},
|
|
{
|
|
"entropy": 1.4758578278124332,
|
|
"epoch": 5.837465564738292,
|
|
"grad_norm": 4.0,
|
|
"learning_rate": 6.312291828435077e-07,
|
|
"loss": 1.668549656867981,
|
|
"mean_token_accuracy": 0.6563255451619625,
|
|
"num_tokens": 2730755.0,
|
|
"step": 402
|
|
},
|
|
{
|
|
"entropy": 1.524743027985096,
|
|
"epoch": 5.852157943067034,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 6.204004813025569e-07,
|
|
"loss": 1.5362768173217773,
|
|
"mean_token_accuracy": 0.6655668392777443,
|
|
"num_tokens": 2736790.0,
|
|
"step": 403
|
|
},
|
|
{
|
|
"entropy": 1.209791924804449,
|
|
"epoch": 5.866850321395776,
|
|
"grad_norm": 3.265625,
|
|
"learning_rate": 6.096522955491932e-07,
|
|
"loss": 1.2250401973724365,
|
|
"mean_token_accuracy": 0.7064687013626099,
|
|
"num_tokens": 2744896.0,
|
|
"step": 404
|
|
},
|
|
{
|
|
"entropy": 1.291951572522521,
|
|
"epoch": 5.881542699724518,
|
|
"grad_norm": 3.609375,
|
|
"learning_rate": 5.989850859999227e-07,
|
|
"loss": 1.3850435018539429,
|
|
"mean_token_accuracy": 0.6930592581629753,
|
|
"num_tokens": 2751935.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 1.3803613483905792,
|
|
"epoch": 5.89623507805326,
|
|
"grad_norm": 3.953125,
|
|
"learning_rate": 5.883993096024993e-07,
|
|
"loss": 1.3516204357147217,
|
|
"mean_token_accuracy": 0.6931698061525822,
|
|
"num_tokens": 2758137.0,
|
|
"step": 406
|
|
},
|
|
{
|
|
"entropy": 1.1959835402667522,
|
|
"epoch": 5.910927456382002,
|
|
"grad_norm": 4.03125,
|
|
"learning_rate": 5.778954198163514e-07,
|
|
"loss": 1.2868695259094238,
|
|
"mean_token_accuracy": 0.7249186784029007,
|
|
"num_tokens": 2764628.0,
|
|
"step": 407
|
|
},
|
|
{
|
|
"entropy": 1.2468183785676956,
|
|
"epoch": 5.925619834710744,
|
|
"grad_norm": 3.40625,
|
|
"learning_rate": 5.674738665931575e-07,
|
|
"loss": 1.235489010810852,
|
|
"mean_token_accuracy": 0.7177602611482143,
|
|
"num_tokens": 2771814.0,
|
|
"step": 408
|
|
},
|
|
{
|
|
"entropy": 1.60049744322896,
|
|
"epoch": 5.940312213039486,
|
|
"grad_norm": 3.796875,
|
|
"learning_rate": 5.571350963575728e-07,
|
|
"loss": 1.5064845085144043,
|
|
"mean_token_accuracy": 0.652068167924881,
|
|
"num_tokens": 2778109.0,
|
|
"step": 409
|
|
},
|
|
{
|
|
"entropy": 1.3452286906540394,
|
|
"epoch": 5.955004591368228,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 5.468795519881043e-07,
|
|
"loss": 1.2341338396072388,
|
|
"mean_token_accuracy": 0.6869874056428671,
|
|
"num_tokens": 2784412.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 1.4359879940748215,
|
|
"epoch": 5.96969696969697,
|
|
"grad_norm": 3.75,
|
|
"learning_rate": 5.367076727981383e-07,
|
|
"loss": 1.4686487913131714,
|
|
"mean_token_accuracy": 0.6682025790214539,
|
|
"num_tokens": 2790953.0,
|
|
"step": 411
|
|
},
|
|
{
|
|
"entropy": 1.3611623905599117,
|
|
"epoch": 5.9843893480257115,
|
|
"grad_norm": 3.765625,
|
|
"learning_rate": 5.266198945171253e-07,
|
|
"loss": 1.4125094413757324,
|
|
"mean_token_accuracy": 0.6872195526957512,
|
|
"num_tokens": 2798320.0,
|
|
"step": 412
|
|
},
|
|
{
|
|
"entropy": 1.2970973066985607,
|
|
"epoch": 5.999081726354453,
|
|
"grad_norm": 3.390625,
|
|
"learning_rate": 5.166166492719124e-07,
|
|
"loss": 1.251451849937439,
|
|
"mean_token_accuracy": 0.7029491886496544,
|
|
"num_tokens": 2804831.0,
|
|
"step": 413
|
|
},
|
|
{
|
|
"entropy": 0.6531462669372559,
|
|
"epoch": 6.0,
|
|
"grad_norm": 10.8125,
|
|
"learning_rate": 5.066983655682325e-07,
|
|
"loss": 0.6284084320068359,
|
|
"mean_token_accuracy": 0.824404776096344,
|
|
"num_tokens": 2805168.0,
|
|
"step": 414
|
|
},
|
|
{
|
|
"entropy": 1.3656140714883804,
|
|
"epoch": 6.014692378328742,
|
|
"grad_norm": 3.640625,
|
|
"learning_rate": 4.968654682723487e-07,
|
|
"loss": 1.2719142436981201,
|
|
"mean_token_accuracy": 0.6842254959046841,
|
|
"num_tokens": 2811186.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 1.4867672063410282,
|
|
"epoch": 6.029384756657484,
|
|
"grad_norm": 3.328125,
|
|
"learning_rate": 4.871183785928546e-07,
|
|
"loss": 1.557564616203308,
|
|
"mean_token_accuracy": 0.6533172447234392,
|
|
"num_tokens": 2818841.0,
|
|
"step": 416
|
|
},
|
|
{
|
|
"entropy": 1.4871499314904213,
|
|
"epoch": 6.044077134986226,
|
|
"grad_norm": 3.796875,
|
|
"learning_rate": 4.774575140626317e-07,
|
|
"loss": 1.444616436958313,
|
|
"mean_token_accuracy": 0.6553527489304543,
|
|
"num_tokens": 2825100.0,
|
|
"step": 417
|
|
},
|
|
{
|
|
"entropy": 1.398858230561018,
|
|
"epoch": 6.058769513314968,
|
|
"grad_norm": 4.15625,
|
|
"learning_rate": 4.678832885209622e-07,
|
|
"loss": 1.36493980884552,
|
|
"mean_token_accuracy": 0.6685687974095345,
|
|
"num_tokens": 2831217.0,
|
|
"step": 418
|
|
},
|
|
{
|
|
"entropy": 1.470800019800663,
|
|
"epoch": 6.07346189164371,
|
|
"grad_norm": 3.84375,
|
|
"learning_rate": 4.5839611209580277e-07,
|
|
"loss": 1.413637638092041,
|
|
"mean_token_accuracy": 0.6679843384772539,
|
|
"num_tokens": 2837281.0,
|
|
"step": 419
|
|
},
|
|
{
|
|
"entropy": 1.4590460509061813,
|
|
"epoch": 6.088154269972452,
|
|
"grad_norm": 3.640625,
|
|
"learning_rate": 4.4899639118621606e-07,
|
|
"loss": 1.5037059783935547,
|
|
"mean_token_accuracy": 0.6643645130097866,
|
|
"num_tokens": 2844295.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 1.3967719785869122,
|
|
"epoch": 6.102846648301194,
|
|
"grad_norm": 3.734375,
|
|
"learning_rate": 4.396845284449608e-07,
|
|
"loss": 1.6165939569473267,
|
|
"mean_token_accuracy": 0.6795150488615036,
|
|
"num_tokens": 2851174.0,
|
|
"step": 421
|
|
},
|
|
{
|
|
"entropy": 1.4725304134190083,
|
|
"epoch": 6.117539026629935,
|
|
"grad_norm": 3.65625,
|
|
"learning_rate": 4.3046092276124467e-07,
|
|
"loss": 1.367598533630371,
|
|
"mean_token_accuracy": 0.6686763595789671,
|
|
"num_tokens": 2857478.0,
|
|
"step": 422
|
|
},
|
|
{
|
|
"entropy": 1.3191987164318562,
|
|
"epoch": 6.132231404958677,
|
|
"grad_norm": 3.421875,
|
|
"learning_rate": 4.2132596924363666e-07,
|
|
"loss": 1.270479679107666,
|
|
"mean_token_accuracy": 0.6981482766568661,
|
|
"num_tokens": 2864721.0,
|
|
"step": 423
|
|
},
|
|
{
|
|
"entropy": 1.27112677693367,
|
|
"epoch": 6.14692378328742,
|
|
"grad_norm": 4.5,
|
|
"learning_rate": 4.122800592031426e-07,
|
|
"loss": 1.3095415830612183,
|
|
"mean_token_accuracy": 0.6866735070943832,
|
|
"num_tokens": 2869929.0,
|
|
"step": 424
|
|
},
|
|
{
|
|
"entropy": 1.386510156095028,
|
|
"epoch": 6.161616161616162,
|
|
"grad_norm": 3.421875,
|
|
"learning_rate": 4.033235801364402e-07,
|
|
"loss": 1.378354787826538,
|
|
"mean_token_accuracy": 0.6822472270578146,
|
|
"num_tokens": 2876724.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 1.406757928431034,
|
|
"epoch": 6.176308539944904,
|
|
"grad_norm": 3.5,
|
|
"learning_rate": 3.94456915709284e-07,
|
|
"loss": 1.4385218620300293,
|
|
"mean_token_accuracy": 0.6637353654950857,
|
|
"num_tokens": 2884843.0,
|
|
"step": 426
|
|
},
|
|
{
|
|
"entropy": 1.3404726311564445,
|
|
"epoch": 6.191000918273645,
|
|
"grad_norm": 4.25,
|
|
"learning_rate": 3.85680445740067e-07,
|
|
"loss": 1.3165702819824219,
|
|
"mean_token_accuracy": 0.6864796336740255,
|
|
"num_tokens": 2889782.0,
|
|
"step": 427
|
|
},
|
|
{
|
|
"entropy": 1.3043682426214218,
|
|
"epoch": 6.205693296602387,
|
|
"grad_norm": 3.6875,
|
|
"learning_rate": 3.7699454618355306e-07,
|
|
"loss": 1.3163912296295166,
|
|
"mean_token_accuracy": 0.6922629773616791,
|
|
"num_tokens": 2896735.0,
|
|
"step": 428
|
|
},
|
|
{
|
|
"entropy": 1.3188545294106007,
|
|
"epoch": 6.22038567493113,
|
|
"grad_norm": 4.3125,
|
|
"learning_rate": 3.683995891147696e-07,
|
|
"loss": 1.4004876613616943,
|
|
"mean_token_accuracy": 0.6927106529474258,
|
|
"num_tokens": 2902419.0,
|
|
"step": 429
|
|
},
|
|
{
|
|
"entropy": 1.1944062858819962,
|
|
"epoch": 6.235078053259872,
|
|
"grad_norm": 3.15625,
|
|
"learning_rate": 3.598959427130716e-07,
|
|
"loss": 1.161584734916687,
|
|
"mean_token_accuracy": 0.7194525264203548,
|
|
"num_tokens": 2910508.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 1.3897150121629238,
|
|
"epoch": 6.249770431588614,
|
|
"grad_norm": 3.421875,
|
|
"learning_rate": 3.514839712463683e-07,
|
|
"loss": 1.3794488906860352,
|
|
"mean_token_accuracy": 0.6831434555351734,
|
|
"num_tokens": 2917905.0,
|
|
"step": 431
|
|
},
|
|
{
|
|
"entropy": 1.5148936957120895,
|
|
"epoch": 6.264462809917355,
|
|
"grad_norm": 3.90625,
|
|
"learning_rate": 3.4316403505552045e-07,
|
|
"loss": 1.6462560892105103,
|
|
"mean_token_accuracy": 0.6511917188763618,
|
|
"num_tokens": 2924680.0,
|
|
"step": 432
|
|
},
|
|
{
|
|
"entropy": 1.4466035105288029,
|
|
"epoch": 6.279155188246097,
|
|
"grad_norm": 3.765625,
|
|
"learning_rate": 3.3493649053890325e-07,
|
|
"loss": 1.496029257774353,
|
|
"mean_token_accuracy": 0.6696281190961599,
|
|
"num_tokens": 2932051.0,
|
|
"step": 433
|
|
},
|
|
{
|
|
"entropy": 1.372856643050909,
|
|
"epoch": 6.293847566574839,
|
|
"grad_norm": 3.515625,
|
|
"learning_rate": 3.268016901371407e-07,
|
|
"loss": 1.3746181726455688,
|
|
"mean_token_accuracy": 0.6710073538124561,
|
|
"num_tokens": 2938884.0,
|
|
"step": 434
|
|
},
|
|
{
|
|
"entropy": 1.4355628602206707,
|
|
"epoch": 6.308539944903581,
|
|
"grad_norm": 3.75,
|
|
"learning_rate": 3.187599823180071e-07,
|
|
"loss": 1.4408472776412964,
|
|
"mean_token_accuracy": 0.6618307530879974,
|
|
"num_tokens": 2945324.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 1.4253287892788649,
|
|
"epoch": 6.3232323232323235,
|
|
"grad_norm": 3.359375,
|
|
"learning_rate": 3.108117115615006e-07,
|
|
"loss": 1.4239962100982666,
|
|
"mean_token_accuracy": 0.6748282723128796,
|
|
"num_tokens": 2952312.0,
|
|
"step": 436
|
|
},
|
|
{
|
|
"entropy": 1.3979435861110687,
|
|
"epoch": 6.337924701561065,
|
|
"grad_norm": 3.53125,
|
|
"learning_rate": 3.0295721834508686e-07,
|
|
"loss": 1.3381223678588867,
|
|
"mean_token_accuracy": 0.6750478371977806,
|
|
"num_tokens": 2959402.0,
|
|
"step": 437
|
|
},
|
|
{
|
|
"entropy": 1.405780129134655,
|
|
"epoch": 6.352617079889807,
|
|
"grad_norm": 3.71875,
|
|
"learning_rate": 2.9519683912911267e-07,
|
|
"loss": 1.437018871307373,
|
|
"mean_token_accuracy": 0.6710403822362423,
|
|
"num_tokens": 2966008.0,
|
|
"step": 438
|
|
},
|
|
{
|
|
"entropy": 1.4760498031973839,
|
|
"epoch": 6.367309458218549,
|
|
"grad_norm": 3.78125,
|
|
"learning_rate": 2.875309063423956e-07,
|
|
"loss": 1.6535365581512451,
|
|
"mean_token_accuracy": 0.6638116780668497,
|
|
"num_tokens": 2972476.0,
|
|
"step": 439
|
|
},
|
|
{
|
|
"entropy": 1.4005642868578434,
|
|
"epoch": 6.382001836547291,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 2.7995974836798194e-07,
|
|
"loss": 1.3646724224090576,
|
|
"mean_token_accuracy": 0.6797922551631927,
|
|
"num_tokens": 2978239.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 1.293638188391924,
|
|
"epoch": 6.3966942148760335,
|
|
"grad_norm": 3.640625,
|
|
"learning_rate": 2.7248368952908055e-07,
|
|
"loss": 1.2501822710037231,
|
|
"mean_token_accuracy": 0.6940024830400944,
|
|
"num_tokens": 2984971.0,
|
|
"step": 441
|
|
},
|
|
{
|
|
"entropy": 1.6457590200006962,
|
|
"epoch": 6.411386593204775,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 2.6510305007516974e-07,
|
|
"loss": 1.590219497680664,
|
|
"mean_token_accuracy": 0.6325998902320862,
|
|
"num_tokens": 2992049.0,
|
|
"step": 442
|
|
},
|
|
{
|
|
"entropy": 1.658167488873005,
|
|
"epoch": 6.426078971533517,
|
|
"grad_norm": 3.78125,
|
|
"learning_rate": 2.5781814616827936e-07,
|
|
"loss": 1.833939790725708,
|
|
"mean_token_accuracy": 0.6285004448145628,
|
|
"num_tokens": 2998916.0,
|
|
"step": 443
|
|
},
|
|
{
|
|
"entropy": 1.231806393712759,
|
|
"epoch": 6.440771349862259,
|
|
"grad_norm": 3.75,
|
|
"learning_rate": 2.506292898694468e-07,
|
|
"loss": 1.28045654296875,
|
|
"mean_token_accuracy": 0.7068323567509651,
|
|
"num_tokens": 3005512.0,
|
|
"step": 444
|
|
},
|
|
{
|
|
"entropy": 1.4775658771395683,
|
|
"epoch": 6.455463728191001,
|
|
"grad_norm": 3.703125,
|
|
"learning_rate": 2.43536789125349e-07,
|
|
"loss": 1.4871981143951416,
|
|
"mean_token_accuracy": 0.6590262055397034,
|
|
"num_tokens": 3012108.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 1.3822932876646519,
|
|
"epoch": 6.470156106519743,
|
|
"grad_norm": 4.09375,
|
|
"learning_rate": 2.365409477551117e-07,
|
|
"loss": 1.378037691116333,
|
|
"mean_token_accuracy": 0.6818497627973557,
|
|
"num_tokens": 3018657.0,
|
|
"step": 446
|
|
},
|
|
{
|
|
"entropy": 1.3791243396699429,
|
|
"epoch": 6.484848484848484,
|
|
"grad_norm": 3.6875,
|
|
"learning_rate": 2.2964206543729662e-07,
|
|
"loss": 1.3524497747421265,
|
|
"mean_token_accuracy": 0.6901324354112148,
|
|
"num_tokens": 3025796.0,
|
|
"step": 447
|
|
},
|
|
{
|
|
"entropy": 1.4186674058437347,
|
|
"epoch": 6.499540863177227,
|
|
"grad_norm": 3.359375,
|
|
"learning_rate": 2.2284043769706026e-07,
|
|
"loss": 1.519934058189392,
|
|
"mean_token_accuracy": 0.6646804567426443,
|
|
"num_tokens": 3033756.0,
|
|
"step": 448
|
|
},
|
|
{
|
|
"entropy": 1.2882320508360863,
|
|
"epoch": 6.514233241505969,
|
|
"grad_norm": 3.90625,
|
|
"learning_rate": 2.1613635589349756e-07,
|
|
"loss": 1.3580642938613892,
|
|
"mean_token_accuracy": 0.69617984816432,
|
|
"num_tokens": 3040800.0,
|
|
"step": 449
|
|
},
|
|
{
|
|
"entropy": 1.3028308153152466,
|
|
"epoch": 6.528925619834711,
|
|
"grad_norm": 4.15625,
|
|
"learning_rate": 2.0953010720716037e-07,
|
|
"loss": 1.3698010444641113,
|
|
"mean_token_accuracy": 0.6910812072455883,
|
|
"num_tokens": 3047199.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 1.3760417755693197,
|
|
"epoch": 6.543617998163453,
|
|
"grad_norm": 3.640625,
|
|
"learning_rate": 2.0302197462775453e-07,
|
|
"loss": 1.3222535848617554,
|
|
"mean_token_accuracy": 0.6876711696386337,
|
|
"num_tokens": 3054037.0,
|
|
"step": 451
|
|
},
|
|
{
|
|
"entropy": 1.1697348654270172,
|
|
"epoch": 6.558310376492194,
|
|
"grad_norm": 3.421875,
|
|
"learning_rate": 1.9661223694201898e-07,
|
|
"loss": 1.1562086343765259,
|
|
"mean_token_accuracy": 0.7222369164228439,
|
|
"num_tokens": 3060835.0,
|
|
"step": 452
|
|
},
|
|
{
|
|
"entropy": 1.2634800747036934,
|
|
"epoch": 6.573002754820937,
|
|
"grad_norm": 4.03125,
|
|
"learning_rate": 1.9030116872178317e-07,
|
|
"loss": 1.1908690929412842,
|
|
"mean_token_accuracy": 0.7110442295670509,
|
|
"num_tokens": 3066363.0,
|
|
"step": 453
|
|
},
|
|
{
|
|
"entropy": 1.3702223263680935,
|
|
"epoch": 6.587695133149679,
|
|
"grad_norm": 4.03125,
|
|
"learning_rate": 1.8408904031220476e-07,
|
|
"loss": 1.5914932489395142,
|
|
"mean_token_accuracy": 0.6844195239245892,
|
|
"num_tokens": 3073703.0,
|
|
"step": 454
|
|
},
|
|
{
|
|
"entropy": 1.397190399467945,
|
|
"epoch": 6.602387511478421,
|
|
"grad_norm": 3.625,
|
|
"learning_rate": 1.7797611782018942e-07,
|
|
"loss": 1.5647703409194946,
|
|
"mean_token_accuracy": 0.676605511456728,
|
|
"num_tokens": 3081815.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 1.5388475097715855,
|
|
"epoch": 6.6170798898071626,
|
|
"grad_norm": 3.96875,
|
|
"learning_rate": 1.719626631029911e-07,
|
|
"loss": 1.640184760093689,
|
|
"mean_token_accuracy": 0.6557557284832001,
|
|
"num_tokens": 3088623.0,
|
|
"step": 456
|
|
},
|
|
{
|
|
"entropy": 1.19817179068923,
|
|
"epoch": 6.631772268135904,
|
|
"grad_norm": 4.09375,
|
|
"learning_rate": 1.6604893375699594e-07,
|
|
"loss": 1.1748251914978027,
|
|
"mean_token_accuracy": 0.721715409308672,
|
|
"num_tokens": 3094847.0,
|
|
"step": 457
|
|
},
|
|
{
|
|
"entropy": 1.35783052444458,
|
|
"epoch": 6.646464646464646,
|
|
"grad_norm": 3.4375,
|
|
"learning_rate": 1.602351831066862e-07,
|
|
"loss": 1.3434321880340576,
|
|
"mean_token_accuracy": 0.6994687616825104,
|
|
"num_tokens": 3102248.0,
|
|
"step": 458
|
|
},
|
|
{
|
|
"entropy": 1.0797005984932184,
|
|
"epoch": 6.661157024793388,
|
|
"grad_norm": 3.15625,
|
|
"learning_rate": 1.5452166019378989e-07,
|
|
"loss": 1.0800870656967163,
|
|
"mean_token_accuracy": 0.7316170409321785,
|
|
"num_tokens": 3110001.0,
|
|
"step": 459
|
|
},
|
|
{
|
|
"entropy": 1.3832198455929756,
|
|
"epoch": 6.675849403122131,
|
|
"grad_norm": 3.5,
|
|
"learning_rate": 1.4890860976661314e-07,
|
|
"loss": 1.3797262907028198,
|
|
"mean_token_accuracy": 0.6900227032601833,
|
|
"num_tokens": 3117274.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 1.1709060333669186,
|
|
"epoch": 6.6905417814508725,
|
|
"grad_norm": 3.453125,
|
|
"learning_rate": 1.4339627226955394e-07,
|
|
"loss": 1.2020788192749023,
|
|
"mean_token_accuracy": 0.7185935415327549,
|
|
"num_tokens": 3123995.0,
|
|
"step": 461
|
|
},
|
|
{
|
|
"entropy": 1.4667035713791847,
|
|
"epoch": 6.705234159779614,
|
|
"grad_norm": 3.90625,
|
|
"learning_rate": 1.3798488383280489e-07,
|
|
"loss": 1.3801714181900024,
|
|
"mean_token_accuracy": 0.6576951071619987,
|
|
"num_tokens": 3131304.0,
|
|
"step": 462
|
|
},
|
|
{
|
|
"entropy": 1.5537691339850426,
|
|
"epoch": 6.719926538108356,
|
|
"grad_norm": 3.75,
|
|
"learning_rate": 1.3267467626223606e-07,
|
|
"loss": 1.476824402809143,
|
|
"mean_token_accuracy": 0.6602157857269049,
|
|
"num_tokens": 3137410.0,
|
|
"step": 463
|
|
},
|
|
{
|
|
"entropy": 1.235965933650732,
|
|
"epoch": 6.734618916437098,
|
|
"grad_norm": 3.984375,
|
|
"learning_rate": 1.2746587702946538e-07,
|
|
"loss": 1.1468799114227295,
|
|
"mean_token_accuracy": 0.7125266939401627,
|
|
"num_tokens": 3142974.0,
|
|
"step": 464
|
|
},
|
|
{
|
|
"entropy": 1.2582230232656002,
|
|
"epoch": 6.749311294765841,
|
|
"grad_norm": 3.296875,
|
|
"learning_rate": 1.223587092621162e-07,
|
|
"loss": 1.2624547481536865,
|
|
"mean_token_accuracy": 0.70580143481493,
|
|
"num_tokens": 3151139.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 1.2958066929131746,
|
|
"epoch": 6.7640036730945825,
|
|
"grad_norm": 3.65625,
|
|
"learning_rate": 1.1735339173425759e-07,
|
|
"loss": 1.276180386543274,
|
|
"mean_token_accuracy": 0.7052340060472488,
|
|
"num_tokens": 3158302.0,
|
|
"step": 466
|
|
},
|
|
{
|
|
"entropy": 1.3456409573554993,
|
|
"epoch": 6.778696051423324,
|
|
"grad_norm": 3.953125,
|
|
"learning_rate": 1.1245013885703343e-07,
|
|
"loss": 1.3040763139724731,
|
|
"mean_token_accuracy": 0.7013955563306808,
|
|
"num_tokens": 3165056.0,
|
|
"step": 467
|
|
},
|
|
{
|
|
"entropy": 1.3066479973495007,
|
|
"epoch": 6.793388429752066,
|
|
"grad_norm": 3.890625,
|
|
"learning_rate": 1.0764916066947795e-07,
|
|
"loss": 1.4025518894195557,
|
|
"mean_token_accuracy": 0.6909100040793419,
|
|
"num_tokens": 3172110.0,
|
|
"step": 468
|
|
},
|
|
{
|
|
"entropy": 1.3883640430867672,
|
|
"epoch": 6.808080808080808,
|
|
"grad_norm": 4.21875,
|
|
"learning_rate": 1.0295066282951738e-07,
|
|
"loss": 1.4634349346160889,
|
|
"mean_token_accuracy": 0.6746046468615532,
|
|
"num_tokens": 3178411.0,
|
|
"step": 469
|
|
},
|
|
{
|
|
"entropy": 1.5163409858942032,
|
|
"epoch": 6.82277318640955,
|
|
"grad_norm": 3.734375,
|
|
"learning_rate": 9.835484660516203e-08,
|
|
"loss": 1.6738132238388062,
|
|
"mean_token_accuracy": 0.6578865684568882,
|
|
"num_tokens": 3185092.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 1.357316054403782,
|
|
"epoch": 6.837465564738292,
|
|
"grad_norm": 3.828125,
|
|
"learning_rate": 9.386190886588208e-08,
|
|
"loss": 1.3559865951538086,
|
|
"mean_token_accuracy": 0.6832005195319653,
|
|
"num_tokens": 3191782.0,
|
|
"step": 471
|
|
},
|
|
{
|
|
"entropy": 1.3066742308437824,
|
|
"epoch": 6.852157943067034,
|
|
"grad_norm": 3.921875,
|
|
"learning_rate": 8.947204207417681e-08,
|
|
"loss": 1.3522746562957764,
|
|
"mean_token_accuracy": 0.7099240720272064,
|
|
"num_tokens": 3197795.0,
|
|
"step": 472
|
|
},
|
|
{
|
|
"entropy": 1.3219049498438835,
|
|
"epoch": 6.866850321395776,
|
|
"grad_norm": 3.953125,
|
|
"learning_rate": 8.518543427732951e-08,
|
|
"loss": 1.3603464365005493,
|
|
"mean_token_accuracy": 0.6895252950489521,
|
|
"num_tokens": 3204782.0,
|
|
"step": 473
|
|
},
|
|
{
|
|
"entropy": 1.241905678063631,
|
|
"epoch": 6.881542699724518,
|
|
"grad_norm": 3.59375,
|
|
"learning_rate": 8.100226909935061e-08,
|
|
"loss": 1.1739405393600464,
|
|
"mean_token_accuracy": 0.7041682228446007,
|
|
"num_tokens": 3210805.0,
|
|
"step": 474
|
|
},
|
|
{
|
|
"entropy": 1.3946216590702534,
|
|
"epoch": 6.89623507805326,
|
|
"grad_norm": 3.265625,
|
|
"learning_rate": 7.692272573311427e-08,
|
|
"loss": 1.4739960432052612,
|
|
"mean_token_accuracy": 0.68177555128932,
|
|
"num_tokens": 3219953.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 1.3859229907393456,
|
|
"epoch": 6.910927456382002,
|
|
"grad_norm": 3.3125,
|
|
"learning_rate": 7.294697893267977e-08,
|
|
"loss": 1.376434326171875,
|
|
"mean_token_accuracy": 0.6881735809147358,
|
|
"num_tokens": 3227524.0,
|
|
"step": 476
|
|
},
|
|
{
|
|
"entropy": 1.140619345009327,
|
|
"epoch": 6.925619834710744,
|
|
"grad_norm": 2.859375,
|
|
"learning_rate": 6.907519900580862e-08,
|
|
"loss": 0.9896233081817627,
|
|
"mean_token_accuracy": 0.7222557105123997,
|
|
"num_tokens": 3235487.0,
|
|
"step": 477
|
|
},
|
|
{
|
|
"entropy": 1.4113641753792763,
|
|
"epoch": 6.940312213039486,
|
|
"grad_norm": 3.203125,
|
|
"learning_rate": 6.530755180666593e-08,
|
|
"loss": 1.35636568069458,
|
|
"mean_token_accuracy": 0.6656701732426882,
|
|
"num_tokens": 3243387.0,
|
|
"step": 478
|
|
},
|
|
{
|
|
"entropy": 1.2951929830014706,
|
|
"epoch": 6.955004591368228,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 6.164419872871835e-08,
|
|
"loss": 1.3108189105987549,
|
|
"mean_token_accuracy": 0.6984525807201862,
|
|
"num_tokens": 3250630.0,
|
|
"step": 479
|
|
},
|
|
{
|
|
"entropy": 1.2143667675554752,
|
|
"epoch": 6.96969696969697,
|
|
"grad_norm": 3.390625,
|
|
"learning_rate": 5.8085296697819036e-08,
|
|
"loss": 1.1499379873275757,
|
|
"mean_token_accuracy": 0.7011887915432453,
|
|
"num_tokens": 3258109.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 1.3035505078732967,
|
|
"epoch": 6.9843893480257115,
|
|
"grad_norm": 3.78125,
|
|
"learning_rate": 5.463099816548578e-08,
|
|
"loss": 1.3332489728927612,
|
|
"mean_token_accuracy": 0.6942353397607803,
|
|
"num_tokens": 3265127.0,
|
|
"step": 481
|
|
},
|
|
{
|
|
"entropy": 1.3939937017858028,
|
|
"epoch": 6.999081726354453,
|
|
"grad_norm": 3.3125,
|
|
"learning_rate": 5.128145110237154e-08,
|
|
"loss": 1.4082341194152832,
|
|
"mean_token_accuracy": 0.6772250905632973,
|
|
"num_tokens": 3272251.0,
|
|
"step": 482
|
|
},
|
|
{
|
|
"entropy": 0.9707384705543518,
|
|
"epoch": 7.0,
|
|
"grad_norm": 10.5,
|
|
"learning_rate": 4.8036798991923925e-08,
|
|
"loss": 0.9935500025749207,
|
|
"mean_token_accuracy": 0.7545045018196106,
|
|
"num_tokens": 3272696.0,
|
|
"step": 483
|
|
},
|
|
{
|
|
"entropy": 1.1763608865439892,
|
|
"epoch": 7.014692378328742,
|
|
"grad_norm": 3.953125,
|
|
"learning_rate": 4.489718082424044e-08,
|
|
"loss": 1.2014083862304688,
|
|
"mean_token_accuracy": 0.7273847311735153,
|
|
"num_tokens": 3278916.0,
|
|
"step": 484
|
|
},
|
|
{
|
|
"entropy": 1.1877197846770287,
|
|
"epoch": 7.029384756657484,
|
|
"grad_norm": 3.578125,
|
|
"learning_rate": 4.186273109011374e-08,
|
|
"loss": 1.205470085144043,
|
|
"mean_token_accuracy": 0.7204346731305122,
|
|
"num_tokens": 3286066.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 1.1499557420611382,
|
|
"epoch": 7.044077134986226,
|
|
"grad_norm": 3.109375,
|
|
"learning_rate": 3.893357977527101e-08,
|
|
"loss": 1.011968970298767,
|
|
"mean_token_accuracy": 0.724033422768116,
|
|
"num_tokens": 3293689.0,
|
|
"step": 486
|
|
},
|
|
{
|
|
"entropy": 1.4138266146183014,
|
|
"epoch": 7.058769513314968,
|
|
"grad_norm": 4.5,
|
|
"learning_rate": 3.610985235480563e-08,
|
|
"loss": 1.3265433311462402,
|
|
"mean_token_accuracy": 0.675995796918869,
|
|
"num_tokens": 3299096.0,
|
|
"step": 487
|
|
},
|
|
{
|
|
"entropy": 1.4587520882487297,
|
|
"epoch": 7.07346189164371,
|
|
"grad_norm": 3.515625,
|
|
"learning_rate": 3.339166978780256e-08,
|
|
"loss": 1.5998899936676025,
|
|
"mean_token_accuracy": 0.6766221728175879,
|
|
"num_tokens": 3306153.0,
|
|
"step": 488
|
|
},
|
|
{
|
|
"entropy": 1.4471938125789165,
|
|
"epoch": 7.088154269972452,
|
|
"grad_norm": 4.3125,
|
|
"learning_rate": 3.077914851215585e-08,
|
|
"loss": 1.523323893547058,
|
|
"mean_token_accuracy": 0.669438187032938,
|
|
"num_tokens": 3312485.0,
|
|
"step": 489
|
|
},
|
|
{
|
|
"entropy": 1.3640289083123207,
|
|
"epoch": 7.102846648301194,
|
|
"grad_norm": 3.515625,
|
|
"learning_rate": 2.8272400439581514e-08,
|
|
"loss": 1.3801430463790894,
|
|
"mean_token_accuracy": 0.6815820559859276,
|
|
"num_tokens": 3319393.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 1.3892018273472786,
|
|
"epoch": 7.117539026629935,
|
|
"grad_norm": 3.6875,
|
|
"learning_rate": 2.5871532950824395e-08,
|
|
"loss": 1.5527251958847046,
|
|
"mean_token_accuracy": 0.6826623827219009,
|
|
"num_tokens": 3326550.0,
|
|
"step": 491
|
|
},
|
|
{
|
|
"entropy": 1.2440132424235344,
|
|
"epoch": 7.132231404958677,
|
|
"grad_norm": 4.1875,
|
|
"learning_rate": 2.3576648891056876e-08,
|
|
"loss": 1.3222265243530273,
|
|
"mean_token_accuracy": 0.6882449053227901,
|
|
"num_tokens": 3332636.0,
|
|
"step": 492
|
|
},
|
|
{
|
|
"entropy": 1.4525053799152374,
|
|
"epoch": 7.14692378328742,
|
|
"grad_norm": 3.890625,
|
|
"learning_rate": 2.1387846565474047e-08,
|
|
"loss": 1.398058533668518,
|
|
"mean_token_accuracy": 0.6715537309646606,
|
|
"num_tokens": 3340220.0,
|
|
"step": 493
|
|
},
|
|
{
|
|
"entropy": 1.3993450328707695,
|
|
"epoch": 7.161616161616162,
|
|
"grad_norm": 3.671875,
|
|
"learning_rate": 1.930521973508237e-08,
|
|
"loss": 1.3713880777359009,
|
|
"mean_token_accuracy": 0.6754298955202103,
|
|
"num_tokens": 3347587.0,
|
|
"step": 494
|
|
},
|
|
{
|
|
"entropy": 1.5341194830834866,
|
|
"epoch": 7.176308539944904,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 1.732885761268427e-08,
|
|
"loss": 1.5632487535476685,
|
|
"mean_token_accuracy": 0.6606750525534153,
|
|
"num_tokens": 3354512.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 1.3622602969408035,
|
|
"epoch": 7.191000918273645,
|
|
"grad_norm": 4.21875,
|
|
"learning_rate": 1.54588448590548e-08,
|
|
"loss": 1.3854639530181885,
|
|
"mean_token_accuracy": 0.6834001019597054,
|
|
"num_tokens": 3360290.0,
|
|
"step": 496
|
|
},
|
|
{
|
|
"entropy": 1.2942306697368622,
|
|
"epoch": 7.205693296602387,
|
|
"grad_norm": 3.78125,
|
|
"learning_rate": 1.3695261579316776e-08,
|
|
"loss": 1.2496094703674316,
|
|
"mean_token_accuracy": 0.7065734341740608,
|
|
"num_tokens": 3366921.0,
|
|
"step": 497
|
|
},
|
|
{
|
|
"entropy": 1.3717114739120007,
|
|
"epoch": 7.22038567493113,
|
|
"grad_norm": 3.609375,
|
|
"learning_rate": 1.2038183319507957e-08,
|
|
"loss": 1.334294319152832,
|
|
"mean_token_accuracy": 0.6929150782525539,
|
|
"num_tokens": 3373498.0,
|
|
"step": 498
|
|
},
|
|
{
|
|
"entropy": 1.5312567129731178,
|
|
"epoch": 7.235078053259872,
|
|
"grad_norm": 4.0,
|
|
"learning_rate": 1.0487681063345856e-08,
|
|
"loss": 1.564821481704712,
|
|
"mean_token_accuracy": 0.6466786749660969,
|
|
"num_tokens": 3380077.0,
|
|
"step": 499
|
|
},
|
|
{
|
|
"entropy": 1.3633756004273891,
|
|
"epoch": 7.249770431588614,
|
|
"grad_norm": 3.34375,
|
|
"learning_rate": 9.043821229186567e-09,
|
|
"loss": 1.402569055557251,
|
|
"mean_token_accuracy": 0.6758171431720257,
|
|
"num_tokens": 3387922.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 1.2247029319405556,
|
|
"epoch": 7.264462809917355,
|
|
"grad_norm": 3.625,
|
|
"learning_rate": 7.70666566718009e-09,
|
|
"loss": 1.2212553024291992,
|
|
"mean_token_accuracy": 0.70748520642519,
|
|
"num_tokens": 3393752.0,
|
|
"step": 501
|
|
},
|
|
{
|
|
"entropy": 1.4086985550820827,
|
|
"epoch": 7.279155188246097,
|
|
"grad_norm": 3.9375,
|
|
"learning_rate": 6.476271656620237e-09,
|
|
"loss": 1.6283751726150513,
|
|
"mean_token_accuracy": 0.6691960953176022,
|
|
"num_tokens": 3400652.0,
|
|
"step": 502
|
|
},
|
|
{
|
|
"entropy": 1.4841664768755436,
|
|
"epoch": 7.293847566574839,
|
|
"grad_norm": 4.84375,
|
|
"learning_rate": 5.352691903491303e-09,
|
|
"loss": 1.4717917442321777,
|
|
"mean_token_accuracy": 0.6619565561413765,
|
|
"num_tokens": 3405519.0,
|
|
"step": 503
|
|
},
|
|
{
|
|
"entropy": 1.504446342587471,
|
|
"epoch": 7.308539944903581,
|
|
"grad_norm": 4.3125,
|
|
"learning_rate": 4.335974538210441e-09,
|
|
"loss": 1.6657609939575195,
|
|
"mean_token_accuracy": 0.6465577762573957,
|
|
"num_tokens": 3412942.0,
|
|
"step": 504
|
|
},
|
|
{
|
|
"entropy": 1.175526186823845,
|
|
"epoch": 7.3232323232323235,
|
|
"grad_norm": 3.953125,
|
|
"learning_rate": 3.4261631135654174e-09,
|
|
"loss": 1.1835148334503174,
|
|
"mean_token_accuracy": 0.7157127186655998,
|
|
"num_tokens": 3419831.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 1.3945834636688232,
|
|
"epoch": 7.337924701561065,
|
|
"grad_norm": 3.53125,
|
|
"learning_rate": 2.623296602849712e-09,
|
|
"loss": 1.3837270736694336,
|
|
"mean_token_accuracy": 0.6788865961134434,
|
|
"num_tokens": 3427129.0,
|
|
"step": 506
|
|
},
|
|
{
|
|
"entropy": 1.2734551429748535,
|
|
"epoch": 7.352617079889807,
|
|
"grad_norm": 3.546875,
|
|
"learning_rate": 1.9274093981927476e-09,
|
|
"loss": 1.2432376146316528,
|
|
"mean_token_accuracy": 0.6965003944933414,
|
|
"num_tokens": 3434476.0,
|
|
"step": 507
|
|
},
|
|
{
|
|
"entropy": 1.2341192811727524,
|
|
"epoch": 7.367309458218549,
|
|
"grad_norm": 3.3125,
|
|
"learning_rate": 1.3385313090857888e-09,
|
|
"loss": 1.2809759378433228,
|
|
"mean_token_accuracy": 0.7019614204764366,
|
|
"num_tokens": 3441949.0,
|
|
"step": 508
|
|
},
|
|
{
|
|
"entropy": 1.2455067448318005,
|
|
"epoch": 7.382001836547291,
|
|
"grad_norm": 4.03125,
|
|
"learning_rate": 8.566875611068503e-10,
|
|
"loss": 1.1633739471435547,
|
|
"mean_token_accuracy": 0.7040832042694092,
|
|
"num_tokens": 3448515.0,
|
|
"step": 509
|
|
},
|
|
{
|
|
"entropy": 1.3598694279789925,
|
|
"epoch": 7.3966942148760335,
|
|
"grad_norm": 3.734375,
|
|
"learning_rate": 4.818987948379538e-10,
|
|
"loss": 1.4113589525222778,
|
|
"mean_token_accuracy": 0.6924026310443878,
|
|
"num_tokens": 3455770.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 1.0381613001227379,
|
|
"epoch": 7.411386593204775,
|
|
"grad_norm": 2.90625,
|
|
"learning_rate": 2.1418106498249936e-10,
|
|
"loss": 1.0188246965408325,
|
|
"mean_token_accuracy": 0.7484844401478767,
|
|
"num_tokens": 3464219.0,
|
|
"step": 511
|
|
},
|
|
{
|
|
"entropy": 1.511953193694353,
|
|
"epoch": 7.426078971533517,
|
|
"grad_norm": 4.46875,
|
|
"learning_rate": 5.354583967692728e-11,
|
|
"loss": 1.4547315835952759,
|
|
"mean_token_accuracy": 0.6628812402486801,
|
|
"num_tokens": 3469540.0,
|
|
"step": 512
|
|
},
|
|
{
|
|
"epoch": 7.426078971533517,
|
|
"eval_entropy": 1.3383248895406723,
|
|
"eval_loss": 1.4060174226760864,
|
|
"eval_mean_token_accuracy": 0.6780907139182091,
|
|
"eval_num_tokens": 3469540.0,
|
|
"eval_runtime": 1.6771,
|
|
"eval_samples_per_second": 34.583,
|
|
"eval_steps_per_second": 4.77,
|
|
"step": 512
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 512,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 8,
|
|
"save_steps": 128,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 3.581762221338624e+16,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|