1264 lines
35 KiB
JSON
1264 lines
35 KiB
JSON
[
|
|
{
|
|
"loss": 4.235999755859375,
|
|
"grad_norm": 5.0625,
|
|
"learning_rate": 1.9056785370548606e-06,
|
|
"entropy": 1.2326550805568695,
|
|
"num_tokens": 886962.0,
|
|
"mean_token_accuracy": 0.5153479687372844,
|
|
"epoch": 0.02888781896966779,
|
|
"step": 100
|
|
},
|
|
{
|
|
"loss": 1.0453578186035157,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 3.83060635226179e-06,
|
|
"entropy": 1.003088502685229,
|
|
"num_tokens": 1772142.0,
|
|
"mean_token_accuracy": 0.8255874119202296,
|
|
"epoch": 0.05777563793933558,
|
|
"step": 200
|
|
},
|
|
{
|
|
"loss": 0.6848526000976562,
|
|
"grad_norm": 0.384765625,
|
|
"learning_rate": 5.75553416746872e-06,
|
|
"entropy": 0.730368642906348,
|
|
"num_tokens": 2658161.0,
|
|
"mean_token_accuracy": 0.8729760211706161,
|
|
"epoch": 0.08666345690900337,
|
|
"step": 300
|
|
},
|
|
{
|
|
"loss": 0.6384764099121094,
|
|
"grad_norm": 0.5234375,
|
|
"learning_rate": 7.68046198267565e-06,
|
|
"entropy": 0.637606812864542,
|
|
"num_tokens": 3544837.0,
|
|
"mean_token_accuracy": 0.8758877144257228,
|
|
"epoch": 0.11555127587867116,
|
|
"step": 400
|
|
},
|
|
{
|
|
"loss": 0.6139374160766602,
|
|
"grad_norm": 0.287109375,
|
|
"learning_rate": 9.605389797882581e-06,
|
|
"entropy": 0.6128936386108399,
|
|
"num_tokens": 4431654.0,
|
|
"mean_token_accuracy": 0.8796136958400408,
|
|
"epoch": 0.14443909484833894,
|
|
"step": 500
|
|
},
|
|
{
|
|
"eval_loss": 0.6138447523117065,
|
|
"eval_runtime": 121.1778,
|
|
"eval_samples_per_second": 111.522,
|
|
"eval_steps_per_second": 18.593,
|
|
"eval_entropy": 0.6253009630979774,
|
|
"eval_num_tokens": 4431654.0,
|
|
"eval_mean_token_accuracy": 0.8792604871500983,
|
|
"epoch": 0.14443909484833894,
|
|
"step": 500
|
|
},
|
|
{
|
|
"loss": 0.5988116073608398,
|
|
"grad_norm": 0.294921875,
|
|
"learning_rate": 1.153031761308951e-05,
|
|
"entropy": 0.599972769121329,
|
|
"num_tokens": 5317947.0,
|
|
"mean_token_accuracy": 0.8811199645201365,
|
|
"epoch": 0.17332691381800674,
|
|
"step": 600
|
|
},
|
|
{
|
|
"loss": 0.5892007064819336,
|
|
"grad_norm": 0.345703125,
|
|
"learning_rate": 1.3455245428296439e-05,
|
|
"entropy": 0.5934143226842086,
|
|
"num_tokens": 6205303.0,
|
|
"mean_token_accuracy": 0.8826212238272031,
|
|
"epoch": 0.20221473278767452,
|
|
"step": 700
|
|
},
|
|
{
|
|
"loss": 0.579403953552246,
|
|
"grad_norm": 0.244140625,
|
|
"learning_rate": 1.538017324350337e-05,
|
|
"entropy": 0.5813252867758274,
|
|
"num_tokens": 7092051.0,
|
|
"mean_token_accuracy": 0.8843332821130753,
|
|
"epoch": 0.23110255175734232,
|
|
"step": 800
|
|
},
|
|
{
|
|
"loss": 0.5702339553833008,
|
|
"grad_norm": 0.2158203125,
|
|
"learning_rate": 1.73051010587103e-05,
|
|
"entropy": 0.5745066069066525,
|
|
"num_tokens": 7978376.0,
|
|
"mean_token_accuracy": 0.884980032046636,
|
|
"epoch": 0.2599903707270101,
|
|
"step": 900
|
|
},
|
|
{
|
|
"loss": 0.5615914154052735,
|
|
"grad_norm": 0.326171875,
|
|
"learning_rate": 1.923002887391723e-05,
|
|
"entropy": 0.564577516814073,
|
|
"num_tokens": 8865602.0,
|
|
"mean_token_accuracy": 0.8863117796182632,
|
|
"epoch": 0.2888781896966779,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"eval_loss": 0.5681502223014832,
|
|
"eval_runtime": 121.1382,
|
|
"eval_samples_per_second": 111.559,
|
|
"eval_steps_per_second": 18.599,
|
|
"eval_entropy": 0.5520287772257805,
|
|
"eval_num_tokens": 8865602.0,
|
|
"eval_mean_token_accuracy": 0.8852167688789655,
|
|
"epoch": 0.2888781896966779,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"loss": 0.5591428756713868,
|
|
"grad_norm": 0.2392578125,
|
|
"learning_rate": 1.9997966645755392e-05,
|
|
"entropy": 0.5630815910299619,
|
|
"num_tokens": 9752872.0,
|
|
"mean_token_accuracy": 0.8859697584311167,
|
|
"epoch": 0.3177660086663457,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"loss": 0.5547999954223632,
|
|
"grad_norm": 0.2255859375,
|
|
"learning_rate": 1.9985543586468365e-05,
|
|
"entropy": 0.5587607964873313,
|
|
"num_tokens": 10640416.0,
|
|
"mean_token_accuracy": 0.886342776119709,
|
|
"epoch": 0.3466538276360135,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"loss": 0.5420237731933594,
|
|
"grad_norm": 0.208984375,
|
|
"learning_rate": 1.9961841124880657e-05,
|
|
"entropy": 0.5386069346467653,
|
|
"num_tokens": 11526809.0,
|
|
"mean_token_accuracy": 0.8891533743341764,
|
|
"epoch": 0.3755416466056813,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"loss": 0.5437238693237305,
|
|
"grad_norm": 0.20703125,
|
|
"learning_rate": 1.9926886034657355e-05,
|
|
"entropy": 0.5422710782289505,
|
|
"num_tokens": 12412683.0,
|
|
"mean_token_accuracy": 0.8881420868635178,
|
|
"epoch": 0.40442946557534903,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"loss": 0.538447380065918,
|
|
"grad_norm": 0.189453125,
|
|
"learning_rate": 1.9880717800131158e-05,
|
|
"entropy": 0.5379938718179862,
|
|
"num_tokens": 13297912.0,
|
|
"mean_token_accuracy": 0.8887334618965784,
|
|
"epoch": 0.43331728454501683,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"eval_loss": 0.5418813228607178,
|
|
"eval_runtime": 121.0107,
|
|
"eval_samples_per_second": 111.676,
|
|
"eval_steps_per_second": 18.618,
|
|
"eval_entropy": 0.5439975674264547,
|
|
"eval_num_tokens": 13297912.0,
|
|
"eval_mean_token_accuracy": 0.8884000489831341,
|
|
"epoch": 0.43331728454501683,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"loss": 0.533262062072754,
|
|
"grad_norm": 0.203125,
|
|
"learning_rate": 1.9823388571701914e-05,
|
|
"entropy": 0.5346410566071669,
|
|
"num_tokens": 14184641.0,
|
|
"mean_token_accuracy": 0.889172242085139,
|
|
"epoch": 0.46220510351468463,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"loss": 0.5277928924560547,
|
|
"grad_norm": 0.2021484375,
|
|
"learning_rate": 1.975496310692893e-05,
|
|
"entropy": 0.528671110868454,
|
|
"num_tokens": 15071591.0,
|
|
"mean_token_accuracy": 0.889830404818058,
|
|
"epoch": 0.49109292248435243,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"loss": 0.5305094909667969,
|
|
"grad_norm": 0.2294921875,
|
|
"learning_rate": 1.9675518697382575e-05,
|
|
"entropy": 0.5330572768549124,
|
|
"num_tokens": 15958400.0,
|
|
"mean_token_accuracy": 0.8894869204362233,
|
|
"epoch": 0.5199807414540202,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"loss": 0.5314492797851562,
|
|
"grad_norm": 0.2080078125,
|
|
"learning_rate": 1.958514508133779e-05,
|
|
"entropy": 0.5313531577587127,
|
|
"num_tokens": 16845090.0,
|
|
"mean_token_accuracy": 0.8891615062952042,
|
|
"epoch": 0.548868560423688,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"loss": 0.5224239349365234,
|
|
"grad_norm": 0.2265625,
|
|
"learning_rate": 1.9483944342408145e-05,
|
|
"entropy": 0.521418957610925,
|
|
"num_tokens": 17731372.0,
|
|
"mean_token_accuracy": 0.8913801329334577,
|
|
"epoch": 0.5777563793933558,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"eval_loss": 0.5295727849006653,
|
|
"eval_runtime": 120.9976,
|
|
"eval_samples_per_second": 111.688,
|
|
"eval_steps_per_second": 18.62,
|
|
"eval_entropy": 0.5356734260367755,
|
|
"eval_num_tokens": 17731372.0,
|
|
"eval_mean_token_accuracy": 0.889919996896533,
|
|
"epoch": 0.5777563793933558,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"loss": 0.5248940658569335,
|
|
"grad_norm": 0.1904296875,
|
|
"learning_rate": 1.9372030794234916e-05,
|
|
"entropy": 0.5240081022679806,
|
|
"num_tokens": 18620093.0,
|
|
"mean_token_accuracy": 0.8901815564433734,
|
|
"epoch": 0.6066441983630236,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"loss": 0.5177249526977539,
|
|
"grad_norm": 0.1884765625,
|
|
"learning_rate": 1.9249530851361477e-05,
|
|
"entropy": 0.5155584744115671,
|
|
"num_tokens": 19506319.0,
|
|
"mean_token_accuracy": 0.892069263557593,
|
|
"epoch": 0.6355320173326914,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"loss": 0.5167172622680664,
|
|
"grad_norm": 0.2060546875,
|
|
"learning_rate": 1.9116582886438787e-05,
|
|
"entropy": 0.5149475511411826,
|
|
"num_tokens": 20392582.0,
|
|
"mean_token_accuracy": 0.8917997235059738,
|
|
"epoch": 0.6644198363023591,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"loss": 0.5186575698852539,
|
|
"grad_norm": 0.2158203125,
|
|
"learning_rate": 1.8973337073923406e-05,
|
|
"entropy": 0.5176833253105482,
|
|
"num_tokens": 21281188.0,
|
|
"mean_token_accuracy": 0.891216093202432,
|
|
"epoch": 0.693307655272027,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"loss": 0.5165076446533203,
|
|
"grad_norm": 0.208984375,
|
|
"learning_rate": 1.881995522044441e-05,
|
|
"entropy": 0.5149765905241172,
|
|
"num_tokens": 22166818.0,
|
|
"mean_token_accuracy": 0.8918550156553586,
|
|
"epoch": 0.7221954742416947,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"eval_loss": 0.5220824480056763,
|
|
"eval_runtime": 120.9697,
|
|
"eval_samples_per_second": 111.714,
|
|
"eval_steps_per_second": 18.625,
|
|
"eval_entropy": 0.5212392443902641,
|
|
"eval_num_tokens": 22166818.0,
|
|
"eval_mean_token_accuracy": 0.8910056220866075,
|
|
"epoch": 0.7221954742416947,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"loss": 0.5141453170776367,
|
|
"grad_norm": 0.185546875,
|
|
"learning_rate": 1.8656610582030976e-05,
|
|
"entropy": 0.5123816165824732,
|
|
"num_tokens": 23055206.0,
|
|
"mean_token_accuracy": 0.8920613800485929,
|
|
"epoch": 0.7510832932113626,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"loss": 0.5139566802978516,
|
|
"grad_norm": 0.193359375,
|
|
"learning_rate": 1.848348766840703e-05,
|
|
"entropy": 0.5132978439331055,
|
|
"num_tokens": 23942436.0,
|
|
"mean_token_accuracy": 0.8916262516379356,
|
|
"epoch": 0.7799711121810303,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"loss": 0.5115699005126954,
|
|
"grad_norm": 0.2255859375,
|
|
"learning_rate": 1.8300782034573984e-05,
|
|
"entropy": 0.5114874669909477,
|
|
"num_tokens": 24827923.0,
|
|
"mean_token_accuracy": 0.8923087509473165,
|
|
"epoch": 0.8088589311506981,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"loss": 0.5094342803955079,
|
|
"grad_norm": 0.1962890625,
|
|
"learning_rate": 1.8108700059917083e-05,
|
|
"entropy": 0.5071784610549609,
|
|
"num_tokens": 25714586.0,
|
|
"mean_token_accuracy": 0.8931817663709323,
|
|
"epoch": 0.8377467501203659,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"loss": 0.511685447692871,
|
|
"grad_norm": 0.20703125,
|
|
"learning_rate": 1.7907458715084743e-05,
|
|
"entropy": 0.5103998957574367,
|
|
"num_tokens": 26601471.0,
|
|
"mean_token_accuracy": 0.8925204570094745,
|
|
"epoch": 0.8666345690900337,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"eval_loss": 0.5168190002441406,
|
|
"eval_runtime": 121.3252,
|
|
"eval_samples_per_second": 111.387,
|
|
"eval_steps_per_second": 18.57,
|
|
"eval_entropy": 0.5127926120560168,
|
|
"eval_num_tokens": 26601471.0,
|
|
"eval_mean_token_accuracy": 0.8917217899892682,
|
|
"epoch": 0.8666345690900337,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"loss": 0.510882911682129,
|
|
"grad_norm": 0.2216796875,
|
|
"learning_rate": 1.769728531690437e-05,
|
|
"entropy": 0.5096460196375847,
|
|
"num_tokens": 27489339.0,
|
|
"mean_token_accuracy": 0.8923352911074957,
|
|
"epoch": 0.8955223880597015,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"loss": 0.5120228195190429,
|
|
"grad_norm": 0.20703125,
|
|
"learning_rate": 1.7478417271611325e-05,
|
|
"entropy": 0.512226097236077,
|
|
"num_tokens": 28377750.0,
|
|
"mean_token_accuracy": 0.8917342044909795,
|
|
"epoch": 0.9244102070293693,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"loss": 0.5107974243164063,
|
|
"grad_norm": 0.2177734375,
|
|
"learning_rate": 1.725110180668124e-05,
|
|
"entropy": 0.5103923585514227,
|
|
"num_tokens": 29264856.0,
|
|
"mean_token_accuracy": 0.8925020119547844,
|
|
"epoch": 0.9532980259990371,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"loss": 0.5124374008178711,
|
|
"grad_norm": 0.18359375,
|
|
"learning_rate": 1.7015595691568466e-05,
|
|
"entropy": 0.5123077415426572,
|
|
"num_tokens": 30150254.0,
|
|
"mean_token_accuracy": 0.8922114634513855,
|
|
"epoch": 0.9821858449687049,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"loss": 0.5024247741699219,
|
|
"grad_norm": 0.2109375,
|
|
"learning_rate": 1.6772164947666184e-05,
|
|
"entropy": 0.5024208198721991,
|
|
"num_tokens": 31033148.0,
|
|
"mean_token_accuracy": 0.8933051790680774,
|
|
"epoch": 1.0109773712084738,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"eval_loss": 0.5133240818977356,
|
|
"eval_runtime": 121.1413,
|
|
"eval_samples_per_second": 111.556,
|
|
"eval_steps_per_second": 18.598,
|
|
"eval_entropy": 0.49454835110169965,
|
|
"eval_num_tokens": 31033148.0,
|
|
"eval_mean_token_accuracy": 0.8922610160408155,
|
|
"epoch": 1.0109773712084738,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"loss": 0.49717609405517577,
|
|
"grad_norm": 0.2138671875,
|
|
"learning_rate": 1.6521084547815804e-05,
|
|
"entropy": 0.49633729274074234,
|
|
"num_tokens": 31919854.0,
|
|
"mean_token_accuracy": 0.8940571908156077,
|
|
"epoch": 1.0398651901781415,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"loss": 0.49857440948486326,
|
|
"grad_norm": 0.2060546875,
|
|
"learning_rate": 1.6262638105704958e-05,
|
|
"entropy": 0.4946801410615444,
|
|
"num_tokens": 32806935.0,
|
|
"mean_token_accuracy": 0.8944180096189182,
|
|
"epoch": 1.0687530091478092,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"loss": 0.49641082763671873,
|
|
"grad_norm": 0.2001953125,
|
|
"learning_rate": 1.5997117555505138e-05,
|
|
"entropy": 0.4942504517734051,
|
|
"num_tokens": 33692824.0,
|
|
"mean_token_accuracy": 0.894220456580321,
|
|
"epoch": 1.0976408281174772,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"loss": 0.49624130249023435,
|
|
"grad_norm": 0.212890625,
|
|
"learning_rate": 1.5724822822110655e-05,
|
|
"entropy": 0.49494813561439516,
|
|
"num_tokens": 34579763.0,
|
|
"mean_token_accuracy": 0.8944190714756648,
|
|
"epoch": 1.126528647087145,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"loss": 0.49298313140869143,
|
|
"grad_norm": 0.1904296875,
|
|
"learning_rate": 1.5446061482351525e-05,
|
|
"entropy": 0.4929438012341658,
|
|
"num_tokens": 35466508.0,
|
|
"mean_token_accuracy": 0.8952326637506485,
|
|
"epoch": 1.1554164660568127,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"eval_loss": 0.5108281970024109,
|
|
"eval_runtime": 121.0509,
|
|
"eval_samples_per_second": 111.639,
|
|
"eval_steps_per_second": 18.612,
|
|
"eval_entropy": 0.501549962833623,
|
|
"eval_num_tokens": 35466508.0,
|
|
"eval_mean_token_accuracy": 0.8925832025855581,
|
|
"epoch": 1.1554164660568127,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"loss": 0.4936357116699219,
|
|
"grad_norm": 0.2001953125,
|
|
"learning_rate": 1.5161148417562932e-05,
|
|
"entropy": 0.4936923775573572,
|
|
"num_tokens": 36353094.0,
|
|
"mean_token_accuracy": 0.8945872736970584,
|
|
"epoch": 1.1843042850264804,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"loss": 0.49270416259765626,
|
|
"grad_norm": 0.1943359375,
|
|
"learning_rate": 1.4870405457903703e-05,
|
|
"entropy": 0.4933264861504237,
|
|
"num_tokens": 37239421.0,
|
|
"mean_token_accuracy": 0.8946521012981733,
|
|
"epoch": 1.2131921039961484,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"loss": 0.49468719482421875,
|
|
"grad_norm": 0.2177734375,
|
|
"learning_rate": 1.457416101882561e-05,
|
|
"entropy": 0.49451990927259126,
|
|
"num_tokens": 38126461.0,
|
|
"mean_token_accuracy": 0.8947041656573613,
|
|
"epoch": 1.2420799229658162,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"loss": 0.48858612060546874,
|
|
"grad_norm": 0.1923828125,
|
|
"learning_rate": 1.4272749730104063e-05,
|
|
"entropy": 0.4893125213185946,
|
|
"num_tokens": 39013383.0,
|
|
"mean_token_accuracy": 0.8960103297233581,
|
|
"epoch": 1.270967741935484,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"loss": 0.48944534301757814,
|
|
"grad_norm": 0.2109375,
|
|
"learning_rate": 1.3966512057849295e-05,
|
|
"entropy": 0.48983013848463697,
|
|
"num_tokens": 39900377.0,
|
|
"mean_token_accuracy": 0.8956057903170586,
|
|
"epoch": 1.2998555609051516,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"eval_loss": 0.5091761946678162,
|
|
"eval_runtime": 121.0065,
|
|
"eval_samples_per_second": 111.68,
|
|
"eval_steps_per_second": 18.619,
|
|
"eval_entropy": 0.48820916045733254,
|
|
"eval_num_tokens": 39900377.0,
|
|
"eval_mean_token_accuracy": 0.892838896749499,
|
|
"epoch": 1.2998555609051516,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"loss": 0.49533580780029296,
|
|
"grad_norm": 0.212890625,
|
|
"learning_rate": 1.3655793919924975e-05,
|
|
"entropy": 0.49353456447521843,
|
|
"num_tokens": 40787140.0,
|
|
"mean_token_accuracy": 0.8943837519486745,
|
|
"epoch": 1.3287433798748194,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"loss": 0.4934038925170898,
|
|
"grad_norm": 0.1962890625,
|
|
"learning_rate": 1.3340946295208658e-05,
|
|
"entropy": 0.4935056679447492,
|
|
"num_tokens": 41674591.0,
|
|
"mean_token_accuracy": 0.894527651667595,
|
|
"epoch": 1.3576311988444871,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"loss": 0.4890486907958984,
|
|
"grad_norm": 0.205078125,
|
|
"learning_rate": 1.302232482713546e-05,
|
|
"entropy": 0.48878600150346757,
|
|
"num_tokens": 42561974.0,
|
|
"mean_token_accuracy": 0.8960626801848411,
|
|
"epoch": 1.386519017814155,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"loss": 0.4930953598022461,
|
|
"grad_norm": 0.2177734375,
|
|
"learning_rate": 1.2700289421972767e-05,
|
|
"entropy": 0.49330734809239707,
|
|
"num_tokens": 43449322.0,
|
|
"mean_token_accuracy": 0.8950501901904742,
|
|
"epoch": 1.4154068367838228,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"loss": 0.4914593887329102,
|
|
"grad_norm": 0.216796875,
|
|
"learning_rate": 1.237520384227977e-05,
|
|
"entropy": 0.490766015201807,
|
|
"num_tokens": 44335714.0,
|
|
"mean_token_accuracy": 0.8949617861708006,
|
|
"epoch": 1.4442946557534906,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"eval_loss": 0.5075456500053406,
|
|
"eval_runtime": 121.5152,
|
|
"eval_samples_per_second": 111.212,
|
|
"eval_steps_per_second": 18.541,
|
|
"eval_entropy": 0.494863443701626,
|
|
"eval_num_tokens": 44335714.0,
|
|
"eval_mean_token_accuracy": 0.8929759670351539,
|
|
"epoch": 1.4442946557534906,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"loss": 0.49099269866943357,
|
|
"grad_norm": 0.21875,
|
|
"learning_rate": 1.2047435296011018e-05,
|
|
"entropy": 0.49070664674043657,
|
|
"num_tokens": 45223045.0,
|
|
"mean_token_accuracy": 0.8956617527206738,
|
|
"epoch": 1.4731824747231583,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"loss": 0.4920531463623047,
|
|
"grad_norm": 0.1943359375,
|
|
"learning_rate": 1.171735402172818e-05,
|
|
"entropy": 0.49278342053294183,
|
|
"num_tokens": 46109072.0,
|
|
"mean_token_accuracy": 0.8947107720375062,
|
|
"epoch": 1.5020702936928263,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"loss": 0.4921791458129883,
|
|
"grad_norm": 0.2197265625,
|
|
"learning_rate": 1.1385332870388473e-05,
|
|
"entropy": 0.49249339212973914,
|
|
"num_tokens": 46995491.0,
|
|
"mean_token_accuracy": 0.8952183723449707,
|
|
"epoch": 1.530958112662494,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"loss": 0.4928662872314453,
|
|
"grad_norm": 0.2158203125,
|
|
"learning_rate": 1.1051746884182222e-05,
|
|
"entropy": 0.4916799567639828,
|
|
"num_tokens": 47881628.0,
|
|
"mean_token_accuracy": 0.8948841803272565,
|
|
"epoch": 1.5598459316321618,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"loss": 0.4906736373901367,
|
|
"grad_norm": 0.2041015625,
|
|
"learning_rate": 1.0716972872895268e-05,
|
|
"entropy": 0.4912960589925448,
|
|
"num_tokens": 48769412.0,
|
|
"mean_token_accuracy": 0.8952219298481942,
|
|
"epoch": 1.5887337506018295,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"eval_loss": 0.5062649846076965,
|
|
"eval_runtime": 121.6481,
|
|
"eval_samples_per_second": 111.091,
|
|
"eval_steps_per_second": 18.521,
|
|
"eval_entropy": 0.49529570853027616,
|
|
"eval_num_tokens": 48769412.0,
|
|
"eval_mean_token_accuracy": 0.8931941252732563,
|
|
"epoch": 1.5887337506018295,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"loss": 0.49281944274902345,
|
|
"grad_norm": 0.2216796875,
|
|
"learning_rate": 1.0381388988274725e-05,
|
|
"entropy": 0.4927716707189878,
|
|
"num_tokens": 49656341.0,
|
|
"mean_token_accuracy": 0.8945845268170038,
|
|
"epoch": 1.6176215695714973,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"loss": 0.4927938842773438,
|
|
"grad_norm": 0.1953125,
|
|
"learning_rate": 1.0045374296878913e-05,
|
|
"entropy": 0.49278713996211687,
|
|
"num_tokens": 50542195.0,
|
|
"mean_token_accuracy": 0.8949234291911126,
|
|
"epoch": 1.646509388541165,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"loss": 0.49176959991455077,
|
|
"grad_norm": 0.2099609375,
|
|
"learning_rate": 9.709308351893933e-06,
|
|
"entropy": 0.4925773192942142,
|
|
"num_tokens": 51428935.0,
|
|
"mean_token_accuracy": 0.8949492185314496,
|
|
"epoch": 1.675397207510833,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"loss": 0.49320865631103517,
|
|
"grad_norm": 0.2041015625,
|
|
"learning_rate": 9.37357076440057e-06,
|
|
"entropy": 0.4937224745750427,
|
|
"num_tokens": 52316133.0,
|
|
"mean_token_accuracy": 0.894678007364273,
|
|
"epoch": 1.7042850264805007,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"loss": 0.49113529205322265,
|
|
"grad_norm": 0.1943359375,
|
|
"learning_rate": 9.038540774575775e-06,
|
|
"entropy": 0.49027820602059363,
|
|
"num_tokens": 53203155.0,
|
|
"mean_token_accuracy": 0.8950705190499624,
|
|
"epoch": 1.7331728454501685,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"eval_loss": 0.5055996179580688,
|
|
"eval_runtime": 121.1117,
|
|
"eval_samples_per_second": 111.583,
|
|
"eval_steps_per_second": 18.603,
|
|
"eval_entropy": 0.4944385351126214,
|
|
"eval_num_tokens": 53203155.0,
|
|
"eval_mean_token_accuracy": 0.8932915333636962,
|
|
"epoch": 1.7331728454501685,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"loss": 0.4895584487915039,
|
|
"grad_norm": 0.2216796875,
|
|
"learning_rate": 8.704596823313166e-06,
|
|
"entropy": 0.4888417159020901,
|
|
"num_tokens": 54089263.0,
|
|
"mean_token_accuracy": 0.8957057174046834,
|
|
"epoch": 1.7620606644198364,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"loss": 0.4907315063476563,
|
|
"grad_norm": 0.2294921875,
|
|
"learning_rate": 8.372116124746306e-06,
|
|
"entropy": 0.4891975859304269,
|
|
"num_tokens": 54975703.0,
|
|
"mean_token_accuracy": 0.8951012322306633,
|
|
"epoch": 1.7909484833895042,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"loss": 0.4888127136230469,
|
|
"grad_norm": 0.2099609375,
|
|
"learning_rate": 8.04147424015775e-06,
|
|
"entropy": 0.4891936507821083,
|
|
"num_tokens": 55863620.0,
|
|
"mean_token_accuracy": 0.8955637833476067,
|
|
"epoch": 1.819836302359172,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"loss": 0.4893819427490234,
|
|
"grad_norm": 0.19921875,
|
|
"learning_rate": 7.713044653755093e-06,
|
|
"entropy": 0.48820455322662987,
|
|
"num_tokens": 56750061.0,
|
|
"mean_token_accuracy": 0.8957294267416,
|
|
"epoch": 1.8487241213288397,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"loss": 0.48793128967285154,
|
|
"grad_norm": 0.2080078125,
|
|
"learning_rate": 7.387198350793201e-06,
|
|
"entropy": 0.4864146198829015,
|
|
"num_tokens": 57636695.0,
|
|
"mean_token_accuracy": 0.895841832458973,
|
|
"epoch": 1.8776119402985074,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"eval_loss": 0.5049271583557129,
|
|
"eval_runtime": 121.3147,
|
|
"eval_samples_per_second": 111.396,
|
|
"eval_steps_per_second": 18.572,
|
|
"eval_entropy": 0.4945345818731873,
|
|
"eval_num_tokens": 57636695.0,
|
|
"eval_mean_token_accuracy": 0.8933592520974977,
|
|
"epoch": 1.8776119402985074,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"loss": 0.4899849700927734,
|
|
"grad_norm": 0.21484375,
|
|
"learning_rate": 7.0643033985192415e-06,
|
|
"entropy": 0.48893075570464134,
|
|
"num_tokens": 58523499.0,
|
|
"mean_token_accuracy": 0.8950817889968554,
|
|
"epoch": 1.9064997592681752,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"loss": 0.4888337707519531,
|
|
"grad_norm": 0.2099609375,
|
|
"learning_rate": 6.744724530413773e-06,
|
|
"entropy": 0.4885369242727757,
|
|
"num_tokens": 59409639.0,
|
|
"mean_token_accuracy": 0.895801799595356,
|
|
"epoch": 1.935387578237843,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"loss": 0.48825778961181643,
|
|
"grad_norm": 0.2353515625,
|
|
"learning_rate": 6.428822734197584e-06,
|
|
"entropy": 0.48850176721811295,
|
|
"num_tokens": 60295475.0,
|
|
"mean_token_accuracy": 0.8954997793833415,
|
|
"epoch": 1.9642753972075109,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"loss": 0.48861705780029296,
|
|
"grad_norm": 0.2041015625,
|
|
"learning_rate": 6.116954844069659e-06,
|
|
"entropy": 0.4889233031868935,
|
|
"num_tokens": 61183207.0,
|
|
"mean_token_accuracy": 0.8955455178022385,
|
|
"epoch": 1.9931632161771786,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"loss": 0.4824595642089844,
|
|
"grad_norm": 0.2021484375,
|
|
"learning_rate": 5.8094731376368115e-06,
|
|
"entropy": 0.4824550705011872,
|
|
"num_tokens": 62065451.0,
|
|
"mean_token_accuracy": 0.8966363853834146,
|
|
"epoch": 2.0219547424169475,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"eval_loss": 0.5048847794532776,
|
|
"eval_runtime": 120.9542,
|
|
"eval_samples_per_second": 111.728,
|
|
"eval_steps_per_second": 18.627,
|
|
"eval_entropy": 0.485265318785463,
|
|
"eval_num_tokens": 62065451.0,
|
|
"eval_mean_token_accuracy": 0.8934332738174526,
|
|
"epoch": 2.0219547424169475,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"loss": 0.48333885192871096,
|
|
"grad_norm": 0.2197265625,
|
|
"learning_rate": 5.506724937990357e-06,
|
|
"entropy": 0.4820535824199518,
|
|
"num_tokens": 62952513.0,
|
|
"mean_token_accuracy": 0.8962123716870943,
|
|
"epoch": 2.0508425613866152,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"loss": 0.4831295394897461,
|
|
"grad_norm": 0.2255859375,
|
|
"learning_rate": 5.2090522213792734e-06,
|
|
"entropy": 0.48218101014693576,
|
|
"num_tokens": 63838911.0,
|
|
"mean_token_accuracy": 0.8962508221467336,
|
|
"epoch": 2.079730380356283,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"loss": 0.48569129943847655,
|
|
"grad_norm": 0.2421875,
|
|
"learning_rate": 4.916791230922975e-06,
|
|
"entropy": 0.48549183184901873,
|
|
"num_tokens": 64725297.0,
|
|
"mean_token_accuracy": 0.895985666513443,
|
|
"epoch": 2.1086181993259507,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"loss": 0.482639274597168,
|
|
"grad_norm": 0.2236328125,
|
|
"learning_rate": 4.630272096800113e-06,
|
|
"entropy": 0.48277713745832446,
|
|
"num_tokens": 65611469.0,
|
|
"mean_token_accuracy": 0.8964148736000062,
|
|
"epoch": 2.1375060182956185,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"loss": 0.48228111267089846,
|
|
"grad_norm": 0.2138671875,
|
|
"learning_rate": 4.3498184633423775e-06,
|
|
"entropy": 0.4816711642841498,
|
|
"num_tokens": 66498972.0,
|
|
"mean_token_accuracy": 0.8969011158744494,
|
|
"epoch": 2.1663938372652867,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"eval_loss": 0.5047065019607544,
|
|
"eval_runtime": 121.1419,
|
|
"eval_samples_per_second": 111.555,
|
|
"eval_steps_per_second": 18.598,
|
|
"eval_entropy": 0.4889771351272458,
|
|
"eval_num_tokens": 66498972.0,
|
|
"eval_mean_token_accuracy": 0.893424384577032,
|
|
"epoch": 2.1663938372652867,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"loss": 0.4861163330078125,
|
|
"grad_norm": 0.1884765625,
|
|
"learning_rate": 4.075747123454544e-06,
|
|
"entropy": 0.4855067411561807,
|
|
"num_tokens": 67386206.0,
|
|
"mean_token_accuracy": 0.8959952719012896,
|
|
"epoch": 2.1952816562349544,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"loss": 0.4826494598388672,
|
|
"grad_norm": 0.2373046875,
|
|
"learning_rate": 3.808367660773684e-06,
|
|
"entropy": 0.48214618876576426,
|
|
"num_tokens": 68272531.0,
|
|
"mean_token_accuracy": 0.8969175884127617,
|
|
"epoch": 2.224169475204622,
|
|
"step": 7700
|
|
},
|
|
{
|
|
"loss": 0.4870090103149414,
|
|
"grad_norm": 0.2109375,
|
|
"learning_rate": 3.547982099971804e-06,
|
|
"entropy": 0.48708953022956847,
|
|
"num_tokens": 69159267.0,
|
|
"mean_token_accuracy": 0.8955915210644404,
|
|
"epoch": 2.25305729417429,
|
|
"step": 7800
|
|
},
|
|
{
|
|
"loss": 0.48725185394287107,
|
|
"grad_norm": 0.205078125,
|
|
"learning_rate": 3.2948845655968743e-06,
|
|
"entropy": 0.48610410739978155,
|
|
"num_tokens": 70046517.0,
|
|
"mean_token_accuracy": 0.8959172365069389,
|
|
"epoch": 2.2819451131439576,
|
|
"step": 7900
|
|
},
|
|
{
|
|
"loss": 0.4846121597290039,
|
|
"grad_norm": 0.23828125,
|
|
"learning_rate": 3.0493609498376174e-06,
|
|
"entropy": 0.4851847393314044,
|
|
"num_tokens": 70933414.0,
|
|
"mean_token_accuracy": 0.8960670222838719,
|
|
"epoch": 2.3108329321136254,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"eval_loss": 0.5046308040618896,
|
|
"eval_runtime": 121.3471,
|
|
"eval_samples_per_second": 111.367,
|
|
"eval_steps_per_second": 18.567,
|
|
"eval_entropy": 0.4883948566513595,
|
|
"eval_num_tokens": 70933414.0,
|
|
"eval_mean_token_accuracy": 0.8934336849387042,
|
|
"epoch": 2.3108329321136254,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"loss": 0.48443710327148437,
|
|
"grad_norm": 0.1962890625,
|
|
"learning_rate": 2.811688589587358e-06,
|
|
"entropy": 0.48562209352850916,
|
|
"num_tokens": 71820845.0,
|
|
"mean_token_accuracy": 0.8964889810482661,
|
|
"epoch": 2.339720751083293,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"loss": 0.4795720672607422,
|
|
"grad_norm": 0.197265625,
|
|
"learning_rate": 2.582135953171717e-06,
|
|
"entropy": 0.4802011082569758,
|
|
"num_tokens": 72707499.0,
|
|
"mean_token_accuracy": 0.8968185101946194,
|
|
"epoch": 2.368608570052961,
|
|
"step": 8200
|
|
},
|
|
{
|
|
"loss": 0.48555206298828124,
|
|
"grad_norm": 0.23828125,
|
|
"learning_rate": 2.3609623370939707e-06,
|
|
"entropy": 0.48567692418893177,
|
|
"num_tokens": 73593737.0,
|
|
"mean_token_accuracy": 0.8955408794681231,
|
|
"epoch": 2.3974963890226286,
|
|
"step": 8300
|
|
},
|
|
{
|
|
"loss": 0.4849796676635742,
|
|
"grad_norm": 0.2314453125,
|
|
"learning_rate": 2.148417573140682e-06,
|
|
"entropy": 0.4830826353530089,
|
|
"num_tokens": 74480089.0,
|
|
"mean_token_accuracy": 0.8961175856987635,
|
|
"epoch": 2.426384207992297,
|
|
"step": 8400
|
|
},
|
|
{
|
|
"loss": 0.48482059478759765,
|
|
"grad_norm": 0.2119140625,
|
|
"learning_rate": 1.9447417461784214e-06,
|
|
"entropy": 0.48449866617719334,
|
|
"num_tokens": 75367239.0,
|
|
"mean_token_accuracy": 0.8959982444842657,
|
|
"epoch": 2.4552720269619646,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"eval_loss": 0.5046458840370178,
|
|
"eval_runtime": 121.1164,
|
|
"eval_samples_per_second": 111.579,
|
|
"eval_steps_per_second": 18.602,
|
|
"eval_entropy": 0.4878934076691647,
|
|
"eval_num_tokens": 75367239.0,
|
|
"eval_mean_token_accuracy": 0.8934249661257676,
|
|
"epoch": 2.4552720269619646,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"loss": 0.48140850067138674,
|
|
"grad_norm": 0.2138671875,
|
|
"learning_rate": 1.7501649229603213e-06,
|
|
"entropy": 0.48195242514212927,
|
|
"num_tokens": 76254713.0,
|
|
"mean_token_accuracy": 0.8970886744062105,
|
|
"epoch": 2.4841598459316323,
|
|
"step": 8600
|
|
},
|
|
{
|
|
"loss": 0.48088233947753906,
|
|
"grad_norm": 0.203125,
|
|
"learning_rate": 1.564906892248851e-06,
|
|
"entropy": 0.4809670109550158,
|
|
"num_tokens": 77142380.0,
|
|
"mean_token_accuracy": 0.8968148503700892,
|
|
"epoch": 2.5130476649013,
|
|
"step": 8700
|
|
},
|
|
{
|
|
"loss": 0.48114513397216796,
|
|
"grad_norm": 0.201171875,
|
|
"learning_rate": 1.389176916548326e-06,
|
|
"entropy": 0.48076146269838016,
|
|
"num_tokens": 78029752.0,
|
|
"mean_token_accuracy": 0.8966197535395622,
|
|
"epoch": 2.541935483870968,
|
|
"step": 8800
|
|
},
|
|
{
|
|
"loss": 0.484271240234375,
|
|
"grad_norm": 0.216796875,
|
|
"learning_rate": 1.2231734957275866e-06,
|
|
"entropy": 0.48319436301787694,
|
|
"num_tokens": 78914544.0,
|
|
"mean_token_accuracy": 0.8967648883660634,
|
|
"epoch": 2.5708233028406355,
|
|
"step": 8900
|
|
},
|
|
{
|
|
"loss": 0.4859063720703125,
|
|
"grad_norm": 0.22265625,
|
|
"learning_rate": 1.067084142799869e-06,
|
|
"entropy": 0.4856315462787946,
|
|
"num_tokens": 79801258.0,
|
|
"mean_token_accuracy": 0.8954879422982533,
|
|
"epoch": 2.5997111218103033,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"eval_loss": 0.5045983195304871,
|
|
"eval_runtime": 121.336,
|
|
"eval_samples_per_second": 111.377,
|
|
"eval_steps_per_second": 18.568,
|
|
"eval_entropy": 0.4877890812467058,
|
|
"eval_num_tokens": 79801258.0,
|
|
"eval_mean_token_accuracy": 0.8934447498941654,
|
|
"epoch": 2.5997111218103033,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"loss": 0.48263256072998045,
|
|
"grad_norm": 0.23046875,
|
|
"learning_rate": 9.210851721131397e-07,
|
|
"entropy": 0.48227159813046455,
|
|
"num_tokens": 80686981.0,
|
|
"mean_token_accuracy": 0.8968270209431648,
|
|
"epoch": 2.628598940779971,
|
|
"step": 9100
|
|
},
|
|
{
|
|
"loss": 0.4856527709960938,
|
|
"grad_norm": 0.193359375,
|
|
"learning_rate": 7.85341500190131e-07,
|
|
"entropy": 0.4843608529369036,
|
|
"num_tokens": 81574851.0,
|
|
"mean_token_accuracy": 0.8959235412875811,
|
|
"epoch": 2.6574867597496388,
|
|
"step": 9200
|
|
},
|
|
{
|
|
"loss": 0.48150875091552736,
|
|
"grad_norm": 0.2265625,
|
|
"learning_rate": 6.600064594430566e-07,
|
|
"entropy": 0.482298014909029,
|
|
"num_tokens": 82461264.0,
|
|
"mean_token_accuracy": 0.8967729851603508,
|
|
"epoch": 2.686374578719307,
|
|
"step": 9300
|
|
},
|
|
{
|
|
"loss": 0.485591926574707,
|
|
"grad_norm": 0.201171875,
|
|
"learning_rate": 5.452216249734332e-07,
|
|
"entropy": 0.4847200144827366,
|
|
"num_tokens": 83348000.0,
|
|
"mean_token_accuracy": 0.8960838095347087,
|
|
"epoch": 2.7152623976889743,
|
|
"step": 9400
|
|
},
|
|
{
|
|
"loss": 0.48456764221191406,
|
|
"grad_norm": 0.2109375,
|
|
"learning_rate": 4.4111665465264466e-07,
|
|
"entropy": 0.4836878172556559,
|
|
"num_tokens": 84234477.0,
|
|
"mean_token_accuracy": 0.8962146702408791,
|
|
"epoch": 2.7441502166586424,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"eval_loss": 0.504546046257019,
|
|
"eval_runtime": 121.2203,
|
|
"eval_samples_per_second": 111.483,
|
|
"eval_steps_per_second": 18.586,
|
|
"eval_entropy": 0.4879270561181647,
|
|
"eval_num_tokens": 84234477.0,
|
|
"eval_mean_token_accuracy": 0.8934456505248455,
|
|
"epoch": 2.7441502166586424,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"loss": 0.48081619262695313,
|
|
"grad_norm": 0.2060546875,
|
|
"learning_rate": 3.478091426638763e-07,
|
|
"entropy": 0.4814382904271285,
|
|
"num_tokens": 85120633.0,
|
|
"mean_token_accuracy": 0.8969470235705376,
|
|
"epoch": 2.77303803562831,
|
|
"step": 9600
|
|
},
|
|
{
|
|
"loss": 0.4832990264892578,
|
|
"grad_norm": 0.20703125,
|
|
"learning_rate": 2.654044866708905e-07,
|
|
"entropy": 0.4842444409926732,
|
|
"num_tokens": 86007778.0,
|
|
"mean_token_accuracy": 0.8965607133507728,
|
|
"epoch": 2.801925854597978,
|
|
"step": 9700
|
|
},
|
|
{
|
|
"loss": 0.4815947341918945,
|
|
"grad_norm": 0.2255859375,
|
|
"learning_rate": 1.939957687636451e-07,
|
|
"entropy": 0.48316286092003186,
|
|
"num_tokens": 86893679.0,
|
|
"mean_token_accuracy": 0.8967393870155017,
|
|
"epoch": 2.8308136735676457,
|
|
"step": 9800
|
|
},
|
|
{
|
|
"loss": 0.4858028793334961,
|
|
"grad_norm": 0.2041015625,
|
|
"learning_rate": 1.336636503152622e-07,
|
|
"entropy": 0.4849201820790768,
|
|
"num_tokens": 87782234.0,
|
|
"mean_token_accuracy": 0.8959653866291046,
|
|
"epoch": 2.8597014925373134,
|
|
"step": 9900
|
|
},
|
|
{
|
|
"loss": 0.48418357849121096,
|
|
"grad_norm": 0.20703125,
|
|
"learning_rate": 8.447628086910242e-08,
|
|
"entropy": 0.48366599187254905,
|
|
"num_tokens": 88669239.0,
|
|
"mean_token_accuracy": 0.8961514661709468,
|
|
"epoch": 2.888589311506981,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"eval_loss": 0.5046224594116211,
|
|
"eval_runtime": 120.9979,
|
|
"eval_samples_per_second": 111.688,
|
|
"eval_steps_per_second": 18.62,
|
|
"eval_entropy": 0.4881912901223738,
|
|
"eval_num_tokens": 88669239.0,
|
|
"eval_mean_token_accuracy": 0.8934334994050275,
|
|
"epoch": 2.888589311506981,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"loss": 0.4841903305053711,
|
|
"grad_norm": 0.21875,
|
|
"learning_rate": 4.648922115887078e-08,
|
|
"entropy": 0.48405092969536784,
|
|
"num_tokens": 89555716.0,
|
|
"mean_token_accuracy": 0.8961480244000752,
|
|
"epoch": 2.917477130476649,
|
|
"step": 10100
|
|
},
|
|
{
|
|
"loss": 0.4839943313598633,
|
|
"grad_norm": 0.2314453125,
|
|
"learning_rate": 1.9745380348696887e-08,
|
|
"entropy": 0.48480835517247517,
|
|
"num_tokens": 90443198.0,
|
|
"mean_token_accuracy": 0.8965321667989095,
|
|
"epoch": 2.9463649494463167,
|
|
"step": 10200
|
|
},
|
|
{
|
|
"loss": 0.48620803833007814,
|
|
"grad_norm": 0.2158203125,
|
|
"learning_rate": 4.274967564099619e-09,
|
|
"entropy": 0.48755290483434993,
|
|
"num_tokens": 91330106.0,
|
|
"mean_token_accuracy": 0.895567223628362,
|
|
"epoch": 2.9752527684159844,
|
|
"step": 10300
|
|
},
|
|
{
|
|
"train_runtime": 17442.4557,
|
|
"train_samples_per_second": 21.433,
|
|
"train_steps_per_second": 0.595,
|
|
"total_flos": 2.0076201569253059e+18,
|
|
"train_loss": 0.5468513610710557,
|
|
"entropy": 0.4824813495930067,
|
|
"num_tokens": 92088141.0,
|
|
"mean_token_accuracy": 0.8962666590621963,
|
|
"epoch": 3.0,
|
|
"step": 10386
|
|
}
|
|
] |