3369 lines
82 KiB
JSON
3369 lines
82 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.0,
|
|
"eval_steps": 600,
|
|
"global_step": 4677,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0021381227282446015,
|
|
"grad_norm": 13.399383544921875,
|
|
"learning_rate": 3.846153846153847e-07,
|
|
"loss": 3.2323,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.004276245456489203,
|
|
"grad_norm": 12.956534385681152,
|
|
"learning_rate": 8.11965811965812e-07,
|
|
"loss": 3.2089,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.006414368184733804,
|
|
"grad_norm": 6.201784610748291,
|
|
"learning_rate": 1.2393162393162394e-06,
|
|
"loss": 2.8859,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.008552490912978406,
|
|
"grad_norm": 5.795752048492432,
|
|
"learning_rate": 1.6666666666666667e-06,
|
|
"loss": 2.469,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.010690613641223007,
|
|
"grad_norm": 4.1530351638793945,
|
|
"learning_rate": 2.094017094017094e-06,
|
|
"loss": 2.0068,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.012828736369467608,
|
|
"grad_norm": 4.0573530197143555,
|
|
"learning_rate": 2.5213675213675216e-06,
|
|
"loss": 1.5318,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.014966859097712209,
|
|
"grad_norm": 9.95679759979248,
|
|
"learning_rate": 2.948717948717949e-06,
|
|
"loss": 1.073,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.01710498182595681,
|
|
"grad_norm": 2.138951539993286,
|
|
"learning_rate": 3.3760683760683765e-06,
|
|
"loss": 0.7936,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.01924310455420141,
|
|
"grad_norm": 1.6260210275650024,
|
|
"learning_rate": 3.8034188034188036e-06,
|
|
"loss": 0.6121,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.021381227282446014,
|
|
"grad_norm": 3.553804636001587,
|
|
"learning_rate": 4.230769230769231e-06,
|
|
"loss": 0.4587,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.023519350010690613,
|
|
"grad_norm": 4.238368988037109,
|
|
"learning_rate": 4.658119658119659e-06,
|
|
"loss": 0.3611,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.025657472738935216,
|
|
"grad_norm": 3.7698726654052734,
|
|
"learning_rate": 5.085470085470086e-06,
|
|
"loss": 0.3285,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.027795595467179815,
|
|
"grad_norm": 6.797550201416016,
|
|
"learning_rate": 5.512820512820514e-06,
|
|
"loss": 0.2798,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.029933718195424418,
|
|
"grad_norm": 4.188291549682617,
|
|
"learning_rate": 5.940170940170941e-06,
|
|
"loss": 0.2401,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.03207184092366902,
|
|
"grad_norm": 0.8605281114578247,
|
|
"learning_rate": 6.367521367521368e-06,
|
|
"loss": 0.2354,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.03420996365191362,
|
|
"grad_norm": 1.5260062217712402,
|
|
"learning_rate": 6.794871794871796e-06,
|
|
"loss": 0.1918,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.03634808638015822,
|
|
"grad_norm": 1.3726520538330078,
|
|
"learning_rate": 7.222222222222223e-06,
|
|
"loss": 0.1818,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.03848620910840282,
|
|
"grad_norm": 1.8705629110336304,
|
|
"learning_rate": 7.649572649572649e-06,
|
|
"loss": 0.153,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.040624331836647425,
|
|
"grad_norm": 2.4513299465179443,
|
|
"learning_rate": 8.076923076923077e-06,
|
|
"loss": 0.154,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.04276245456489203,
|
|
"grad_norm": 0.2777731418609619,
|
|
"learning_rate": 8.504273504273505e-06,
|
|
"loss": 0.1415,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.04490057729313662,
|
|
"grad_norm": 0.23002663254737854,
|
|
"learning_rate": 8.931623931623933e-06,
|
|
"loss": 0.1326,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.047038700021381226,
|
|
"grad_norm": 1.1118742227554321,
|
|
"learning_rate": 9.358974358974359e-06,
|
|
"loss": 0.126,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.04917682274962583,
|
|
"grad_norm": 1.368764042854309,
|
|
"learning_rate": 9.786324786324787e-06,
|
|
"loss": 0.1286,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.05131494547787043,
|
|
"grad_norm": 0.519333004951477,
|
|
"learning_rate": 9.999968751679245e-06,
|
|
"loss": 0.1195,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.053453068206115034,
|
|
"grad_norm": 0.5977477431297302,
|
|
"learning_rate": 9.999718767456692e-06,
|
|
"loss": 0.1206,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.05559119093435963,
|
|
"grad_norm": 0.4855315089225769,
|
|
"learning_rate": 9.999218811510088e-06,
|
|
"loss": 0.1103,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.05772931366260423,
|
|
"grad_norm": 0.3158447742462158,
|
|
"learning_rate": 9.998468908835808e-06,
|
|
"loss": 0.1089,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.059867436390848836,
|
|
"grad_norm": 0.25278440117836,
|
|
"learning_rate": 9.997469096926852e-06,
|
|
"loss": 0.1089,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.06200555911909344,
|
|
"grad_norm": 0.25526583194732666,
|
|
"learning_rate": 9.996219425770975e-06,
|
|
"loss": 0.1024,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.06414368184733804,
|
|
"grad_norm": 0.7181093692779541,
|
|
"learning_rate": 9.994719957848182e-06,
|
|
"loss": 0.1004,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.06628180457558264,
|
|
"grad_norm": 0.5958463549613953,
|
|
"learning_rate": 9.992970768127605e-06,
|
|
"loss": 0.0957,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.06841992730382725,
|
|
"grad_norm": 0.8600453734397888,
|
|
"learning_rate": 9.990971944063758e-06,
|
|
"loss": 0.1029,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.07055805003207184,
|
|
"grad_norm": 0.267674058675766,
|
|
"learning_rate": 9.98872358559216e-06,
|
|
"loss": 0.0985,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.07269617276031644,
|
|
"grad_norm": 2.607964038848877,
|
|
"learning_rate": 9.986225805124345e-06,
|
|
"loss": 0.0921,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.07483429548856105,
|
|
"grad_norm": 0.9426595568656921,
|
|
"learning_rate": 9.983478727542233e-06,
|
|
"loss": 0.089,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.07697241821680564,
|
|
"grad_norm": 0.5454517602920532,
|
|
"learning_rate": 9.980482490191895e-06,
|
|
"loss": 0.0898,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.07911054094505024,
|
|
"grad_norm": 1.0879571437835693,
|
|
"learning_rate": 9.977237242876677e-06,
|
|
"loss": 0.0932,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.08124866367329485,
|
|
"grad_norm": 0.4590705335140228,
|
|
"learning_rate": 9.973743147849721e-06,
|
|
"loss": 0.0884,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.08338678640153944,
|
|
"grad_norm": 0.5434951186180115,
|
|
"learning_rate": 9.970000379805843e-06,
|
|
"loss": 0.0936,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.08552490912978405,
|
|
"grad_norm": 5.244622707366943,
|
|
"learning_rate": 9.966009125872806e-06,
|
|
"loss": 0.0892,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.08766303185802865,
|
|
"grad_norm": 0.16936945915222168,
|
|
"learning_rate": 9.96176958560196e-06,
|
|
"loss": 0.0872,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.08980115458627325,
|
|
"grad_norm": 0.4548771381378174,
|
|
"learning_rate": 9.957281970958264e-06,
|
|
"loss": 0.0879,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.09193927731451786,
|
|
"grad_norm": 1.1364250183105469,
|
|
"learning_rate": 9.952546506309691e-06,
|
|
"loss": 0.0869,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.09407740004276245,
|
|
"grad_norm": 0.7596004605293274,
|
|
"learning_rate": 9.94756342841601e-06,
|
|
"loss": 0.0873,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.09621552277100706,
|
|
"grad_norm": 0.2505151331424713,
|
|
"learning_rate": 9.94233298641695e-06,
|
|
"loss": 0.088,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.09835364549925166,
|
|
"grad_norm": 0.16528713703155518,
|
|
"learning_rate": 9.936855441819744e-06,
|
|
"loss": 0.0834,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.10049176822749625,
|
|
"grad_norm": 0.43566420674324036,
|
|
"learning_rate": 9.931131068486045e-06,
|
|
"loss": 0.0808,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.10262989095574086,
|
|
"grad_norm": 0.24824728071689606,
|
|
"learning_rate": 9.925160152618246e-06,
|
|
"loss": 0.0871,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.10476801368398546,
|
|
"grad_norm": 0.47349920868873596,
|
|
"learning_rate": 9.918942992745161e-06,
|
|
"loss": 0.0827,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.10690613641223007,
|
|
"grad_norm": 0.30691832304000854,
|
|
"learning_rate": 9.912479899707117e-06,
|
|
"loss": 0.0834,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.10904425914047466,
|
|
"grad_norm": 0.1434836983680725,
|
|
"learning_rate": 9.905771196640384e-06,
|
|
"loss": 0.0811,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.11118238186871926,
|
|
"grad_norm": 0.38805854320526123,
|
|
"learning_rate": 9.898817218961043e-06,
|
|
"loss": 0.0805,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.11332050459696387,
|
|
"grad_norm": 0.9806022644042969,
|
|
"learning_rate": 9.89161831434821e-06,
|
|
"loss": 0.0832,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.11545862732520847,
|
|
"grad_norm": 0.4095519185066223,
|
|
"learning_rate": 9.88417484272665e-06,
|
|
"loss": 0.0821,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.11759675005345306,
|
|
"grad_norm": 0.14556068181991577,
|
|
"learning_rate": 9.87648717624878e-06,
|
|
"loss": 0.0813,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.11973487278169767,
|
|
"grad_norm": 0.1319616138935089,
|
|
"learning_rate": 9.868555699276065e-06,
|
|
"loss": 0.0783,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.12187299550994227,
|
|
"grad_norm": 0.34334319829940796,
|
|
"learning_rate": 9.860380808359808e-06,
|
|
"loss": 0.0812,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.12401111823818688,
|
|
"grad_norm": 0.28321507573127747,
|
|
"learning_rate": 9.851962912221315e-06,
|
|
"loss": 0.0833,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.1261492409664315,
|
|
"grad_norm": 0.28973206877708435,
|
|
"learning_rate": 9.843302431731456e-06,
|
|
"loss": 0.0781,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.12828736369467608,
|
|
"grad_norm": 1.1749194860458374,
|
|
"learning_rate": 9.834399799889637e-06,
|
|
"loss": 0.0843,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.12828736369467608,
|
|
"eval_loss": 0.08129256218671799,
|
|
"eval_runtime": 471.6671,
|
|
"eval_samples_per_second": 4.906,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.13042548642292068,
|
|
"grad_norm": 0.261525422334671,
|
|
"learning_rate": 9.825255461802137e-06,
|
|
"loss": 0.0819,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.13256360915116527,
|
|
"grad_norm": 0.3438051640987396,
|
|
"learning_rate": 9.815869874659866e-06,
|
|
"loss": 0.0823,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.13470173187940987,
|
|
"grad_norm": 0.34199830889701843,
|
|
"learning_rate": 9.806243507715494e-06,
|
|
"loss": 0.0808,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.1368398546076545,
|
|
"grad_norm": 0.22008945047855377,
|
|
"learning_rate": 9.796376842260004e-06,
|
|
"loss": 0.0774,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.1389779773358991,
|
|
"grad_norm": 0.1826457530260086,
|
|
"learning_rate": 9.786270371598613e-06,
|
|
"loss": 0.0748,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.14111610006414368,
|
|
"grad_norm": 0.15165981650352478,
|
|
"learning_rate": 9.775924601026127e-06,
|
|
"loss": 0.0782,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.14325422279238828,
|
|
"grad_norm": 0.3881567120552063,
|
|
"learning_rate": 9.765340047801656e-06,
|
|
"loss": 0.0764,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.14539234552063288,
|
|
"grad_norm": 0.5628389120101929,
|
|
"learning_rate": 9.754517241122771e-06,
|
|
"loss": 0.0774,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.14753046824887747,
|
|
"grad_norm": 0.18720681965351105,
|
|
"learning_rate": 9.743456722099039e-06,
|
|
"loss": 0.0779,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.1496685909771221,
|
|
"grad_norm": 0.21351012587547302,
|
|
"learning_rate": 9.732159043724963e-06,
|
|
"loss": 0.0782,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.1518067137053667,
|
|
"grad_norm": 0.31227338314056396,
|
|
"learning_rate": 9.720624770852341e-06,
|
|
"loss": 0.077,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.1539448364336113,
|
|
"grad_norm": 0.18778669834136963,
|
|
"learning_rate": 9.70885448016203e-06,
|
|
"loss": 0.0751,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.15608295916185588,
|
|
"grad_norm": 0.2396603375673294,
|
|
"learning_rate": 9.696848760135093e-06,
|
|
"loss": 0.0776,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.15822108189010048,
|
|
"grad_norm": 0.24791832268238068,
|
|
"learning_rate": 9.684608211023406e-06,
|
|
"loss": 0.0729,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.1603592046183451,
|
|
"grad_norm": 0.5819320678710938,
|
|
"learning_rate": 9.672133444819619e-06,
|
|
"loss": 0.0738,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.1624973273465897,
|
|
"grad_norm": 0.2177390158176422,
|
|
"learning_rate": 9.659425085226581e-06,
|
|
"loss": 0.0789,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.1646354500748343,
|
|
"grad_norm": 0.14574196934700012,
|
|
"learning_rate": 9.646483767626138e-06,
|
|
"loss": 0.0755,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.1667735728030789,
|
|
"grad_norm": 0.16599872708320618,
|
|
"learning_rate": 9.63331013904738e-06,
|
|
"loss": 0.0758,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.16891169553132349,
|
|
"grad_norm": 0.21911288797855377,
|
|
"learning_rate": 9.619904858134281e-06,
|
|
"loss": 0.0763,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.1710498182595681,
|
|
"grad_norm": 0.12567879259586334,
|
|
"learning_rate": 9.606268595112776e-06,
|
|
"loss": 0.0752,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.1731879409878127,
|
|
"grad_norm": 0.13315202295780182,
|
|
"learning_rate": 9.59240203175725e-06,
|
|
"loss": 0.0727,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.1753260637160573,
|
|
"grad_norm": 0.4594784379005432,
|
|
"learning_rate": 9.57830586135644e-06,
|
|
"loss": 0.076,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.1774641864443019,
|
|
"grad_norm": 0.15148596465587616,
|
|
"learning_rate": 9.5639807886788e-06,
|
|
"loss": 0.0766,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.1796023091725465,
|
|
"grad_norm": 0.5493207573890686,
|
|
"learning_rate": 9.549427529937233e-06,
|
|
"loss": 0.0769,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.18174043190079112,
|
|
"grad_norm": 0.1707722693681717,
|
|
"learning_rate": 9.534646812753301e-06,
|
|
"loss": 0.0733,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.1838785546290357,
|
|
"grad_norm": 0.15772108733654022,
|
|
"learning_rate": 9.519639376120841e-06,
|
|
"loss": 0.0767,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.1860166773572803,
|
|
"grad_norm": 0.36901381611824036,
|
|
"learning_rate": 9.504405970369017e-06,
|
|
"loss": 0.0767,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.1881548000855249,
|
|
"grad_norm": 0.21110327541828156,
|
|
"learning_rate": 9.488947357124812e-06,
|
|
"loss": 0.0749,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.1902929228137695,
|
|
"grad_norm": 0.1756543517112732,
|
|
"learning_rate": 9.473264309274934e-06,
|
|
"loss": 0.0747,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.19243104554201412,
|
|
"grad_norm": 0.19304226338863373,
|
|
"learning_rate": 9.45735761092719e-06,
|
|
"loss": 0.0722,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.19456916827025872,
|
|
"grad_norm": 0.14899924397468567,
|
|
"learning_rate": 9.441228057371275e-06,
|
|
"loss": 0.0722,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.19670729099850331,
|
|
"grad_norm": 0.24590201675891876,
|
|
"learning_rate": 9.42487645503901e-06,
|
|
"loss": 0.0753,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.1988454137267479,
|
|
"grad_norm": 0.2768633961677551,
|
|
"learning_rate": 9.408303621464024e-06,
|
|
"loss": 0.0738,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.2009835364549925,
|
|
"grad_norm": 0.44510579109191895,
|
|
"learning_rate": 9.391510385240876e-06,
|
|
"loss": 0.0725,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.20312165918323713,
|
|
"grad_norm": 0.15240268409252167,
|
|
"learning_rate": 9.374497585983635e-06,
|
|
"loss": 0.0748,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.20525978191148173,
|
|
"grad_norm": 0.307160884141922,
|
|
"learning_rate": 9.3572660742839e-06,
|
|
"loss": 0.0734,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.20739790463972632,
|
|
"grad_norm": 0.19927459955215454,
|
|
"learning_rate": 9.339816711668262e-06,
|
|
"loss": 0.0723,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.20953602736797092,
|
|
"grad_norm": 0.12397543340921402,
|
|
"learning_rate": 9.322150370555242e-06,
|
|
"loss": 0.0728,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.2116741500962155,
|
|
"grad_norm": 0.20738820731639862,
|
|
"learning_rate": 9.304267934211672e-06,
|
|
"loss": 0.0749,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.21381227282446014,
|
|
"grad_norm": 0.14943927526474,
|
|
"learning_rate": 9.28617029670853e-06,
|
|
"loss": 0.0717,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.21595039555270473,
|
|
"grad_norm": 0.16114865243434906,
|
|
"learning_rate": 9.267858362876238e-06,
|
|
"loss": 0.0714,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.21808851828094933,
|
|
"grad_norm": 0.14569199085235596,
|
|
"learning_rate": 9.249333048259426e-06,
|
|
"loss": 0.0751,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.22022664100919392,
|
|
"grad_norm": 0.46952638030052185,
|
|
"learning_rate": 9.230595279071156e-06,
|
|
"loss": 0.0712,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.22236476373743852,
|
|
"grad_norm": 0.1461559236049652,
|
|
"learning_rate": 9.211645992146618e-06,
|
|
"loss": 0.0716,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.22450288646568314,
|
|
"grad_norm": 0.13023297488689423,
|
|
"learning_rate": 9.192486134896282e-06,
|
|
"loss": 0.0696,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.22664100919392774,
|
|
"grad_norm": 0.3277701437473297,
|
|
"learning_rate": 9.17311666525854e-06,
|
|
"loss": 0.0694,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.22877913192217234,
|
|
"grad_norm": 0.11651341617107391,
|
|
"learning_rate": 9.153538551651808e-06,
|
|
"loss": 0.0681,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.23091725465041693,
|
|
"grad_norm": 0.14407022297382355,
|
|
"learning_rate": 9.133752772926102e-06,
|
|
"loss": 0.0717,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.23305537737866153,
|
|
"grad_norm": 0.14212338626384735,
|
|
"learning_rate": 9.113760318314109e-06,
|
|
"loss": 0.0701,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.23519350010690612,
|
|
"grad_norm": 0.1330636888742447,
|
|
"learning_rate": 9.09356218738172e-06,
|
|
"loss": 0.0714,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.23733162283515075,
|
|
"grad_norm": 0.19741296768188477,
|
|
"learning_rate": 9.073159389978056e-06,
|
|
"loss": 0.0704,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.23946974556339534,
|
|
"grad_norm": 0.1623307764530182,
|
|
"learning_rate": 9.052552946184985e-06,
|
|
"loss": 0.0682,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.24160786829163994,
|
|
"grad_norm": 0.11294026672840118,
|
|
"learning_rate": 9.031743886266109e-06,
|
|
"loss": 0.0686,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.24374599101988453,
|
|
"grad_norm": 0.2032402753829956,
|
|
"learning_rate": 9.010733250615264e-06,
|
|
"loss": 0.0685,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.24588411374812913,
|
|
"grad_norm": 0.14151214063167572,
|
|
"learning_rate": 8.989522089704502e-06,
|
|
"loss": 0.0665,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.24802223647637375,
|
|
"grad_norm": 0.16347168385982513,
|
|
"learning_rate": 8.96811146403156e-06,
|
|
"loss": 0.0683,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.2501603592046183,
|
|
"grad_norm": 0.2190970480442047,
|
|
"learning_rate": 8.946502444066854e-06,
|
|
"loss": 0.0702,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.252298481932863,
|
|
"grad_norm": 0.1208793967962265,
|
|
"learning_rate": 8.924696110199944e-06,
|
|
"loss": 0.0687,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.25443660466110757,
|
|
"grad_norm": 0.16621464490890503,
|
|
"learning_rate": 8.902693552685532e-06,
|
|
"loss": 0.0705,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.25657472738935216,
|
|
"grad_norm": 0.468980997800827,
|
|
"learning_rate": 8.880495871588934e-06,
|
|
"loss": 0.0712,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.25657472738935216,
|
|
"eval_loss": 0.0711250901222229,
|
|
"eval_runtime": 472.4294,
|
|
"eval_samples_per_second": 4.898,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.25871285011759676,
|
|
"grad_norm": 0.145741268992424,
|
|
"learning_rate": 8.858104176731102e-06,
|
|
"loss": 0.0663,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.26085097284584136,
|
|
"grad_norm": 0.26027005910873413,
|
|
"learning_rate": 8.835519587633116e-06,
|
|
"loss": 0.0683,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.26298909557408595,
|
|
"grad_norm": 0.1468413919210434,
|
|
"learning_rate": 8.812743233460224e-06,
|
|
"loss": 0.07,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 0.26512721830233055,
|
|
"grad_norm": 0.12431179732084274,
|
|
"learning_rate": 8.789776252965378e-06,
|
|
"loss": 0.0712,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 0.26726534103057514,
|
|
"grad_norm": 0.10897620022296906,
|
|
"learning_rate": 8.76661979443231e-06,
|
|
"loss": 0.0706,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.26940346375881974,
|
|
"grad_norm": 0.4356347620487213,
|
|
"learning_rate": 8.74327501561811e-06,
|
|
"loss": 0.071,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.27154158648706433,
|
|
"grad_norm": 0.13456492125988007,
|
|
"learning_rate": 8.71974308369535e-06,
|
|
"loss": 0.075,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 0.273679709215309,
|
|
"grad_norm": 0.15002375841140747,
|
|
"learning_rate": 8.696025175193725e-06,
|
|
"loss": 0.0683,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 0.2758178319435536,
|
|
"grad_norm": 0.25080016255378723,
|
|
"learning_rate": 8.672122475941228e-06,
|
|
"loss": 0.0669,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 0.2779559546717982,
|
|
"grad_norm": 0.10547586530447006,
|
|
"learning_rate": 8.648036181004867e-06,
|
|
"loss": 0.0681,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.2800940774000428,
|
|
"grad_norm": 0.3222697079181671,
|
|
"learning_rate": 8.62376749463091e-06,
|
|
"loss": 0.0689,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 0.28223220012828737,
|
|
"grad_norm": 0.16391637921333313,
|
|
"learning_rate": 8.59931763018468e-06,
|
|
"loss": 0.0658,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.28437032285653197,
|
|
"grad_norm": 0.11527778953313828,
|
|
"learning_rate": 8.574687810089887e-06,
|
|
"loss": 0.0682,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 0.28650844558477656,
|
|
"grad_norm": 0.21346218883991241,
|
|
"learning_rate": 8.549879265767514e-06,
|
|
"loss": 0.0695,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 0.28864656831302116,
|
|
"grad_norm": 0.2827841341495514,
|
|
"learning_rate": 8.524893237574244e-06,
|
|
"loss": 0.0683,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 0.29078469104126575,
|
|
"grad_norm": 0.21771486103534698,
|
|
"learning_rate": 8.499730974740452e-06,
|
|
"loss": 0.0679,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 0.29292281376951035,
|
|
"grad_norm": 0.10718706995248795,
|
|
"learning_rate": 8.47439373530774e-06,
|
|
"loss": 0.0696,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 0.29506093649775494,
|
|
"grad_norm": 0.1284645050764084,
|
|
"learning_rate": 8.44888278606605e-06,
|
|
"loss": 0.069,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 0.2971990592259996,
|
|
"grad_norm": 0.09697470813989639,
|
|
"learning_rate": 8.423199402490314e-06,
|
|
"loss": 0.067,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 0.2993371819542442,
|
|
"grad_norm": 0.16259442269802094,
|
|
"learning_rate": 8.39734486867669e-06,
|
|
"loss": 0.0676,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.3014753046824888,
|
|
"grad_norm": 0.12434723228216171,
|
|
"learning_rate": 8.371320477278363e-06,
|
|
"loss": 0.0682,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 0.3036134274107334,
|
|
"grad_norm": 0.11663969606161118,
|
|
"learning_rate": 8.345127529440921e-06,
|
|
"loss": 0.0685,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 0.305751550138978,
|
|
"grad_norm": 0.15766265988349915,
|
|
"learning_rate": 8.318767334737286e-06,
|
|
"loss": 0.067,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 0.3078896728672226,
|
|
"grad_norm": 0.1343747079372406,
|
|
"learning_rate": 8.292241211102246e-06,
|
|
"loss": 0.0685,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.31002779559546717,
|
|
"grad_norm": 0.801880419254303,
|
|
"learning_rate": 8.265550484766574e-06,
|
|
"loss": 0.0721,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 0.31216591832371177,
|
|
"grad_norm": 0.40033191442489624,
|
|
"learning_rate": 8.238696490190701e-06,
|
|
"loss": 0.0668,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 0.31430404105195636,
|
|
"grad_norm": 0.10282248258590698,
|
|
"learning_rate": 8.211680569998011e-06,
|
|
"loss": 0.0699,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 0.31644216378020096,
|
|
"grad_norm": 0.128205344080925,
|
|
"learning_rate": 8.184504074907706e-06,
|
|
"loss": 0.0666,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 0.3185802865084456,
|
|
"grad_norm": 0.2041744887828827,
|
|
"learning_rate": 8.157168363667278e-06,
|
|
"loss": 0.0652,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 0.3207184092366902,
|
|
"grad_norm": 0.13964834809303284,
|
|
"learning_rate": 8.129674802984573e-06,
|
|
"loss": 0.0676,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.3228565319649348,
|
|
"grad_norm": 0.2170010209083557,
|
|
"learning_rate": 8.102024767459457e-06,
|
|
"loss": 0.0663,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 0.3249946546931794,
|
|
"grad_norm": 0.1943911761045456,
|
|
"learning_rate": 8.074219639515101e-06,
|
|
"loss": 0.0692,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 0.327132777421424,
|
|
"grad_norm": 0.16795021295547485,
|
|
"learning_rate": 8.046260809328848e-06,
|
|
"loss": 0.0675,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 0.3292709001496686,
|
|
"grad_norm": 0.11824577301740646,
|
|
"learning_rate": 8.018149674762723e-06,
|
|
"loss": 0.066,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 0.3314090228779132,
|
|
"grad_norm": 0.11008622497320175,
|
|
"learning_rate": 7.98988764129353e-06,
|
|
"loss": 0.0687,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 0.3335471456061578,
|
|
"grad_norm": 0.1290241926908493,
|
|
"learning_rate": 7.961476121942598e-06,
|
|
"loss": 0.0655,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.3356852683344024,
|
|
"grad_norm": 0.10323217511177063,
|
|
"learning_rate": 7.932916537205112e-06,
|
|
"loss": 0.0662,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 0.33782339106264697,
|
|
"grad_norm": 0.14995551109313965,
|
|
"learning_rate": 7.904210314979122e-06,
|
|
"loss": 0.0687,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 0.3399615137908916,
|
|
"grad_norm": 0.6756893992424011,
|
|
"learning_rate": 7.875358890494122e-06,
|
|
"loss": 0.0674,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 0.3420996365191362,
|
|
"grad_norm": 0.10807085037231445,
|
|
"learning_rate": 7.846363706239312e-06,
|
|
"loss": 0.0686,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.3442377592473808,
|
|
"grad_norm": 0.1266680657863617,
|
|
"learning_rate": 7.817226211891468e-06,
|
|
"loss": 0.0684,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 0.3463758819756254,
|
|
"grad_norm": 0.17101961374282837,
|
|
"learning_rate": 7.787947864242474e-06,
|
|
"loss": 0.0658,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 0.34851400470387,
|
|
"grad_norm": 0.10010931640863419,
|
|
"learning_rate": 7.75853012712647e-06,
|
|
"loss": 0.0687,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 0.3506521274321146,
|
|
"grad_norm": 0.0984787791967392,
|
|
"learning_rate": 7.728974471346678e-06,
|
|
"loss": 0.0682,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 0.3527902501603592,
|
|
"grad_norm": 0.15190352499485016,
|
|
"learning_rate": 7.699282374601857e-06,
|
|
"loss": 0.0665,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 0.3549283728886038,
|
|
"grad_norm": 0.10777094215154648,
|
|
"learning_rate": 7.66945532141243e-06,
|
|
"loss": 0.0671,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 0.3570664956168484,
|
|
"grad_norm": 0.11840742081403732,
|
|
"learning_rate": 7.639494803046261e-06,
|
|
"loss": 0.0642,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 0.359204618345093,
|
|
"grad_norm": 0.12402522563934326,
|
|
"learning_rate": 7.609402317444086e-06,
|
|
"loss": 0.0652,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 0.36134274107333764,
|
|
"grad_norm": 0.1083909198641777,
|
|
"learning_rate": 7.579179369144631e-06,
|
|
"loss": 0.0654,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 0.36348086380158223,
|
|
"grad_norm": 0.11909038573503494,
|
|
"learning_rate": 7.5488274692093874e-06,
|
|
"loss": 0.0657,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.36561898652982683,
|
|
"grad_norm": 0.12170397490262985,
|
|
"learning_rate": 7.518348135147063e-06,
|
|
"loss": 0.0677,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 0.3677571092580714,
|
|
"grad_norm": 0.0956568717956543,
|
|
"learning_rate": 7.487742890837704e-06,
|
|
"loss": 0.0666,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 0.369895231986316,
|
|
"grad_norm": 0.09268027544021606,
|
|
"learning_rate": 7.457013266456517e-06,
|
|
"loss": 0.065,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 0.3720333547145606,
|
|
"grad_norm": 0.09884276241064072,
|
|
"learning_rate": 7.426160798397355e-06,
|
|
"loss": 0.0655,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 0.3741714774428052,
|
|
"grad_norm": 0.13707584142684937,
|
|
"learning_rate": 7.395187029195906e-06,
|
|
"loss": 0.0633,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 0.3763096001710498,
|
|
"grad_norm": 0.18375861644744873,
|
|
"learning_rate": 7.364093507452572e-06,
|
|
"loss": 0.0666,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 0.3784477228992944,
|
|
"grad_norm": 0.09120248258113861,
|
|
"learning_rate": 7.33288178775504e-06,
|
|
"loss": 0.0663,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 0.380585845627539,
|
|
"grad_norm": 0.1667109578847885,
|
|
"learning_rate": 7.301553430600559e-06,
|
|
"loss": 0.0647,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 0.3827239683557836,
|
|
"grad_norm": 0.17563393712043762,
|
|
"learning_rate": 7.270110002317921e-06,
|
|
"loss": 0.0646,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 0.38486209108402825,
|
|
"grad_norm": 0.16917386651039124,
|
|
"learning_rate": 7.238553074989143e-06,
|
|
"loss": 0.0654,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.38486209108402825,
|
|
"eval_loss": 0.0673459991812706,
|
|
"eval_runtime": 472.3199,
|
|
"eval_samples_per_second": 4.899,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.38700021381227284,
|
|
"grad_norm": 0.11774443835020065,
|
|
"learning_rate": 7.206884226370875e-06,
|
|
"loss": 0.0655,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 0.38913833654051744,
|
|
"grad_norm": 0.119617760181427,
|
|
"learning_rate": 7.175105039815515e-06,
|
|
"loss": 0.0639,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 0.39127645926876203,
|
|
"grad_norm": 0.12271067500114441,
|
|
"learning_rate": 7.143217104192041e-06,
|
|
"loss": 0.0682,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 0.39341458199700663,
|
|
"grad_norm": 0.19674961268901825,
|
|
"learning_rate": 7.111222013806573e-06,
|
|
"loss": 0.0628,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 0.3955527047252512,
|
|
"grad_norm": 0.23792926967144012,
|
|
"learning_rate": 7.07912136832267e-06,
|
|
"loss": 0.0654,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 0.3976908274534958,
|
|
"grad_norm": 0.1446692794561386,
|
|
"learning_rate": 7.0469167726813445e-06,
|
|
"loss": 0.067,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 0.3998289501817404,
|
|
"grad_norm": 0.10155676305294037,
|
|
"learning_rate": 7.014609837020817e-06,
|
|
"loss": 0.0654,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 0.401967072909985,
|
|
"grad_norm": 0.11214682459831238,
|
|
"learning_rate": 6.9822021765960225e-06,
|
|
"loss": 0.065,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 0.4041051956382296,
|
|
"grad_norm": 0.09929853677749634,
|
|
"learning_rate": 6.949695411697848e-06,
|
|
"loss": 0.0656,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 0.40624331836647426,
|
|
"grad_norm": 0.10509800910949707,
|
|
"learning_rate": 6.9170911675721175e-06,
|
|
"loss": 0.0668,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 0.40838144109471886,
|
|
"grad_norm": 0.11406348645687103,
|
|
"learning_rate": 6.884391074338348e-06,
|
|
"loss": 0.0651,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 0.41051956382296345,
|
|
"grad_norm": 0.13327282667160034,
|
|
"learning_rate": 6.851596766908229e-06,
|
|
"loss": 0.0681,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 0.41265768655120805,
|
|
"grad_norm": 0.26162999868392944,
|
|
"learning_rate": 6.818709884903897e-06,
|
|
"loss": 0.0638,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 0.41479580927945264,
|
|
"grad_norm": 0.10129717737436295,
|
|
"learning_rate": 6.785732072575958e-06,
|
|
"loss": 0.0648,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 0.41693393200769724,
|
|
"grad_norm": 0.1475946605205536,
|
|
"learning_rate": 6.752664978721269e-06,
|
|
"loss": 0.0643,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 0.41907205473594183,
|
|
"grad_norm": 0.14560334384441376,
|
|
"learning_rate": 6.719510256600512e-06,
|
|
"loss": 0.0657,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 0.42121017746418643,
|
|
"grad_norm": 0.14020417630672455,
|
|
"learning_rate": 6.686269563855534e-06,
|
|
"loss": 0.0658,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 0.423348300192431,
|
|
"grad_norm": 0.1445358246564865,
|
|
"learning_rate": 6.652944562426469e-06,
|
|
"loss": 0.0654,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 0.4254864229206756,
|
|
"grad_norm": 0.1675933301448822,
|
|
"learning_rate": 6.619536918468643e-06,
|
|
"loss": 0.0621,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 0.4276245456489203,
|
|
"grad_norm": 0.12988397479057312,
|
|
"learning_rate": 6.586048302269277e-06,
|
|
"loss": 0.0637,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.42976266837716487,
|
|
"grad_norm": 0.10703526437282562,
|
|
"learning_rate": 6.5524803881639694e-06,
|
|
"loss": 0.0639,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 0.43190079110540947,
|
|
"grad_norm": 0.10743958503007889,
|
|
"learning_rate": 6.518834854452993e-06,
|
|
"loss": 0.0647,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 0.43403891383365406,
|
|
"grad_norm": 0.09658580273389816,
|
|
"learning_rate": 6.485113383317378e-06,
|
|
"loss": 0.0616,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 0.43617703656189866,
|
|
"grad_norm": 0.12495086342096329,
|
|
"learning_rate": 6.451317660734812e-06,
|
|
"loss": 0.0657,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 0.43831515929014325,
|
|
"grad_norm": 0.14097630977630615,
|
|
"learning_rate": 6.417449376395339e-06,
|
|
"loss": 0.0651,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 0.44045328201838785,
|
|
"grad_norm": 0.09859970957040787,
|
|
"learning_rate": 6.3835102236168885e-06,
|
|
"loss": 0.0634,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 0.44259140474663244,
|
|
"grad_norm": 0.27701902389526367,
|
|
"learning_rate": 6.34950189926061e-06,
|
|
"loss": 0.0653,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 0.44472952747487704,
|
|
"grad_norm": 0.10874440521001816,
|
|
"learning_rate": 6.315426103646036e-06,
|
|
"loss": 0.0654,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 0.44686765020312164,
|
|
"grad_norm": 0.14138060808181763,
|
|
"learning_rate": 6.281284540466067e-06,
|
|
"loss": 0.0645,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 0.4490057729313663,
|
|
"grad_norm": 0.09226495772600174,
|
|
"learning_rate": 6.247078916701797e-06,
|
|
"loss": 0.0635,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 0.4511438956596109,
|
|
"grad_norm": 0.10510533303022385,
|
|
"learning_rate": 6.212810942537167e-06,
|
|
"loss": 0.0609,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 0.4532820183878555,
|
|
"grad_norm": 0.13226036727428436,
|
|
"learning_rate": 6.178482331273462e-06,
|
|
"loss": 0.0631,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 0.4554201411161001,
|
|
"grad_norm": 0.09952764213085175,
|
|
"learning_rate": 6.144094799243647e-06,
|
|
"loss": 0.0664,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 0.45755826384434467,
|
|
"grad_norm": 0.25290119647979736,
|
|
"learning_rate": 6.1096500657265575e-06,
|
|
"loss": 0.0638,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 0.45969638657258927,
|
|
"grad_norm": 0.25737249851226807,
|
|
"learning_rate": 6.075149852860945e-06,
|
|
"loss": 0.0636,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 0.46183450930083386,
|
|
"grad_norm": 0.10331868380308151,
|
|
"learning_rate": 6.040595885559366e-06,
|
|
"loss": 0.0646,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 0.46397263202907846,
|
|
"grad_norm": 0.10511161386966705,
|
|
"learning_rate": 6.005989891421948e-06,
|
|
"loss": 0.0662,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 0.46611075475732305,
|
|
"grad_norm": 0.1610974222421646,
|
|
"learning_rate": 5.971333600650012e-06,
|
|
"loss": 0.0621,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 0.46824887748556765,
|
|
"grad_norm": 0.13542431592941284,
|
|
"learning_rate": 5.936628745959568e-06,
|
|
"loss": 0.0648,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 0.47038700021381225,
|
|
"grad_norm": 0.11100970953702927,
|
|
"learning_rate": 5.901877062494684e-06,
|
|
"loss": 0.0616,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 0.4725251229420569,
|
|
"grad_norm": 0.09847405552864075,
|
|
"learning_rate": 5.867080287740735e-06,
|
|
"loss": 0.0622,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 0.4746632456703015,
|
|
"grad_norm": 0.11069463193416595,
|
|
"learning_rate": 5.832240161437528e-06,
|
|
"loss": 0.0658,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 0.4768013683985461,
|
|
"grad_norm": 0.14424288272857666,
|
|
"learning_rate": 5.797358425492328e-06,
|
|
"loss": 0.0627,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 0.4789394911267907,
|
|
"grad_norm": 0.11425557732582092,
|
|
"learning_rate": 5.762436823892763e-06,
|
|
"loss": 0.0645,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 0.4810776138550353,
|
|
"grad_norm": 0.12212098389863968,
|
|
"learning_rate": 5.727477102619628e-06,
|
|
"loss": 0.0661,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 0.4832157365832799,
|
|
"grad_norm": 0.11695980280637741,
|
|
"learning_rate": 5.692481009559598e-06,
|
|
"loss": 0.0633,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 0.48535385931152447,
|
|
"grad_norm": 0.16033753752708435,
|
|
"learning_rate": 5.657450294417831e-06,
|
|
"loss": 0.068,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 0.48749198203976907,
|
|
"grad_norm": 0.09469865262508392,
|
|
"learning_rate": 5.622386708630488e-06,
|
|
"loss": 0.0657,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 0.48963010476801366,
|
|
"grad_norm": 0.10984878987073898,
|
|
"learning_rate": 5.587292005277176e-06,
|
|
"loss": 0.0617,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 0.49176822749625826,
|
|
"grad_norm": 0.10703036934137344,
|
|
"learning_rate": 5.552167938993286e-06,
|
|
"loss": 0.0641,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 0.4939063502245029,
|
|
"grad_norm": 0.09129951894283295,
|
|
"learning_rate": 5.51701626588227e-06,
|
|
"loss": 0.0648,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 0.4960444729527475,
|
|
"grad_norm": 0.14747264981269836,
|
|
"learning_rate": 5.481838743427852e-06,
|
|
"loss": 0.0617,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 0.4981825956809921,
|
|
"grad_norm": 0.11260967701673508,
|
|
"learning_rate": 5.446637130406141e-06,
|
|
"loss": 0.0631,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 0.5003207184092366,
|
|
"grad_norm": 0.1024189218878746,
|
|
"learning_rate": 5.411413186797709e-06,
|
|
"loss": 0.064,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 0.5024588411374813,
|
|
"grad_norm": 0.16150939464569092,
|
|
"learning_rate": 5.376168673699596e-06,
|
|
"loss": 0.0637,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 0.504596963865726,
|
|
"grad_norm": 0.14528174698352814,
|
|
"learning_rate": 5.340905353237254e-06,
|
|
"loss": 0.0655,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 0.5067350865939705,
|
|
"grad_norm": 0.12370527535676956,
|
|
"learning_rate": 5.305624988476452e-06,
|
|
"loss": 0.0635,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 0.5088732093222151,
|
|
"grad_norm": 0.09441283345222473,
|
|
"learning_rate": 5.270329343335126e-06,
|
|
"loss": 0.0651,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 0.5110113320504597,
|
|
"grad_norm": 0.09483297914266586,
|
|
"learning_rate": 5.235020182495188e-06,
|
|
"loss": 0.0658,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 0.5131494547787043,
|
|
"grad_norm": 0.11624085903167725,
|
|
"learning_rate": 5.199699271314289e-06,
|
|
"loss": 0.0675,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 0.5131494547787043,
|
|
"eval_loss": 0.06467495113611221,
|
|
"eval_runtime": 471.8273,
|
|
"eval_samples_per_second": 4.904,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 0.5152875775069489,
|
|
"grad_norm": 0.1344379037618637,
|
|
"learning_rate": 5.164368375737576e-06,
|
|
"loss": 0.0619,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 0.5174257002351935,
|
|
"grad_norm": 0.09949100762605667,
|
|
"learning_rate": 5.129029262209381e-06,
|
|
"loss": 0.0617,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 0.5195638229634381,
|
|
"grad_norm": 0.11078672856092453,
|
|
"learning_rate": 5.093683697584907e-06,
|
|
"loss": 0.0625,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 0.5217019456916827,
|
|
"grad_norm": 0.15946152806282043,
|
|
"learning_rate": 5.058333449041899e-06,
|
|
"loss": 0.0608,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 0.5238400684199273,
|
|
"grad_norm": 0.09759578853845596,
|
|
"learning_rate": 5.022980283992283e-06,
|
|
"loss": 0.0604,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 0.5259781911481719,
|
|
"grad_norm": 0.10458780080080032,
|
|
"learning_rate": 4.9876259699938e-06,
|
|
"loss": 0.063,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 0.5281163138764166,
|
|
"grad_norm": 0.1019633337855339,
|
|
"learning_rate": 4.952272274661637e-06,
|
|
"loss": 0.0608,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 0.5302544366046611,
|
|
"grad_norm": 0.09208445250988007,
|
|
"learning_rate": 4.916920965580052e-06,
|
|
"loss": 0.0652,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 0.5323925593329057,
|
|
"grad_norm": 0.11165319383144379,
|
|
"learning_rate": 4.881573810213989e-06,
|
|
"loss": 0.0615,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 0.5345306820611503,
|
|
"grad_norm": 0.18430490791797638,
|
|
"learning_rate": 4.8462325758207304e-06,
|
|
"loss": 0.0657,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 0.5366688047893949,
|
|
"grad_norm": 0.157784566283226,
|
|
"learning_rate": 4.810899029361515e-06,
|
|
"loss": 0.0653,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 0.5388069275176395,
|
|
"grad_norm": 0.13744202256202698,
|
|
"learning_rate": 4.775574937413211e-06,
|
|
"loss": 0.0618,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 0.5409450502458841,
|
|
"grad_norm": 0.13207334280014038,
|
|
"learning_rate": 4.740262066079994e-06,
|
|
"loss": 0.0644,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 0.5430831729741287,
|
|
"grad_norm": 0.16908520460128784,
|
|
"learning_rate": 4.70496218090503e-06,
|
|
"loss": 0.0642,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 0.5452212957023733,
|
|
"grad_norm": 0.13128970563411713,
|
|
"learning_rate": 4.669677046782221e-06,
|
|
"loss": 0.0652,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 0.547359418430618,
|
|
"grad_norm": 0.08551183342933655,
|
|
"learning_rate": 4.6344084278679574e-06,
|
|
"loss": 0.065,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 0.5494975411588625,
|
|
"grad_norm": 0.1189018115401268,
|
|
"learning_rate": 4.599158087492913e-06,
|
|
"loss": 0.0619,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 0.5516356638871072,
|
|
"grad_norm": 0.24343594908714294,
|
|
"learning_rate": 4.563927788073893e-06,
|
|
"loss": 0.0625,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 0.5537737866153517,
|
|
"grad_norm": 0.30038872361183167,
|
|
"learning_rate": 4.528719291025706e-06,
|
|
"loss": 0.062,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 0.5559119093435964,
|
|
"grad_norm": 0.08746038377285004,
|
|
"learning_rate": 4.493534356673102e-06,
|
|
"loss": 0.0638,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 0.5580500320718409,
|
|
"grad_norm": 0.09681444615125656,
|
|
"learning_rate": 4.458374744162773e-06,
|
|
"loss": 0.0647,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"epoch": 0.5601881548000855,
|
|
"grad_norm": 0.11077167838811874,
|
|
"learning_rate": 4.423242211375381e-06,
|
|
"loss": 0.0643,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 0.5623262775283301,
|
|
"grad_norm": 0.08863001316785812,
|
|
"learning_rate": 4.388138514837685e-06,
|
|
"loss": 0.0627,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"epoch": 0.5644644002565747,
|
|
"grad_norm": 0.13838346302509308,
|
|
"learning_rate": 4.35306540963471e-06,
|
|
"loss": 0.0622,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 0.5666025229848193,
|
|
"grad_norm": 0.09143807739019394,
|
|
"learning_rate": 4.318024649322001e-06,
|
|
"loss": 0.0627,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"epoch": 0.5687406457130639,
|
|
"grad_norm": 0.19630184769630432,
|
|
"learning_rate": 4.283017985837955e-06,
|
|
"loss": 0.0626,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 0.5708787684413086,
|
|
"grad_norm": 0.10313283652067184,
|
|
"learning_rate": 4.248047169416221e-06,
|
|
"loss": 0.062,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"epoch": 0.5730168911695531,
|
|
"grad_norm": 0.08923624455928802,
|
|
"learning_rate": 4.213113948498194e-06,
|
|
"loss": 0.0626,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 0.5751550138977978,
|
|
"grad_norm": 0.13680312037467957,
|
|
"learning_rate": 4.178220069645608e-06,
|
|
"loss": 0.0648,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"epoch": 0.5772931366260423,
|
|
"grad_norm": 0.164349764585495,
|
|
"learning_rate": 4.143367277453197e-06,
|
|
"loss": 0.0622,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 0.579431259354287,
|
|
"grad_norm": 0.09581846743822098,
|
|
"learning_rate": 4.10855731446149e-06,
|
|
"loss": 0.0637,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"epoch": 0.5815693820825315,
|
|
"grad_norm": 0.1474573314189911,
|
|
"learning_rate": 4.073791921069664e-06,
|
|
"loss": 0.0611,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 0.5837075048107762,
|
|
"grad_norm": 0.09582812339067459,
|
|
"learning_rate": 4.039072835448553e-06,
|
|
"loss": 0.0615,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"epoch": 0.5858456275390207,
|
|
"grad_norm": 0.101468525826931,
|
|
"learning_rate": 4.004401793453731e-06,
|
|
"loss": 0.061,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"epoch": 0.5879837502672653,
|
|
"grad_norm": 0.10648108273744583,
|
|
"learning_rate": 3.969780528538726e-06,
|
|
"loss": 0.0642,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"epoch": 0.5901218729955099,
|
|
"grad_norm": 0.09285979717969894,
|
|
"learning_rate": 3.935210771668357e-06,
|
|
"loss": 0.062,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"epoch": 0.5922599957237545,
|
|
"grad_norm": 0.09097687900066376,
|
|
"learning_rate": 3.900694251232182e-06,
|
|
"loss": 0.0608,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"epoch": 0.5943981184519992,
|
|
"grad_norm": 0.12016556411981583,
|
|
"learning_rate": 3.8662326929580925e-06,
|
|
"loss": 0.0644,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"epoch": 0.5965362411802437,
|
|
"grad_norm": 0.10592561960220337,
|
|
"learning_rate": 3.831827819826027e-06,
|
|
"loss": 0.0619,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"epoch": 0.5986743639084884,
|
|
"grad_norm": 0.09551785886287689,
|
|
"learning_rate": 3.7974813519818288e-06,
|
|
"loss": 0.0629,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 0.6008124866367329,
|
|
"grad_norm": 0.13584552705287933,
|
|
"learning_rate": 3.7631950066512423e-06,
|
|
"loss": 0.0652,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"epoch": 0.6029506093649776,
|
|
"grad_norm": 0.10060502588748932,
|
|
"learning_rate": 3.7289704980540586e-06,
|
|
"loss": 0.0602,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"epoch": 0.6050887320932221,
|
|
"grad_norm": 0.10792689025402069,
|
|
"learning_rate": 3.694809537318402e-06,
|
|
"loss": 0.0635,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"epoch": 0.6072268548214668,
|
|
"grad_norm": 0.1079002246260643,
|
|
"learning_rate": 3.660713832395193e-06,
|
|
"loss": 0.0646,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"epoch": 0.6093649775497113,
|
|
"grad_norm": 0.12554524838924408,
|
|
"learning_rate": 3.626685087972743e-06,
|
|
"loss": 0.0607,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"epoch": 0.611503100277956,
|
|
"grad_norm": 0.09405695647001266,
|
|
"learning_rate": 3.592725005391524e-06,
|
|
"loss": 0.065,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"epoch": 0.6136412230062006,
|
|
"grad_norm": 0.09625021368265152,
|
|
"learning_rate": 3.55883528255912e-06,
|
|
"loss": 0.0621,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"epoch": 0.6157793457344451,
|
|
"grad_norm": 0.1028503030538559,
|
|
"learning_rate": 3.525017613865321e-06,
|
|
"loss": 0.0628,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"epoch": 0.6179174684626898,
|
|
"grad_norm": 0.08910300582647324,
|
|
"learning_rate": 3.491273690097421e-06,
|
|
"loss": 0.0599,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"epoch": 0.6200555911909343,
|
|
"grad_norm": 0.1026788130402565,
|
|
"learning_rate": 3.45760519835567e-06,
|
|
"loss": 0.0612,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 0.622193713919179,
|
|
"grad_norm": 0.09384245425462723,
|
|
"learning_rate": 3.4240138219689343e-06,
|
|
"loss": 0.0625,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"epoch": 0.6243318366474235,
|
|
"grad_norm": 0.09324868768453598,
|
|
"learning_rate": 3.390501240410535e-06,
|
|
"loss": 0.0611,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"epoch": 0.6264699593756682,
|
|
"grad_norm": 0.1346089392900467,
|
|
"learning_rate": 3.3570691292142694e-06,
|
|
"loss": 0.0644,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"epoch": 0.6286080821039127,
|
|
"grad_norm": 0.13233442604541779,
|
|
"learning_rate": 3.3237191598906536e-06,
|
|
"loss": 0.0634,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"epoch": 0.6307462048321574,
|
|
"grad_norm": 0.11263269186019897,
|
|
"learning_rate": 3.2904529998433356e-06,
|
|
"loss": 0.0658,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"epoch": 0.6328843275604019,
|
|
"grad_norm": 0.32706591486930847,
|
|
"learning_rate": 3.2572723122857416e-06,
|
|
"loss": 0.0656,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"epoch": 0.6350224502886466,
|
|
"grad_norm": 0.2275686264038086,
|
|
"learning_rate": 3.224178756157918e-06,
|
|
"loss": 0.0614,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"epoch": 0.6371605730168912,
|
|
"grad_norm": 0.09637604653835297,
|
|
"learning_rate": 3.191173986043583e-06,
|
|
"loss": 0.0607,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"epoch": 0.6392986957451358,
|
|
"grad_norm": 0.10239467024803162,
|
|
"learning_rate": 3.1582596520874096e-06,
|
|
"loss": 0.0623,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"epoch": 0.6414368184733804,
|
|
"grad_norm": 0.08615203946828842,
|
|
"learning_rate": 3.125437399912521e-06,
|
|
"loss": 0.0613,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.6414368184733804,
|
|
"eval_loss": 0.06311963498592377,
|
|
"eval_runtime": 471.8096,
|
|
"eval_samples_per_second": 4.905,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.643574941201625,
|
|
"grad_norm": 0.08130071312189102,
|
|
"learning_rate": 3.0927088705382092e-06,
|
|
"loss": 0.0637,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"epoch": 0.6457130639298696,
|
|
"grad_norm": 0.09418642520904541,
|
|
"learning_rate": 3.060075700297896e-06,
|
|
"loss": 0.061,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"epoch": 0.6478511866581141,
|
|
"grad_norm": 0.09334340691566467,
|
|
"learning_rate": 3.0275395207573178e-06,
|
|
"loss": 0.0598,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"epoch": 0.6499893093863588,
|
|
"grad_norm": 0.10348668694496155,
|
|
"learning_rate": 2.9951019586329467e-06,
|
|
"loss": 0.0613,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"epoch": 0.6521274321146033,
|
|
"grad_norm": 0.09085489809513092,
|
|
"learning_rate": 2.962764635710672e-06,
|
|
"loss": 0.0619,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"epoch": 0.654265554842848,
|
|
"grad_norm": 0.09327159821987152,
|
|
"learning_rate": 2.930529168764702e-06,
|
|
"loss": 0.0635,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"epoch": 0.6564036775710925,
|
|
"grad_norm": 0.0930228903889656,
|
|
"learning_rate": 2.89839716947674e-06,
|
|
"loss": 0.0643,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"epoch": 0.6585418002993372,
|
|
"grad_norm": 0.1704426109790802,
|
|
"learning_rate": 2.8663702443553967e-06,
|
|
"loss": 0.0633,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"epoch": 0.6606799230275818,
|
|
"grad_norm": 0.09673333913087845,
|
|
"learning_rate": 2.8344499946558714e-06,
|
|
"loss": 0.0606,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"epoch": 0.6628180457558264,
|
|
"grad_norm": 0.07926999032497406,
|
|
"learning_rate": 2.8026380162999055e-06,
|
|
"loss": 0.0614,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 0.664956168484071,
|
|
"grad_norm": 0.09460759162902832,
|
|
"learning_rate": 2.7709358997959724e-06,
|
|
"loss": 0.0622,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"epoch": 0.6670942912123156,
|
|
"grad_norm": 0.09549721330404282,
|
|
"learning_rate": 2.7393452301597645e-06,
|
|
"loss": 0.0618,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"epoch": 0.6692324139405602,
|
|
"grad_norm": 0.09777417033910751,
|
|
"learning_rate": 2.7078675868349546e-06,
|
|
"loss": 0.0602,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"epoch": 0.6713705366688048,
|
|
"grad_norm": 0.08188968896865845,
|
|
"learning_rate": 2.676504543614214e-06,
|
|
"loss": 0.0624,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"epoch": 0.6735086593970494,
|
|
"grad_norm": 0.10578031837940216,
|
|
"learning_rate": 2.6452576685605385e-06,
|
|
"loss": 0.0608,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"epoch": 0.6756467821252939,
|
|
"grad_norm": 0.09571292251348495,
|
|
"learning_rate": 2.614128523928848e-06,
|
|
"loss": 0.0613,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"epoch": 0.6777849048535386,
|
|
"grad_norm": 0.09241370856761932,
|
|
"learning_rate": 2.583118666087869e-06,
|
|
"loss": 0.0615,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"epoch": 0.6799230275817832,
|
|
"grad_norm": 0.09565988928079605,
|
|
"learning_rate": 2.552229645442337e-06,
|
|
"loss": 0.0605,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"epoch": 0.6820611503100278,
|
|
"grad_norm": 0.1339564025402069,
|
|
"learning_rate": 2.5214630063554597e-06,
|
|
"loss": 0.0614,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"epoch": 0.6841992730382724,
|
|
"grad_norm": 0.21678894758224487,
|
|
"learning_rate": 2.4908202870717267e-06,
|
|
"loss": 0.0631,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 0.686337395766517,
|
|
"grad_norm": 0.1025332510471344,
|
|
"learning_rate": 2.4603030196399796e-06,
|
|
"loss": 0.0612,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"epoch": 0.6884755184947616,
|
|
"grad_norm": 0.09821213781833649,
|
|
"learning_rate": 2.4299127298368314e-06,
|
|
"loss": 0.0606,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"epoch": 0.6906136412230062,
|
|
"grad_norm": 0.11024197936058044,
|
|
"learning_rate": 2.399650937090373e-06,
|
|
"loss": 0.0618,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"epoch": 0.6927517639512508,
|
|
"grad_norm": 0.09742552042007446,
|
|
"learning_rate": 2.369519154404205e-06,
|
|
"loss": 0.0602,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"epoch": 0.6948898866794954,
|
|
"grad_norm": 0.11659736931324005,
|
|
"learning_rate": 2.339518888281795e-06,
|
|
"loss": 0.0599,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"epoch": 0.69702800940774,
|
|
"grad_norm": 0.12253785133361816,
|
|
"learning_rate": 2.3096516386511585e-06,
|
|
"loss": 0.062,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"epoch": 0.6991661321359846,
|
|
"grad_norm": 0.10601403564214706,
|
|
"learning_rate": 2.279918898789865e-06,
|
|
"loss": 0.0603,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"epoch": 0.7013042548642292,
|
|
"grad_norm": 0.1589890867471695,
|
|
"learning_rate": 2.2503221552503777e-06,
|
|
"loss": 0.0617,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"epoch": 0.7034423775924739,
|
|
"grad_norm": 0.10667730122804642,
|
|
"learning_rate": 2.2208628877857276e-06,
|
|
"loss": 0.0595,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"epoch": 0.7055805003207184,
|
|
"grad_norm": 0.09270080178976059,
|
|
"learning_rate": 2.1915425692755325e-06,
|
|
"loss": 0.0613,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 0.707718623048963,
|
|
"grad_norm": 0.08238282054662704,
|
|
"learning_rate": 2.162362665652364e-06,
|
|
"loss": 0.0593,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"epoch": 0.7098567457772076,
|
|
"grad_norm": 0.08994623273611069,
|
|
"learning_rate": 2.1333246358284394e-06,
|
|
"loss": 0.0602,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"epoch": 0.7119948685054522,
|
|
"grad_norm": 0.08377353101968765,
|
|
"learning_rate": 2.1044299316226962e-06,
|
|
"loss": 0.0639,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"epoch": 0.7141329912336968,
|
|
"grad_norm": 0.11232832074165344,
|
|
"learning_rate": 2.0756799976881987e-06,
|
|
"loss": 0.0633,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"epoch": 0.7162711139619414,
|
|
"grad_norm": 0.10576164722442627,
|
|
"learning_rate": 2.047076271439903e-06,
|
|
"loss": 0.0621,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"epoch": 0.718409236690186,
|
|
"grad_norm": 0.10784109681844711,
|
|
"learning_rate": 2.018620182982803e-06,
|
|
"loss": 0.0633,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"epoch": 0.7205473594184306,
|
|
"grad_norm": 0.10578976571559906,
|
|
"learning_rate": 1.9903131550404185e-06,
|
|
"loss": 0.0619,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"epoch": 0.7226854821466753,
|
|
"grad_norm": 0.13373176753520966,
|
|
"learning_rate": 1.9621566028836717e-06,
|
|
"loss": 0.0589,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"epoch": 0.7248236048749198,
|
|
"grad_norm": 0.09690658748149872,
|
|
"learning_rate": 1.9341519342601166e-06,
|
|
"loss": 0.0606,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"epoch": 0.7269617276031645,
|
|
"grad_norm": 0.09626404196023941,
|
|
"learning_rate": 1.9063005493235692e-06,
|
|
"loss": 0.0597,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 0.729099850331409,
|
|
"grad_norm": 0.08039379119873047,
|
|
"learning_rate": 1.8786038405640954e-06,
|
|
"loss": 0.0619,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"epoch": 0.7312379730596537,
|
|
"grad_norm": 0.08523211628198624,
|
|
"learning_rate": 1.8510631927383887e-06,
|
|
"loss": 0.0601,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"epoch": 0.7333760957878982,
|
|
"grad_norm": 0.10265989601612091,
|
|
"learning_rate": 1.8236799828005402e-06,
|
|
"loss": 0.0602,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"epoch": 0.7355142185161428,
|
|
"grad_norm": 0.0833195149898529,
|
|
"learning_rate": 1.796455579833198e-06,
|
|
"loss": 0.0613,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"epoch": 0.7376523412443874,
|
|
"grad_norm": 0.08251874148845673,
|
|
"learning_rate": 1.7693913449791094e-06,
|
|
"loss": 0.061,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"epoch": 0.739790463972632,
|
|
"grad_norm": 0.0832567885518074,
|
|
"learning_rate": 1.7424886313730765e-06,
|
|
"loss": 0.0607,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"epoch": 0.7419285867008766,
|
|
"grad_norm": 0.09145036339759827,
|
|
"learning_rate": 1.7157487840742908e-06,
|
|
"loss": 0.0625,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"epoch": 0.7440667094291212,
|
|
"grad_norm": 0.10999724268913269,
|
|
"learning_rate": 1.6891731399990952e-06,
|
|
"loss": 0.0618,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"epoch": 0.7462048321573659,
|
|
"grad_norm": 0.08439410477876663,
|
|
"learning_rate": 1.6627630278541406e-06,
|
|
"loss": 0.062,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"epoch": 0.7483429548856104,
|
|
"grad_norm": 0.09440149366855621,
|
|
"learning_rate": 1.6365197680699468e-06,
|
|
"loss": 0.0635,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 0.7504810776138551,
|
|
"grad_norm": 0.07940148562192917,
|
|
"learning_rate": 1.6104446727348944e-06,
|
|
"loss": 0.0594,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"epoch": 0.7526192003420996,
|
|
"grad_norm": 0.174238920211792,
|
|
"learning_rate": 1.5845390455296195e-06,
|
|
"loss": 0.0602,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"epoch": 0.7547573230703443,
|
|
"grad_norm": 0.08501884341239929,
|
|
"learning_rate": 1.5588041816618288e-06,
|
|
"loss": 0.0636,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"epoch": 0.7568954457985888,
|
|
"grad_norm": 0.10274173319339752,
|
|
"learning_rate": 1.533241367801554e-06,
|
|
"loss": 0.0596,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"epoch": 0.7590335685268335,
|
|
"grad_norm": 0.11729396134614944,
|
|
"learning_rate": 1.5078518820168097e-06,
|
|
"loss": 0.0587,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"epoch": 0.761171691255078,
|
|
"grad_norm": 0.11430974304676056,
|
|
"learning_rate": 1.482636993709703e-06,
|
|
"loss": 0.0603,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"epoch": 0.7633098139833226,
|
|
"grad_norm": 0.08141325414180756,
|
|
"learning_rate": 1.4575979635529653e-06,
|
|
"loss": 0.061,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"epoch": 0.7654479367115672,
|
|
"grad_norm": 0.13267385959625244,
|
|
"learning_rate": 1.4327360434269138e-06,
|
|
"loss": 0.0621,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"epoch": 0.7675860594398118,
|
|
"grad_norm": 0.18512435257434845,
|
|
"learning_rate": 1.4080524763568754e-06,
|
|
"loss": 0.0599,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"epoch": 0.7697241821680565,
|
|
"grad_norm": 0.10558240115642548,
|
|
"learning_rate": 1.383548496451026e-06,
|
|
"loss": 0.0604,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 0.7697241821680565,
|
|
"eval_loss": 0.06212155520915985,
|
|
"eval_runtime": 472.0918,
|
|
"eval_samples_per_second": 4.902,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 0.771862304896301,
|
|
"grad_norm": 0.08125459402799606,
|
|
"learning_rate": 1.3592253288386937e-06,
|
|
"loss": 0.0569,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"epoch": 0.7740004276245457,
|
|
"grad_norm": 0.10237967222929001,
|
|
"learning_rate": 1.33508418960911e-06,
|
|
"loss": 0.0634,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"epoch": 0.7761385503527902,
|
|
"grad_norm": 0.13348835706710815,
|
|
"learning_rate": 1.3111262857506018e-06,
|
|
"loss": 0.0622,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"epoch": 0.7782766730810349,
|
|
"grad_norm": 0.0952489823102951,
|
|
"learning_rate": 1.287352815090251e-06,
|
|
"loss": 0.0624,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"epoch": 0.7804147958092794,
|
|
"grad_norm": 0.08191373199224472,
|
|
"learning_rate": 1.263764966234e-06,
|
|
"loss": 0.0608,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"epoch": 0.7825529185375241,
|
|
"grad_norm": 0.1063610091805458,
|
|
"learning_rate": 1.2403639185072298e-06,
|
|
"loss": 0.0606,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"epoch": 0.7846910412657686,
|
|
"grad_norm": 0.10587465018033981,
|
|
"learning_rate": 1.2171508418958005e-06,
|
|
"loss": 0.061,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"epoch": 0.7868291639940133,
|
|
"grad_norm": 0.08978404104709625,
|
|
"learning_rate": 1.194126896987543e-06,
|
|
"loss": 0.0604,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"epoch": 0.7889672867222579,
|
|
"grad_norm": 0.08431018143892288,
|
|
"learning_rate": 1.1712932349142481e-06,
|
|
"loss": 0.0587,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"epoch": 0.7911054094505025,
|
|
"grad_norm": 0.09140188992023468,
|
|
"learning_rate": 1.1486509972941029e-06,
|
|
"loss": 0.059,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"epoch": 0.7932435321787471,
|
|
"grad_norm": 0.21178825199604034,
|
|
"learning_rate": 1.1262013161746144e-06,
|
|
"loss": 0.0589,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"epoch": 0.7953816549069916,
|
|
"grad_norm": 0.09437291324138641,
|
|
"learning_rate": 1.1039453139760154e-06,
|
|
"loss": 0.059,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"epoch": 0.7975197776352363,
|
|
"grad_norm": 0.08276817947626114,
|
|
"learning_rate": 1.081884103435139e-06,
|
|
"loss": 0.062,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"epoch": 0.7996579003634808,
|
|
"grad_norm": 0.08296237885951996,
|
|
"learning_rate": 1.060018787549793e-06,
|
|
"loss": 0.0595,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"epoch": 0.8017960230917255,
|
|
"grad_norm": 0.09400928020477295,
|
|
"learning_rate": 1.03835045952361e-06,
|
|
"loss": 0.0586,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"epoch": 0.80393414581997,
|
|
"grad_norm": 0.09929320216178894,
|
|
"learning_rate": 1.016880202711384e-06,
|
|
"loss": 0.0603,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"epoch": 0.8060722685482147,
|
|
"grad_norm": 0.1045432910323143,
|
|
"learning_rate": 9.956090905649184e-07,
|
|
"loss": 0.0591,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"epoch": 0.8082103912764592,
|
|
"grad_norm": 0.08326222002506256,
|
|
"learning_rate": 9.74538186579345e-07,
|
|
"loss": 0.0596,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"epoch": 0.8103485140047039,
|
|
"grad_norm": 0.08769119530916214,
|
|
"learning_rate": 9.536685442399568e-07,
|
|
"loss": 0.0594,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"epoch": 0.8124866367329485,
|
|
"grad_norm": 0.09135784208774567,
|
|
"learning_rate": 9.330012069695387e-07,
|
|
"loss": 0.059,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"epoch": 0.8146247594611931,
|
|
"grad_norm": 0.09643464535474777,
|
|
"learning_rate": 9.125372080761985e-07,
|
|
"loss": 0.0584,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"epoch": 0.8167628821894377,
|
|
"grad_norm": 0.08774056285619736,
|
|
"learning_rate": 8.922775707016973e-07,
|
|
"loss": 0.0617,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"epoch": 0.8189010049176823,
|
|
"grad_norm": 0.1310407817363739,
|
|
"learning_rate": 8.722233077703096e-07,
|
|
"loss": 0.0618,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"epoch": 0.8210391276459269,
|
|
"grad_norm": 0.09406092017889023,
|
|
"learning_rate": 8.523754219381631e-07,
|
|
"loss": 0.0581,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"epoch": 0.8231772503741714,
|
|
"grad_norm": 0.111025370657444,
|
|
"learning_rate": 8.327349055431233e-07,
|
|
"loss": 0.061,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"epoch": 0.8253153731024161,
|
|
"grad_norm": 0.08444110304117203,
|
|
"learning_rate": 8.13302740555173e-07,
|
|
"loss": 0.0613,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"epoch": 0.8274534958306606,
|
|
"grad_norm": 0.16973550617694855,
|
|
"learning_rate": 7.940798985273124e-07,
|
|
"loss": 0.0622,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"epoch": 0.8295916185589053,
|
|
"grad_norm": 0.14076325297355652,
|
|
"learning_rate": 7.750673405469949e-07,
|
|
"loss": 0.0622,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"epoch": 0.8317297412871499,
|
|
"grad_norm": 0.12866050004959106,
|
|
"learning_rate": 7.562660171880632e-07,
|
|
"loss": 0.0623,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"epoch": 0.8338678640153945,
|
|
"grad_norm": 0.08956257998943329,
|
|
"learning_rate": 7.376768684632357e-07,
|
|
"loss": 0.0589,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"epoch": 0.8360059867436391,
|
|
"grad_norm": 0.09152022004127502,
|
|
"learning_rate": 7.193008237770971e-07,
|
|
"loss": 0.0615,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"epoch": 0.8381441094718837,
|
|
"grad_norm": 0.0782172754406929,
|
|
"learning_rate": 7.011388018796389e-07,
|
|
"loss": 0.0611,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"epoch": 0.8402822322001283,
|
|
"grad_norm": 0.0877286046743393,
|
|
"learning_rate": 6.831917108203217e-07,
|
|
"loss": 0.0597,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"epoch": 0.8424203549283729,
|
|
"grad_norm": 0.08773230016231537,
|
|
"learning_rate": 6.654604479026728e-07,
|
|
"loss": 0.0601,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"epoch": 0.8445584776566175,
|
|
"grad_norm": 0.08106860518455505,
|
|
"learning_rate": 6.479458996394294e-07,
|
|
"loss": 0.0633,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"epoch": 0.846696600384862,
|
|
"grad_norm": 0.09297432750463486,
|
|
"learning_rate": 6.306489417082096e-07,
|
|
"loss": 0.0621,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"epoch": 0.8488347231131067,
|
|
"grad_norm": 0.09097818285226822,
|
|
"learning_rate": 6.135704389077335e-07,
|
|
"loss": 0.0609,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"epoch": 0.8509728458413512,
|
|
"grad_norm": 0.10589548945426941,
|
|
"learning_rate": 5.967112451145868e-07,
|
|
"loss": 0.0605,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"epoch": 0.8531109685695959,
|
|
"grad_norm": 0.08168578892946243,
|
|
"learning_rate": 5.800722032405304e-07,
|
|
"loss": 0.0591,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"epoch": 0.8552490912978405,
|
|
"grad_norm": 0.09122731536626816,
|
|
"learning_rate": 5.636541451903494e-07,
|
|
"loss": 0.0586,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 0.8573872140260851,
|
|
"grad_norm": 0.10044734925031662,
|
|
"learning_rate": 5.474578918202717e-07,
|
|
"loss": 0.0608,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"epoch": 0.8595253367543297,
|
|
"grad_norm": 0.07848547399044037,
|
|
"learning_rate": 5.314842528969177e-07,
|
|
"loss": 0.0609,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"epoch": 0.8616634594825743,
|
|
"grad_norm": 0.09690708667039871,
|
|
"learning_rate": 5.157340270568212e-07,
|
|
"loss": 0.0629,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"epoch": 0.8638015822108189,
|
|
"grad_norm": 0.09269597381353378,
|
|
"learning_rate": 5.002080017664973e-07,
|
|
"loss": 0.0587,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"epoch": 0.8659397049390635,
|
|
"grad_norm": 0.08430242538452148,
|
|
"learning_rate": 4.849069532830669e-07,
|
|
"loss": 0.0616,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"epoch": 0.8680778276673081,
|
|
"grad_norm": 0.09234491735696793,
|
|
"learning_rate": 4.698316466154551e-07,
|
|
"loss": 0.0613,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"epoch": 0.8702159503955527,
|
|
"grad_norm": 0.11234113574028015,
|
|
"learning_rate": 4.549828354861341e-07,
|
|
"loss": 0.06,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"epoch": 0.8723540731237973,
|
|
"grad_norm": 0.08569945394992828,
|
|
"learning_rate": 4.4036126229344613e-07,
|
|
"loss": 0.0592,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"epoch": 0.8744921958520419,
|
|
"grad_norm": 0.085124172270298,
|
|
"learning_rate": 4.2596765807448037e-07,
|
|
"loss": 0.0599,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"epoch": 0.8766303185802865,
|
|
"grad_norm": 0.08620309084653854,
|
|
"learning_rate": 4.1180274246852724e-07,
|
|
"loss": 0.0644,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"epoch": 0.8787684413085312,
|
|
"grad_norm": 0.08025231957435608,
|
|
"learning_rate": 3.97867223681096e-07,
|
|
"loss": 0.0607,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"epoch": 0.8809065640367757,
|
|
"grad_norm": 0.08026058226823807,
|
|
"learning_rate": 3.841617984485069e-07,
|
|
"loss": 0.0585,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"epoch": 0.8830446867650203,
|
|
"grad_norm": 0.07990364730358124,
|
|
"learning_rate": 3.706871520030553e-07,
|
|
"loss": 0.0622,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"epoch": 0.8851828094932649,
|
|
"grad_norm": 0.11747777462005615,
|
|
"learning_rate": 3.574439580387562e-07,
|
|
"loss": 0.0624,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"epoch": 0.8873209322215095,
|
|
"grad_norm": 0.08300217986106873,
|
|
"learning_rate": 3.444328786776557e-07,
|
|
"loss": 0.0617,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"epoch": 0.8894590549497541,
|
|
"grad_norm": 0.11143102496862411,
|
|
"learning_rate": 3.3165456443673307e-07,
|
|
"loss": 0.061,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"epoch": 0.8915971776779987,
|
|
"grad_norm": 0.0875178873538971,
|
|
"learning_rate": 3.1910965419537087e-07,
|
|
"loss": 0.062,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"epoch": 0.8937353004062433,
|
|
"grad_norm": 0.07958797365427017,
|
|
"learning_rate": 3.0679877516341386e-07,
|
|
"loss": 0.0607,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"epoch": 0.8958734231344879,
|
|
"grad_norm": 0.07875709980726242,
|
|
"learning_rate": 2.947225428498152e-07,
|
|
"loss": 0.0602,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"epoch": 0.8980115458627326,
|
|
"grad_norm": 0.08480419218540192,
|
|
"learning_rate": 2.828815610318569e-07,
|
|
"loss": 0.0625,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"epoch": 0.8980115458627326,
|
|
"eval_loss": 0.061606768518686295,
|
|
"eval_runtime": 472.6015,
|
|
"eval_samples_per_second": 4.896,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"epoch": 0.9001496685909771,
|
|
"grad_norm": 0.08824723958969116,
|
|
"learning_rate": 2.7127642172496583e-07,
|
|
"loss": 0.0595,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"epoch": 0.9022877913192218,
|
|
"grad_norm": 0.10775342583656311,
|
|
"learning_rate": 2.59907705153114e-07,
|
|
"loss": 0.0587,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"epoch": 0.9044259140474663,
|
|
"grad_norm": 0.08595039695501328,
|
|
"learning_rate": 2.487759797198075e-07,
|
|
"loss": 0.0603,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"epoch": 0.906564036775711,
|
|
"grad_norm": 0.09018037468194962,
|
|
"learning_rate": 2.3788180197967193e-07,
|
|
"loss": 0.061,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"epoch": 0.9087021595039555,
|
|
"grad_norm": 0.10949289798736572,
|
|
"learning_rate": 2.272257166106201e-07,
|
|
"loss": 0.059,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"epoch": 0.9108402822322001,
|
|
"grad_norm": 0.0874534547328949,
|
|
"learning_rate": 2.1680825638662527e-07,
|
|
"loss": 0.0607,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"epoch": 0.9129784049604447,
|
|
"grad_norm": 0.09561596810817719,
|
|
"learning_rate": 2.06629942151082e-07,
|
|
"loss": 0.0605,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"epoch": 0.9151165276886893,
|
|
"grad_norm": 0.09017440676689148,
|
|
"learning_rate": 1.9669128279076522e-07,
|
|
"loss": 0.0603,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"epoch": 0.9172546504169339,
|
|
"grad_norm": 0.08165230602025986,
|
|
"learning_rate": 1.8699277521038672e-07,
|
|
"loss": 0.0607,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"epoch": 0.9193927731451785,
|
|
"grad_norm": 0.09331604838371277,
|
|
"learning_rate": 1.7753490430775288e-07,
|
|
"loss": 0.0597,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"epoch": 0.9215308958734232,
|
|
"grad_norm": 0.08846830576658249,
|
|
"learning_rate": 1.6831814294951843e-07,
|
|
"loss": 0.0612,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"epoch": 0.9236690186016677,
|
|
"grad_norm": 0.1046655997633934,
|
|
"learning_rate": 1.5934295194754924e-07,
|
|
"loss": 0.0593,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"epoch": 0.9258071413299124,
|
|
"grad_norm": 0.0815853402018547,
|
|
"learning_rate": 1.5060978003587745e-07,
|
|
"loss": 0.0614,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"epoch": 0.9279452640581569,
|
|
"grad_norm": 0.08241681009531021,
|
|
"learning_rate": 1.4211906384827223e-07,
|
|
"loss": 0.0614,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"epoch": 0.9300833867864016,
|
|
"grad_norm": 0.07734426110982895,
|
|
"learning_rate": 1.3387122789640163e-07,
|
|
"loss": 0.0597,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"epoch": 0.9322215095146461,
|
|
"grad_norm": 0.09207015484571457,
|
|
"learning_rate": 1.2586668454861505e-07,
|
|
"loss": 0.0612,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"epoch": 0.9343596322428908,
|
|
"grad_norm": 0.09312504529953003,
|
|
"learning_rate": 1.181058340093233e-07,
|
|
"loss": 0.0602,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"epoch": 0.9364977549711353,
|
|
"grad_norm": 0.08721912652254105,
|
|
"learning_rate": 1.1058906429898764e-07,
|
|
"loss": 0.0627,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"epoch": 0.93863587769938,
|
|
"grad_norm": 0.08252057433128357,
|
|
"learning_rate": 1.033167512347244e-07,
|
|
"loss": 0.0591,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"epoch": 0.9407740004276245,
|
|
"grad_norm": 0.07943445444107056,
|
|
"learning_rate": 9.62892584115116e-08,
|
|
"loss": 0.0586,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"epoch": 0.9429121231558691,
|
|
"grad_norm": 0.1239377111196518,
|
|
"learning_rate": 8.950693718401016e-08,
|
|
"loss": 0.0596,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"epoch": 0.9450502458841138,
|
|
"grad_norm": 0.08284857869148254,
|
|
"learning_rate": 8.297012664900017e-08,
|
|
"loss": 0.0581,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"epoch": 0.9471883686123583,
|
|
"grad_norm": 0.11521401256322861,
|
|
"learning_rate": 7.667915362842337e-08,
|
|
"loss": 0.0619,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"epoch": 0.949326491340603,
|
|
"grad_norm": 0.0899544283747673,
|
|
"learning_rate": 7.063433265304509e-08,
|
|
"loss": 0.0645,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"epoch": 0.9514646140688475,
|
|
"grad_norm": 0.07524080574512482,
|
|
"learning_rate": 6.483596594672959e-08,
|
|
"loss": 0.0597,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"epoch": 0.9536027367970922,
|
|
"grad_norm": 0.10981776565313339,
|
|
"learning_rate": 5.928434341132605e-08,
|
|
"loss": 0.0582,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"epoch": 0.9557408595253367,
|
|
"grad_norm": 0.09796682000160217,
|
|
"learning_rate": 5.397974261217909e-08,
|
|
"loss": 0.0568,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"epoch": 0.9578789822535814,
|
|
"grad_norm": 0.0810341015458107,
|
|
"learning_rate": 4.892242876424702e-08,
|
|
"loss": 0.0612,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"epoch": 0.9600171049818259,
|
|
"grad_norm": 0.08066795766353607,
|
|
"learning_rate": 4.411265471884363e-08,
|
|
"loss": 0.0629,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"epoch": 0.9621552277100706,
|
|
"grad_norm": 0.08418738096952438,
|
|
"learning_rate": 3.955066095099769e-08,
|
|
"loss": 0.0584,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 0.9642933504383152,
|
|
"grad_norm": 0.07691395282745361,
|
|
"learning_rate": 3.523667554742704e-08,
|
|
"loss": 0.06,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"epoch": 0.9664314731665598,
|
|
"grad_norm": 0.08319604396820068,
|
|
"learning_rate": 3.117091419513829e-08,
|
|
"loss": 0.0597,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"epoch": 0.9685695958948044,
|
|
"grad_norm": 0.09783016890287399,
|
|
"learning_rate": 2.7353580170638714e-08,
|
|
"loss": 0.0605,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"epoch": 0.9707077186230489,
|
|
"grad_norm": 0.10302453488111496,
|
|
"learning_rate": 2.3784864329777224e-08,
|
|
"loss": 0.0594,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"epoch": 0.9728458413512936,
|
|
"grad_norm": 0.0973035991191864,
|
|
"learning_rate": 2.0464945098200296e-08,
|
|
"loss": 0.0634,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"epoch": 0.9749839640795381,
|
|
"grad_norm": 0.07076684385538101,
|
|
"learning_rate": 1.739398846242968e-08,
|
|
"loss": 0.0618,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"epoch": 0.9771220868077828,
|
|
"grad_norm": 0.1335037350654602,
|
|
"learning_rate": 1.4572147961567917e-08,
|
|
"loss": 0.062,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"epoch": 0.9792602095360273,
|
|
"grad_norm": 0.08357635885477066,
|
|
"learning_rate": 1.1999564679616715e-08,
|
|
"loss": 0.0618,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"epoch": 0.981398332264272,
|
|
"grad_norm": 0.08964463323354721,
|
|
"learning_rate": 9.67636723842591e-09,
|
|
"loss": 0.0604,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"epoch": 0.9835364549925165,
|
|
"grad_norm": 0.08757588267326355,
|
|
"learning_rate": 7.602671791263616e-09,
|
|
"loss": 0.0619,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"epoch": 0.9856745777207612,
|
|
"grad_norm": 0.08129626512527466,
|
|
"learning_rate": 5.778582017005874e-09,
|
|
"loss": 0.0591,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"epoch": 0.9878127004490058,
|
|
"grad_norm": 0.08929581940174103,
|
|
"learning_rate": 4.204189114955793e-09,
|
|
"loss": 0.0581,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"epoch": 0.9899508231772504,
|
|
"grad_norm": 0.09756463766098022,
|
|
"learning_rate": 2.8795718002821993e-09,
|
|
"loss": 0.0603,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"epoch": 0.992088945905495,
|
|
"grad_norm": 0.09502363204956055,
|
|
"learning_rate": 1.80479630008501e-09,
|
|
"loss": 0.062,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"epoch": 0.9942270686337396,
|
|
"grad_norm": 0.0853857547044754,
|
|
"learning_rate": 9.799163500834319e-10,
|
|
"loss": 0.0616,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"epoch": 0.9963651913619842,
|
|
"grad_norm": 0.0833888053894043,
|
|
"learning_rate": 4.049731919303357e-10,
|
|
"loss": 0.0587,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"epoch": 0.9985033140902287,
|
|
"grad_norm": 0.089773990213871,
|
|
"learning_rate": 7.999557114835022e-11,
|
|
"loss": 0.0608,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"step": 4677,
|
|
"total_flos": 9.549834298038052e+19,
|
|
"train_loss": 0.10822304976860733,
|
|
"train_runtime": 69525.0456,
|
|
"train_samples_per_second": 1.076,
|
|
"train_steps_per_second": 0.067
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 4677,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 1000,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 9.549834298038052e+19,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|