Files
deepseek-prover-v2-cpt-sft-1e/trainer_state.json
ModelHub XC aa13e379ad 初始化项目,由ModelHub XC社区提供模型
Model: formalmathatepfl/deepseek-prover-v2-cpt-sft-1e
Source: Original Platform
2026-06-11 02:52:18 +08:00

3369 lines
82 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 600,
"global_step": 4677,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0021381227282446015,
"grad_norm": 13.399383544921875,
"learning_rate": 3.846153846153847e-07,
"loss": 3.2323,
"step": 10
},
{
"epoch": 0.004276245456489203,
"grad_norm": 12.956534385681152,
"learning_rate": 8.11965811965812e-07,
"loss": 3.2089,
"step": 20
},
{
"epoch": 0.006414368184733804,
"grad_norm": 6.201784610748291,
"learning_rate": 1.2393162393162394e-06,
"loss": 2.8859,
"step": 30
},
{
"epoch": 0.008552490912978406,
"grad_norm": 5.795752048492432,
"learning_rate": 1.6666666666666667e-06,
"loss": 2.469,
"step": 40
},
{
"epoch": 0.010690613641223007,
"grad_norm": 4.1530351638793945,
"learning_rate": 2.094017094017094e-06,
"loss": 2.0068,
"step": 50
},
{
"epoch": 0.012828736369467608,
"grad_norm": 4.0573530197143555,
"learning_rate": 2.5213675213675216e-06,
"loss": 1.5318,
"step": 60
},
{
"epoch": 0.014966859097712209,
"grad_norm": 9.95679759979248,
"learning_rate": 2.948717948717949e-06,
"loss": 1.073,
"step": 70
},
{
"epoch": 0.01710498182595681,
"grad_norm": 2.138951539993286,
"learning_rate": 3.3760683760683765e-06,
"loss": 0.7936,
"step": 80
},
{
"epoch": 0.01924310455420141,
"grad_norm": 1.6260210275650024,
"learning_rate": 3.8034188034188036e-06,
"loss": 0.6121,
"step": 90
},
{
"epoch": 0.021381227282446014,
"grad_norm": 3.553804636001587,
"learning_rate": 4.230769230769231e-06,
"loss": 0.4587,
"step": 100
},
{
"epoch": 0.023519350010690613,
"grad_norm": 4.238368988037109,
"learning_rate": 4.658119658119659e-06,
"loss": 0.3611,
"step": 110
},
{
"epoch": 0.025657472738935216,
"grad_norm": 3.7698726654052734,
"learning_rate": 5.085470085470086e-06,
"loss": 0.3285,
"step": 120
},
{
"epoch": 0.027795595467179815,
"grad_norm": 6.797550201416016,
"learning_rate": 5.512820512820514e-06,
"loss": 0.2798,
"step": 130
},
{
"epoch": 0.029933718195424418,
"grad_norm": 4.188291549682617,
"learning_rate": 5.940170940170941e-06,
"loss": 0.2401,
"step": 140
},
{
"epoch": 0.03207184092366902,
"grad_norm": 0.8605281114578247,
"learning_rate": 6.367521367521368e-06,
"loss": 0.2354,
"step": 150
},
{
"epoch": 0.03420996365191362,
"grad_norm": 1.5260062217712402,
"learning_rate": 6.794871794871796e-06,
"loss": 0.1918,
"step": 160
},
{
"epoch": 0.03634808638015822,
"grad_norm": 1.3726520538330078,
"learning_rate": 7.222222222222223e-06,
"loss": 0.1818,
"step": 170
},
{
"epoch": 0.03848620910840282,
"grad_norm": 1.8705629110336304,
"learning_rate": 7.649572649572649e-06,
"loss": 0.153,
"step": 180
},
{
"epoch": 0.040624331836647425,
"grad_norm": 2.4513299465179443,
"learning_rate": 8.076923076923077e-06,
"loss": 0.154,
"step": 190
},
{
"epoch": 0.04276245456489203,
"grad_norm": 0.2777731418609619,
"learning_rate": 8.504273504273505e-06,
"loss": 0.1415,
"step": 200
},
{
"epoch": 0.04490057729313662,
"grad_norm": 0.23002663254737854,
"learning_rate": 8.931623931623933e-06,
"loss": 0.1326,
"step": 210
},
{
"epoch": 0.047038700021381226,
"grad_norm": 1.1118742227554321,
"learning_rate": 9.358974358974359e-06,
"loss": 0.126,
"step": 220
},
{
"epoch": 0.04917682274962583,
"grad_norm": 1.368764042854309,
"learning_rate": 9.786324786324787e-06,
"loss": 0.1286,
"step": 230
},
{
"epoch": 0.05131494547787043,
"grad_norm": 0.519333004951477,
"learning_rate": 9.999968751679245e-06,
"loss": 0.1195,
"step": 240
},
{
"epoch": 0.053453068206115034,
"grad_norm": 0.5977477431297302,
"learning_rate": 9.999718767456692e-06,
"loss": 0.1206,
"step": 250
},
{
"epoch": 0.05559119093435963,
"grad_norm": 0.4855315089225769,
"learning_rate": 9.999218811510088e-06,
"loss": 0.1103,
"step": 260
},
{
"epoch": 0.05772931366260423,
"grad_norm": 0.3158447742462158,
"learning_rate": 9.998468908835808e-06,
"loss": 0.1089,
"step": 270
},
{
"epoch": 0.059867436390848836,
"grad_norm": 0.25278440117836,
"learning_rate": 9.997469096926852e-06,
"loss": 0.1089,
"step": 280
},
{
"epoch": 0.06200555911909344,
"grad_norm": 0.25526583194732666,
"learning_rate": 9.996219425770975e-06,
"loss": 0.1024,
"step": 290
},
{
"epoch": 0.06414368184733804,
"grad_norm": 0.7181093692779541,
"learning_rate": 9.994719957848182e-06,
"loss": 0.1004,
"step": 300
},
{
"epoch": 0.06628180457558264,
"grad_norm": 0.5958463549613953,
"learning_rate": 9.992970768127605e-06,
"loss": 0.0957,
"step": 310
},
{
"epoch": 0.06841992730382725,
"grad_norm": 0.8600453734397888,
"learning_rate": 9.990971944063758e-06,
"loss": 0.1029,
"step": 320
},
{
"epoch": 0.07055805003207184,
"grad_norm": 0.267674058675766,
"learning_rate": 9.98872358559216e-06,
"loss": 0.0985,
"step": 330
},
{
"epoch": 0.07269617276031644,
"grad_norm": 2.607964038848877,
"learning_rate": 9.986225805124345e-06,
"loss": 0.0921,
"step": 340
},
{
"epoch": 0.07483429548856105,
"grad_norm": 0.9426595568656921,
"learning_rate": 9.983478727542233e-06,
"loss": 0.089,
"step": 350
},
{
"epoch": 0.07697241821680564,
"grad_norm": 0.5454517602920532,
"learning_rate": 9.980482490191895e-06,
"loss": 0.0898,
"step": 360
},
{
"epoch": 0.07911054094505024,
"grad_norm": 1.0879571437835693,
"learning_rate": 9.977237242876677e-06,
"loss": 0.0932,
"step": 370
},
{
"epoch": 0.08124866367329485,
"grad_norm": 0.4590705335140228,
"learning_rate": 9.973743147849721e-06,
"loss": 0.0884,
"step": 380
},
{
"epoch": 0.08338678640153944,
"grad_norm": 0.5434951186180115,
"learning_rate": 9.970000379805843e-06,
"loss": 0.0936,
"step": 390
},
{
"epoch": 0.08552490912978405,
"grad_norm": 5.244622707366943,
"learning_rate": 9.966009125872806e-06,
"loss": 0.0892,
"step": 400
},
{
"epoch": 0.08766303185802865,
"grad_norm": 0.16936945915222168,
"learning_rate": 9.96176958560196e-06,
"loss": 0.0872,
"step": 410
},
{
"epoch": 0.08980115458627325,
"grad_norm": 0.4548771381378174,
"learning_rate": 9.957281970958264e-06,
"loss": 0.0879,
"step": 420
},
{
"epoch": 0.09193927731451786,
"grad_norm": 1.1364250183105469,
"learning_rate": 9.952546506309691e-06,
"loss": 0.0869,
"step": 430
},
{
"epoch": 0.09407740004276245,
"grad_norm": 0.7596004605293274,
"learning_rate": 9.94756342841601e-06,
"loss": 0.0873,
"step": 440
},
{
"epoch": 0.09621552277100706,
"grad_norm": 0.2505151331424713,
"learning_rate": 9.94233298641695e-06,
"loss": 0.088,
"step": 450
},
{
"epoch": 0.09835364549925166,
"grad_norm": 0.16528713703155518,
"learning_rate": 9.936855441819744e-06,
"loss": 0.0834,
"step": 460
},
{
"epoch": 0.10049176822749625,
"grad_norm": 0.43566420674324036,
"learning_rate": 9.931131068486045e-06,
"loss": 0.0808,
"step": 470
},
{
"epoch": 0.10262989095574086,
"grad_norm": 0.24824728071689606,
"learning_rate": 9.925160152618246e-06,
"loss": 0.0871,
"step": 480
},
{
"epoch": 0.10476801368398546,
"grad_norm": 0.47349920868873596,
"learning_rate": 9.918942992745161e-06,
"loss": 0.0827,
"step": 490
},
{
"epoch": 0.10690613641223007,
"grad_norm": 0.30691832304000854,
"learning_rate": 9.912479899707117e-06,
"loss": 0.0834,
"step": 500
},
{
"epoch": 0.10904425914047466,
"grad_norm": 0.1434836983680725,
"learning_rate": 9.905771196640384e-06,
"loss": 0.0811,
"step": 510
},
{
"epoch": 0.11118238186871926,
"grad_norm": 0.38805854320526123,
"learning_rate": 9.898817218961043e-06,
"loss": 0.0805,
"step": 520
},
{
"epoch": 0.11332050459696387,
"grad_norm": 0.9806022644042969,
"learning_rate": 9.89161831434821e-06,
"loss": 0.0832,
"step": 530
},
{
"epoch": 0.11545862732520847,
"grad_norm": 0.4095519185066223,
"learning_rate": 9.88417484272665e-06,
"loss": 0.0821,
"step": 540
},
{
"epoch": 0.11759675005345306,
"grad_norm": 0.14556068181991577,
"learning_rate": 9.87648717624878e-06,
"loss": 0.0813,
"step": 550
},
{
"epoch": 0.11973487278169767,
"grad_norm": 0.1319616138935089,
"learning_rate": 9.868555699276065e-06,
"loss": 0.0783,
"step": 560
},
{
"epoch": 0.12187299550994227,
"grad_norm": 0.34334319829940796,
"learning_rate": 9.860380808359808e-06,
"loss": 0.0812,
"step": 570
},
{
"epoch": 0.12401111823818688,
"grad_norm": 0.28321507573127747,
"learning_rate": 9.851962912221315e-06,
"loss": 0.0833,
"step": 580
},
{
"epoch": 0.1261492409664315,
"grad_norm": 0.28973206877708435,
"learning_rate": 9.843302431731456e-06,
"loss": 0.0781,
"step": 590
},
{
"epoch": 0.12828736369467608,
"grad_norm": 1.1749194860458374,
"learning_rate": 9.834399799889637e-06,
"loss": 0.0843,
"step": 600
},
{
"epoch": 0.12828736369467608,
"eval_loss": 0.08129256218671799,
"eval_runtime": 471.6671,
"eval_samples_per_second": 4.906,
"eval_steps_per_second": 0.307,
"step": 600
},
{
"epoch": 0.13042548642292068,
"grad_norm": 0.261525422334671,
"learning_rate": 9.825255461802137e-06,
"loss": 0.0819,
"step": 610
},
{
"epoch": 0.13256360915116527,
"grad_norm": 0.3438051640987396,
"learning_rate": 9.815869874659866e-06,
"loss": 0.0823,
"step": 620
},
{
"epoch": 0.13470173187940987,
"grad_norm": 0.34199830889701843,
"learning_rate": 9.806243507715494e-06,
"loss": 0.0808,
"step": 630
},
{
"epoch": 0.1368398546076545,
"grad_norm": 0.22008945047855377,
"learning_rate": 9.796376842260004e-06,
"loss": 0.0774,
"step": 640
},
{
"epoch": 0.1389779773358991,
"grad_norm": 0.1826457530260086,
"learning_rate": 9.786270371598613e-06,
"loss": 0.0748,
"step": 650
},
{
"epoch": 0.14111610006414368,
"grad_norm": 0.15165981650352478,
"learning_rate": 9.775924601026127e-06,
"loss": 0.0782,
"step": 660
},
{
"epoch": 0.14325422279238828,
"grad_norm": 0.3881567120552063,
"learning_rate": 9.765340047801656e-06,
"loss": 0.0764,
"step": 670
},
{
"epoch": 0.14539234552063288,
"grad_norm": 0.5628389120101929,
"learning_rate": 9.754517241122771e-06,
"loss": 0.0774,
"step": 680
},
{
"epoch": 0.14753046824887747,
"grad_norm": 0.18720681965351105,
"learning_rate": 9.743456722099039e-06,
"loss": 0.0779,
"step": 690
},
{
"epoch": 0.1496685909771221,
"grad_norm": 0.21351012587547302,
"learning_rate": 9.732159043724963e-06,
"loss": 0.0782,
"step": 700
},
{
"epoch": 0.1518067137053667,
"grad_norm": 0.31227338314056396,
"learning_rate": 9.720624770852341e-06,
"loss": 0.077,
"step": 710
},
{
"epoch": 0.1539448364336113,
"grad_norm": 0.18778669834136963,
"learning_rate": 9.70885448016203e-06,
"loss": 0.0751,
"step": 720
},
{
"epoch": 0.15608295916185588,
"grad_norm": 0.2396603375673294,
"learning_rate": 9.696848760135093e-06,
"loss": 0.0776,
"step": 730
},
{
"epoch": 0.15822108189010048,
"grad_norm": 0.24791832268238068,
"learning_rate": 9.684608211023406e-06,
"loss": 0.0729,
"step": 740
},
{
"epoch": 0.1603592046183451,
"grad_norm": 0.5819320678710938,
"learning_rate": 9.672133444819619e-06,
"loss": 0.0738,
"step": 750
},
{
"epoch": 0.1624973273465897,
"grad_norm": 0.2177390158176422,
"learning_rate": 9.659425085226581e-06,
"loss": 0.0789,
"step": 760
},
{
"epoch": 0.1646354500748343,
"grad_norm": 0.14574196934700012,
"learning_rate": 9.646483767626138e-06,
"loss": 0.0755,
"step": 770
},
{
"epoch": 0.1667735728030789,
"grad_norm": 0.16599872708320618,
"learning_rate": 9.63331013904738e-06,
"loss": 0.0758,
"step": 780
},
{
"epoch": 0.16891169553132349,
"grad_norm": 0.21911288797855377,
"learning_rate": 9.619904858134281e-06,
"loss": 0.0763,
"step": 790
},
{
"epoch": 0.1710498182595681,
"grad_norm": 0.12567879259586334,
"learning_rate": 9.606268595112776e-06,
"loss": 0.0752,
"step": 800
},
{
"epoch": 0.1731879409878127,
"grad_norm": 0.13315202295780182,
"learning_rate": 9.59240203175725e-06,
"loss": 0.0727,
"step": 810
},
{
"epoch": 0.1753260637160573,
"grad_norm": 0.4594784379005432,
"learning_rate": 9.57830586135644e-06,
"loss": 0.076,
"step": 820
},
{
"epoch": 0.1774641864443019,
"grad_norm": 0.15148596465587616,
"learning_rate": 9.5639807886788e-06,
"loss": 0.0766,
"step": 830
},
{
"epoch": 0.1796023091725465,
"grad_norm": 0.5493207573890686,
"learning_rate": 9.549427529937233e-06,
"loss": 0.0769,
"step": 840
},
{
"epoch": 0.18174043190079112,
"grad_norm": 0.1707722693681717,
"learning_rate": 9.534646812753301e-06,
"loss": 0.0733,
"step": 850
},
{
"epoch": 0.1838785546290357,
"grad_norm": 0.15772108733654022,
"learning_rate": 9.519639376120841e-06,
"loss": 0.0767,
"step": 860
},
{
"epoch": 0.1860166773572803,
"grad_norm": 0.36901381611824036,
"learning_rate": 9.504405970369017e-06,
"loss": 0.0767,
"step": 870
},
{
"epoch": 0.1881548000855249,
"grad_norm": 0.21110327541828156,
"learning_rate": 9.488947357124812e-06,
"loss": 0.0749,
"step": 880
},
{
"epoch": 0.1902929228137695,
"grad_norm": 0.1756543517112732,
"learning_rate": 9.473264309274934e-06,
"loss": 0.0747,
"step": 890
},
{
"epoch": 0.19243104554201412,
"grad_norm": 0.19304226338863373,
"learning_rate": 9.45735761092719e-06,
"loss": 0.0722,
"step": 900
},
{
"epoch": 0.19456916827025872,
"grad_norm": 0.14899924397468567,
"learning_rate": 9.441228057371275e-06,
"loss": 0.0722,
"step": 910
},
{
"epoch": 0.19670729099850331,
"grad_norm": 0.24590201675891876,
"learning_rate": 9.42487645503901e-06,
"loss": 0.0753,
"step": 920
},
{
"epoch": 0.1988454137267479,
"grad_norm": 0.2768633961677551,
"learning_rate": 9.408303621464024e-06,
"loss": 0.0738,
"step": 930
},
{
"epoch": 0.2009835364549925,
"grad_norm": 0.44510579109191895,
"learning_rate": 9.391510385240876e-06,
"loss": 0.0725,
"step": 940
},
{
"epoch": 0.20312165918323713,
"grad_norm": 0.15240268409252167,
"learning_rate": 9.374497585983635e-06,
"loss": 0.0748,
"step": 950
},
{
"epoch": 0.20525978191148173,
"grad_norm": 0.307160884141922,
"learning_rate": 9.3572660742839e-06,
"loss": 0.0734,
"step": 960
},
{
"epoch": 0.20739790463972632,
"grad_norm": 0.19927459955215454,
"learning_rate": 9.339816711668262e-06,
"loss": 0.0723,
"step": 970
},
{
"epoch": 0.20953602736797092,
"grad_norm": 0.12397543340921402,
"learning_rate": 9.322150370555242e-06,
"loss": 0.0728,
"step": 980
},
{
"epoch": 0.2116741500962155,
"grad_norm": 0.20738820731639862,
"learning_rate": 9.304267934211672e-06,
"loss": 0.0749,
"step": 990
},
{
"epoch": 0.21381227282446014,
"grad_norm": 0.14943927526474,
"learning_rate": 9.28617029670853e-06,
"loss": 0.0717,
"step": 1000
},
{
"epoch": 0.21595039555270473,
"grad_norm": 0.16114865243434906,
"learning_rate": 9.267858362876238e-06,
"loss": 0.0714,
"step": 1010
},
{
"epoch": 0.21808851828094933,
"grad_norm": 0.14569199085235596,
"learning_rate": 9.249333048259426e-06,
"loss": 0.0751,
"step": 1020
},
{
"epoch": 0.22022664100919392,
"grad_norm": 0.46952638030052185,
"learning_rate": 9.230595279071156e-06,
"loss": 0.0712,
"step": 1030
},
{
"epoch": 0.22236476373743852,
"grad_norm": 0.1461559236049652,
"learning_rate": 9.211645992146618e-06,
"loss": 0.0716,
"step": 1040
},
{
"epoch": 0.22450288646568314,
"grad_norm": 0.13023297488689423,
"learning_rate": 9.192486134896282e-06,
"loss": 0.0696,
"step": 1050
},
{
"epoch": 0.22664100919392774,
"grad_norm": 0.3277701437473297,
"learning_rate": 9.17311666525854e-06,
"loss": 0.0694,
"step": 1060
},
{
"epoch": 0.22877913192217234,
"grad_norm": 0.11651341617107391,
"learning_rate": 9.153538551651808e-06,
"loss": 0.0681,
"step": 1070
},
{
"epoch": 0.23091725465041693,
"grad_norm": 0.14407022297382355,
"learning_rate": 9.133752772926102e-06,
"loss": 0.0717,
"step": 1080
},
{
"epoch": 0.23305537737866153,
"grad_norm": 0.14212338626384735,
"learning_rate": 9.113760318314109e-06,
"loss": 0.0701,
"step": 1090
},
{
"epoch": 0.23519350010690612,
"grad_norm": 0.1330636888742447,
"learning_rate": 9.09356218738172e-06,
"loss": 0.0714,
"step": 1100
},
{
"epoch": 0.23733162283515075,
"grad_norm": 0.19741296768188477,
"learning_rate": 9.073159389978056e-06,
"loss": 0.0704,
"step": 1110
},
{
"epoch": 0.23946974556339534,
"grad_norm": 0.1623307764530182,
"learning_rate": 9.052552946184985e-06,
"loss": 0.0682,
"step": 1120
},
{
"epoch": 0.24160786829163994,
"grad_norm": 0.11294026672840118,
"learning_rate": 9.031743886266109e-06,
"loss": 0.0686,
"step": 1130
},
{
"epoch": 0.24374599101988453,
"grad_norm": 0.2032402753829956,
"learning_rate": 9.010733250615264e-06,
"loss": 0.0685,
"step": 1140
},
{
"epoch": 0.24588411374812913,
"grad_norm": 0.14151214063167572,
"learning_rate": 8.989522089704502e-06,
"loss": 0.0665,
"step": 1150
},
{
"epoch": 0.24802223647637375,
"grad_norm": 0.16347168385982513,
"learning_rate": 8.96811146403156e-06,
"loss": 0.0683,
"step": 1160
},
{
"epoch": 0.2501603592046183,
"grad_norm": 0.2190970480442047,
"learning_rate": 8.946502444066854e-06,
"loss": 0.0702,
"step": 1170
},
{
"epoch": 0.252298481932863,
"grad_norm": 0.1208793967962265,
"learning_rate": 8.924696110199944e-06,
"loss": 0.0687,
"step": 1180
},
{
"epoch": 0.25443660466110757,
"grad_norm": 0.16621464490890503,
"learning_rate": 8.902693552685532e-06,
"loss": 0.0705,
"step": 1190
},
{
"epoch": 0.25657472738935216,
"grad_norm": 0.468980997800827,
"learning_rate": 8.880495871588934e-06,
"loss": 0.0712,
"step": 1200
},
{
"epoch": 0.25657472738935216,
"eval_loss": 0.0711250901222229,
"eval_runtime": 472.4294,
"eval_samples_per_second": 4.898,
"eval_steps_per_second": 0.307,
"step": 1200
},
{
"epoch": 0.25871285011759676,
"grad_norm": 0.145741268992424,
"learning_rate": 8.858104176731102e-06,
"loss": 0.0663,
"step": 1210
},
{
"epoch": 0.26085097284584136,
"grad_norm": 0.26027005910873413,
"learning_rate": 8.835519587633116e-06,
"loss": 0.0683,
"step": 1220
},
{
"epoch": 0.26298909557408595,
"grad_norm": 0.1468413919210434,
"learning_rate": 8.812743233460224e-06,
"loss": 0.07,
"step": 1230
},
{
"epoch": 0.26512721830233055,
"grad_norm": 0.12431179732084274,
"learning_rate": 8.789776252965378e-06,
"loss": 0.0712,
"step": 1240
},
{
"epoch": 0.26726534103057514,
"grad_norm": 0.10897620022296906,
"learning_rate": 8.76661979443231e-06,
"loss": 0.0706,
"step": 1250
},
{
"epoch": 0.26940346375881974,
"grad_norm": 0.4356347620487213,
"learning_rate": 8.74327501561811e-06,
"loss": 0.071,
"step": 1260
},
{
"epoch": 0.27154158648706433,
"grad_norm": 0.13456492125988007,
"learning_rate": 8.71974308369535e-06,
"loss": 0.075,
"step": 1270
},
{
"epoch": 0.273679709215309,
"grad_norm": 0.15002375841140747,
"learning_rate": 8.696025175193725e-06,
"loss": 0.0683,
"step": 1280
},
{
"epoch": 0.2758178319435536,
"grad_norm": 0.25080016255378723,
"learning_rate": 8.672122475941228e-06,
"loss": 0.0669,
"step": 1290
},
{
"epoch": 0.2779559546717982,
"grad_norm": 0.10547586530447006,
"learning_rate": 8.648036181004867e-06,
"loss": 0.0681,
"step": 1300
},
{
"epoch": 0.2800940774000428,
"grad_norm": 0.3222697079181671,
"learning_rate": 8.62376749463091e-06,
"loss": 0.0689,
"step": 1310
},
{
"epoch": 0.28223220012828737,
"grad_norm": 0.16391637921333313,
"learning_rate": 8.59931763018468e-06,
"loss": 0.0658,
"step": 1320
},
{
"epoch": 0.28437032285653197,
"grad_norm": 0.11527778953313828,
"learning_rate": 8.574687810089887e-06,
"loss": 0.0682,
"step": 1330
},
{
"epoch": 0.28650844558477656,
"grad_norm": 0.21346218883991241,
"learning_rate": 8.549879265767514e-06,
"loss": 0.0695,
"step": 1340
},
{
"epoch": 0.28864656831302116,
"grad_norm": 0.2827841341495514,
"learning_rate": 8.524893237574244e-06,
"loss": 0.0683,
"step": 1350
},
{
"epoch": 0.29078469104126575,
"grad_norm": 0.21771486103534698,
"learning_rate": 8.499730974740452e-06,
"loss": 0.0679,
"step": 1360
},
{
"epoch": 0.29292281376951035,
"grad_norm": 0.10718706995248795,
"learning_rate": 8.47439373530774e-06,
"loss": 0.0696,
"step": 1370
},
{
"epoch": 0.29506093649775494,
"grad_norm": 0.1284645050764084,
"learning_rate": 8.44888278606605e-06,
"loss": 0.069,
"step": 1380
},
{
"epoch": 0.2971990592259996,
"grad_norm": 0.09697470813989639,
"learning_rate": 8.423199402490314e-06,
"loss": 0.067,
"step": 1390
},
{
"epoch": 0.2993371819542442,
"grad_norm": 0.16259442269802094,
"learning_rate": 8.39734486867669e-06,
"loss": 0.0676,
"step": 1400
},
{
"epoch": 0.3014753046824888,
"grad_norm": 0.12434723228216171,
"learning_rate": 8.371320477278363e-06,
"loss": 0.0682,
"step": 1410
},
{
"epoch": 0.3036134274107334,
"grad_norm": 0.11663969606161118,
"learning_rate": 8.345127529440921e-06,
"loss": 0.0685,
"step": 1420
},
{
"epoch": 0.305751550138978,
"grad_norm": 0.15766265988349915,
"learning_rate": 8.318767334737286e-06,
"loss": 0.067,
"step": 1430
},
{
"epoch": 0.3078896728672226,
"grad_norm": 0.1343747079372406,
"learning_rate": 8.292241211102246e-06,
"loss": 0.0685,
"step": 1440
},
{
"epoch": 0.31002779559546717,
"grad_norm": 0.801880419254303,
"learning_rate": 8.265550484766574e-06,
"loss": 0.0721,
"step": 1450
},
{
"epoch": 0.31216591832371177,
"grad_norm": 0.40033191442489624,
"learning_rate": 8.238696490190701e-06,
"loss": 0.0668,
"step": 1460
},
{
"epoch": 0.31430404105195636,
"grad_norm": 0.10282248258590698,
"learning_rate": 8.211680569998011e-06,
"loss": 0.0699,
"step": 1470
},
{
"epoch": 0.31644216378020096,
"grad_norm": 0.128205344080925,
"learning_rate": 8.184504074907706e-06,
"loss": 0.0666,
"step": 1480
},
{
"epoch": 0.3185802865084456,
"grad_norm": 0.2041744887828827,
"learning_rate": 8.157168363667278e-06,
"loss": 0.0652,
"step": 1490
},
{
"epoch": 0.3207184092366902,
"grad_norm": 0.13964834809303284,
"learning_rate": 8.129674802984573e-06,
"loss": 0.0676,
"step": 1500
},
{
"epoch": 0.3228565319649348,
"grad_norm": 0.2170010209083557,
"learning_rate": 8.102024767459457e-06,
"loss": 0.0663,
"step": 1510
},
{
"epoch": 0.3249946546931794,
"grad_norm": 0.1943911761045456,
"learning_rate": 8.074219639515101e-06,
"loss": 0.0692,
"step": 1520
},
{
"epoch": 0.327132777421424,
"grad_norm": 0.16795021295547485,
"learning_rate": 8.046260809328848e-06,
"loss": 0.0675,
"step": 1530
},
{
"epoch": 0.3292709001496686,
"grad_norm": 0.11824577301740646,
"learning_rate": 8.018149674762723e-06,
"loss": 0.066,
"step": 1540
},
{
"epoch": 0.3314090228779132,
"grad_norm": 0.11008622497320175,
"learning_rate": 7.98988764129353e-06,
"loss": 0.0687,
"step": 1550
},
{
"epoch": 0.3335471456061578,
"grad_norm": 0.1290241926908493,
"learning_rate": 7.961476121942598e-06,
"loss": 0.0655,
"step": 1560
},
{
"epoch": 0.3356852683344024,
"grad_norm": 0.10323217511177063,
"learning_rate": 7.932916537205112e-06,
"loss": 0.0662,
"step": 1570
},
{
"epoch": 0.33782339106264697,
"grad_norm": 0.14995551109313965,
"learning_rate": 7.904210314979122e-06,
"loss": 0.0687,
"step": 1580
},
{
"epoch": 0.3399615137908916,
"grad_norm": 0.6756893992424011,
"learning_rate": 7.875358890494122e-06,
"loss": 0.0674,
"step": 1590
},
{
"epoch": 0.3420996365191362,
"grad_norm": 0.10807085037231445,
"learning_rate": 7.846363706239312e-06,
"loss": 0.0686,
"step": 1600
},
{
"epoch": 0.3442377592473808,
"grad_norm": 0.1266680657863617,
"learning_rate": 7.817226211891468e-06,
"loss": 0.0684,
"step": 1610
},
{
"epoch": 0.3463758819756254,
"grad_norm": 0.17101961374282837,
"learning_rate": 7.787947864242474e-06,
"loss": 0.0658,
"step": 1620
},
{
"epoch": 0.34851400470387,
"grad_norm": 0.10010931640863419,
"learning_rate": 7.75853012712647e-06,
"loss": 0.0687,
"step": 1630
},
{
"epoch": 0.3506521274321146,
"grad_norm": 0.0984787791967392,
"learning_rate": 7.728974471346678e-06,
"loss": 0.0682,
"step": 1640
},
{
"epoch": 0.3527902501603592,
"grad_norm": 0.15190352499485016,
"learning_rate": 7.699282374601857e-06,
"loss": 0.0665,
"step": 1650
},
{
"epoch": 0.3549283728886038,
"grad_norm": 0.10777094215154648,
"learning_rate": 7.66945532141243e-06,
"loss": 0.0671,
"step": 1660
},
{
"epoch": 0.3570664956168484,
"grad_norm": 0.11840742081403732,
"learning_rate": 7.639494803046261e-06,
"loss": 0.0642,
"step": 1670
},
{
"epoch": 0.359204618345093,
"grad_norm": 0.12402522563934326,
"learning_rate": 7.609402317444086e-06,
"loss": 0.0652,
"step": 1680
},
{
"epoch": 0.36134274107333764,
"grad_norm": 0.1083909198641777,
"learning_rate": 7.579179369144631e-06,
"loss": 0.0654,
"step": 1690
},
{
"epoch": 0.36348086380158223,
"grad_norm": 0.11909038573503494,
"learning_rate": 7.5488274692093874e-06,
"loss": 0.0657,
"step": 1700
},
{
"epoch": 0.36561898652982683,
"grad_norm": 0.12170397490262985,
"learning_rate": 7.518348135147063e-06,
"loss": 0.0677,
"step": 1710
},
{
"epoch": 0.3677571092580714,
"grad_norm": 0.0956568717956543,
"learning_rate": 7.487742890837704e-06,
"loss": 0.0666,
"step": 1720
},
{
"epoch": 0.369895231986316,
"grad_norm": 0.09268027544021606,
"learning_rate": 7.457013266456517e-06,
"loss": 0.065,
"step": 1730
},
{
"epoch": 0.3720333547145606,
"grad_norm": 0.09884276241064072,
"learning_rate": 7.426160798397355e-06,
"loss": 0.0655,
"step": 1740
},
{
"epoch": 0.3741714774428052,
"grad_norm": 0.13707584142684937,
"learning_rate": 7.395187029195906e-06,
"loss": 0.0633,
"step": 1750
},
{
"epoch": 0.3763096001710498,
"grad_norm": 0.18375861644744873,
"learning_rate": 7.364093507452572e-06,
"loss": 0.0666,
"step": 1760
},
{
"epoch": 0.3784477228992944,
"grad_norm": 0.09120248258113861,
"learning_rate": 7.33288178775504e-06,
"loss": 0.0663,
"step": 1770
},
{
"epoch": 0.380585845627539,
"grad_norm": 0.1667109578847885,
"learning_rate": 7.301553430600559e-06,
"loss": 0.0647,
"step": 1780
},
{
"epoch": 0.3827239683557836,
"grad_norm": 0.17563393712043762,
"learning_rate": 7.270110002317921e-06,
"loss": 0.0646,
"step": 1790
},
{
"epoch": 0.38486209108402825,
"grad_norm": 0.16917386651039124,
"learning_rate": 7.238553074989143e-06,
"loss": 0.0654,
"step": 1800
},
{
"epoch": 0.38486209108402825,
"eval_loss": 0.0673459991812706,
"eval_runtime": 472.3199,
"eval_samples_per_second": 4.899,
"eval_steps_per_second": 0.307,
"step": 1800
},
{
"epoch": 0.38700021381227284,
"grad_norm": 0.11774443835020065,
"learning_rate": 7.206884226370875e-06,
"loss": 0.0655,
"step": 1810
},
{
"epoch": 0.38913833654051744,
"grad_norm": 0.119617760181427,
"learning_rate": 7.175105039815515e-06,
"loss": 0.0639,
"step": 1820
},
{
"epoch": 0.39127645926876203,
"grad_norm": 0.12271067500114441,
"learning_rate": 7.143217104192041e-06,
"loss": 0.0682,
"step": 1830
},
{
"epoch": 0.39341458199700663,
"grad_norm": 0.19674961268901825,
"learning_rate": 7.111222013806573e-06,
"loss": 0.0628,
"step": 1840
},
{
"epoch": 0.3955527047252512,
"grad_norm": 0.23792926967144012,
"learning_rate": 7.07912136832267e-06,
"loss": 0.0654,
"step": 1850
},
{
"epoch": 0.3976908274534958,
"grad_norm": 0.1446692794561386,
"learning_rate": 7.0469167726813445e-06,
"loss": 0.067,
"step": 1860
},
{
"epoch": 0.3998289501817404,
"grad_norm": 0.10155676305294037,
"learning_rate": 7.014609837020817e-06,
"loss": 0.0654,
"step": 1870
},
{
"epoch": 0.401967072909985,
"grad_norm": 0.11214682459831238,
"learning_rate": 6.9822021765960225e-06,
"loss": 0.065,
"step": 1880
},
{
"epoch": 0.4041051956382296,
"grad_norm": 0.09929853677749634,
"learning_rate": 6.949695411697848e-06,
"loss": 0.0656,
"step": 1890
},
{
"epoch": 0.40624331836647426,
"grad_norm": 0.10509800910949707,
"learning_rate": 6.9170911675721175e-06,
"loss": 0.0668,
"step": 1900
},
{
"epoch": 0.40838144109471886,
"grad_norm": 0.11406348645687103,
"learning_rate": 6.884391074338348e-06,
"loss": 0.0651,
"step": 1910
},
{
"epoch": 0.41051956382296345,
"grad_norm": 0.13327282667160034,
"learning_rate": 6.851596766908229e-06,
"loss": 0.0681,
"step": 1920
},
{
"epoch": 0.41265768655120805,
"grad_norm": 0.26162999868392944,
"learning_rate": 6.818709884903897e-06,
"loss": 0.0638,
"step": 1930
},
{
"epoch": 0.41479580927945264,
"grad_norm": 0.10129717737436295,
"learning_rate": 6.785732072575958e-06,
"loss": 0.0648,
"step": 1940
},
{
"epoch": 0.41693393200769724,
"grad_norm": 0.1475946605205536,
"learning_rate": 6.752664978721269e-06,
"loss": 0.0643,
"step": 1950
},
{
"epoch": 0.41907205473594183,
"grad_norm": 0.14560334384441376,
"learning_rate": 6.719510256600512e-06,
"loss": 0.0657,
"step": 1960
},
{
"epoch": 0.42121017746418643,
"grad_norm": 0.14020417630672455,
"learning_rate": 6.686269563855534e-06,
"loss": 0.0658,
"step": 1970
},
{
"epoch": 0.423348300192431,
"grad_norm": 0.1445358246564865,
"learning_rate": 6.652944562426469e-06,
"loss": 0.0654,
"step": 1980
},
{
"epoch": 0.4254864229206756,
"grad_norm": 0.1675933301448822,
"learning_rate": 6.619536918468643e-06,
"loss": 0.0621,
"step": 1990
},
{
"epoch": 0.4276245456489203,
"grad_norm": 0.12988397479057312,
"learning_rate": 6.586048302269277e-06,
"loss": 0.0637,
"step": 2000
},
{
"epoch": 0.42976266837716487,
"grad_norm": 0.10703526437282562,
"learning_rate": 6.5524803881639694e-06,
"loss": 0.0639,
"step": 2010
},
{
"epoch": 0.43190079110540947,
"grad_norm": 0.10743958503007889,
"learning_rate": 6.518834854452993e-06,
"loss": 0.0647,
"step": 2020
},
{
"epoch": 0.43403891383365406,
"grad_norm": 0.09658580273389816,
"learning_rate": 6.485113383317378e-06,
"loss": 0.0616,
"step": 2030
},
{
"epoch": 0.43617703656189866,
"grad_norm": 0.12495086342096329,
"learning_rate": 6.451317660734812e-06,
"loss": 0.0657,
"step": 2040
},
{
"epoch": 0.43831515929014325,
"grad_norm": 0.14097630977630615,
"learning_rate": 6.417449376395339e-06,
"loss": 0.0651,
"step": 2050
},
{
"epoch": 0.44045328201838785,
"grad_norm": 0.09859970957040787,
"learning_rate": 6.3835102236168885e-06,
"loss": 0.0634,
"step": 2060
},
{
"epoch": 0.44259140474663244,
"grad_norm": 0.27701902389526367,
"learning_rate": 6.34950189926061e-06,
"loss": 0.0653,
"step": 2070
},
{
"epoch": 0.44472952747487704,
"grad_norm": 0.10874440521001816,
"learning_rate": 6.315426103646036e-06,
"loss": 0.0654,
"step": 2080
},
{
"epoch": 0.44686765020312164,
"grad_norm": 0.14138060808181763,
"learning_rate": 6.281284540466067e-06,
"loss": 0.0645,
"step": 2090
},
{
"epoch": 0.4490057729313663,
"grad_norm": 0.09226495772600174,
"learning_rate": 6.247078916701797e-06,
"loss": 0.0635,
"step": 2100
},
{
"epoch": 0.4511438956596109,
"grad_norm": 0.10510533303022385,
"learning_rate": 6.212810942537167e-06,
"loss": 0.0609,
"step": 2110
},
{
"epoch": 0.4532820183878555,
"grad_norm": 0.13226036727428436,
"learning_rate": 6.178482331273462e-06,
"loss": 0.0631,
"step": 2120
},
{
"epoch": 0.4554201411161001,
"grad_norm": 0.09952764213085175,
"learning_rate": 6.144094799243647e-06,
"loss": 0.0664,
"step": 2130
},
{
"epoch": 0.45755826384434467,
"grad_norm": 0.25290119647979736,
"learning_rate": 6.1096500657265575e-06,
"loss": 0.0638,
"step": 2140
},
{
"epoch": 0.45969638657258927,
"grad_norm": 0.25737249851226807,
"learning_rate": 6.075149852860945e-06,
"loss": 0.0636,
"step": 2150
},
{
"epoch": 0.46183450930083386,
"grad_norm": 0.10331868380308151,
"learning_rate": 6.040595885559366e-06,
"loss": 0.0646,
"step": 2160
},
{
"epoch": 0.46397263202907846,
"grad_norm": 0.10511161386966705,
"learning_rate": 6.005989891421948e-06,
"loss": 0.0662,
"step": 2170
},
{
"epoch": 0.46611075475732305,
"grad_norm": 0.1610974222421646,
"learning_rate": 5.971333600650012e-06,
"loss": 0.0621,
"step": 2180
},
{
"epoch": 0.46824887748556765,
"grad_norm": 0.13542431592941284,
"learning_rate": 5.936628745959568e-06,
"loss": 0.0648,
"step": 2190
},
{
"epoch": 0.47038700021381225,
"grad_norm": 0.11100970953702927,
"learning_rate": 5.901877062494684e-06,
"loss": 0.0616,
"step": 2200
},
{
"epoch": 0.4725251229420569,
"grad_norm": 0.09847405552864075,
"learning_rate": 5.867080287740735e-06,
"loss": 0.0622,
"step": 2210
},
{
"epoch": 0.4746632456703015,
"grad_norm": 0.11069463193416595,
"learning_rate": 5.832240161437528e-06,
"loss": 0.0658,
"step": 2220
},
{
"epoch": 0.4768013683985461,
"grad_norm": 0.14424288272857666,
"learning_rate": 5.797358425492328e-06,
"loss": 0.0627,
"step": 2230
},
{
"epoch": 0.4789394911267907,
"grad_norm": 0.11425557732582092,
"learning_rate": 5.762436823892763e-06,
"loss": 0.0645,
"step": 2240
},
{
"epoch": 0.4810776138550353,
"grad_norm": 0.12212098389863968,
"learning_rate": 5.727477102619628e-06,
"loss": 0.0661,
"step": 2250
},
{
"epoch": 0.4832157365832799,
"grad_norm": 0.11695980280637741,
"learning_rate": 5.692481009559598e-06,
"loss": 0.0633,
"step": 2260
},
{
"epoch": 0.48535385931152447,
"grad_norm": 0.16033753752708435,
"learning_rate": 5.657450294417831e-06,
"loss": 0.068,
"step": 2270
},
{
"epoch": 0.48749198203976907,
"grad_norm": 0.09469865262508392,
"learning_rate": 5.622386708630488e-06,
"loss": 0.0657,
"step": 2280
},
{
"epoch": 0.48963010476801366,
"grad_norm": 0.10984878987073898,
"learning_rate": 5.587292005277176e-06,
"loss": 0.0617,
"step": 2290
},
{
"epoch": 0.49176822749625826,
"grad_norm": 0.10703036934137344,
"learning_rate": 5.552167938993286e-06,
"loss": 0.0641,
"step": 2300
},
{
"epoch": 0.4939063502245029,
"grad_norm": 0.09129951894283295,
"learning_rate": 5.51701626588227e-06,
"loss": 0.0648,
"step": 2310
},
{
"epoch": 0.4960444729527475,
"grad_norm": 0.14747264981269836,
"learning_rate": 5.481838743427852e-06,
"loss": 0.0617,
"step": 2320
},
{
"epoch": 0.4981825956809921,
"grad_norm": 0.11260967701673508,
"learning_rate": 5.446637130406141e-06,
"loss": 0.0631,
"step": 2330
},
{
"epoch": 0.5003207184092366,
"grad_norm": 0.1024189218878746,
"learning_rate": 5.411413186797709e-06,
"loss": 0.064,
"step": 2340
},
{
"epoch": 0.5024588411374813,
"grad_norm": 0.16150939464569092,
"learning_rate": 5.376168673699596e-06,
"loss": 0.0637,
"step": 2350
},
{
"epoch": 0.504596963865726,
"grad_norm": 0.14528174698352814,
"learning_rate": 5.340905353237254e-06,
"loss": 0.0655,
"step": 2360
},
{
"epoch": 0.5067350865939705,
"grad_norm": 0.12370527535676956,
"learning_rate": 5.305624988476452e-06,
"loss": 0.0635,
"step": 2370
},
{
"epoch": 0.5088732093222151,
"grad_norm": 0.09441283345222473,
"learning_rate": 5.270329343335126e-06,
"loss": 0.0651,
"step": 2380
},
{
"epoch": 0.5110113320504597,
"grad_norm": 0.09483297914266586,
"learning_rate": 5.235020182495188e-06,
"loss": 0.0658,
"step": 2390
},
{
"epoch": 0.5131494547787043,
"grad_norm": 0.11624085903167725,
"learning_rate": 5.199699271314289e-06,
"loss": 0.0675,
"step": 2400
},
{
"epoch": 0.5131494547787043,
"eval_loss": 0.06467495113611221,
"eval_runtime": 471.8273,
"eval_samples_per_second": 4.904,
"eval_steps_per_second": 0.307,
"step": 2400
},
{
"epoch": 0.5152875775069489,
"grad_norm": 0.1344379037618637,
"learning_rate": 5.164368375737576e-06,
"loss": 0.0619,
"step": 2410
},
{
"epoch": 0.5174257002351935,
"grad_norm": 0.09949100762605667,
"learning_rate": 5.129029262209381e-06,
"loss": 0.0617,
"step": 2420
},
{
"epoch": 0.5195638229634381,
"grad_norm": 0.11078672856092453,
"learning_rate": 5.093683697584907e-06,
"loss": 0.0625,
"step": 2430
},
{
"epoch": 0.5217019456916827,
"grad_norm": 0.15946152806282043,
"learning_rate": 5.058333449041899e-06,
"loss": 0.0608,
"step": 2440
},
{
"epoch": 0.5238400684199273,
"grad_norm": 0.09759578853845596,
"learning_rate": 5.022980283992283e-06,
"loss": 0.0604,
"step": 2450
},
{
"epoch": 0.5259781911481719,
"grad_norm": 0.10458780080080032,
"learning_rate": 4.9876259699938e-06,
"loss": 0.063,
"step": 2460
},
{
"epoch": 0.5281163138764166,
"grad_norm": 0.1019633337855339,
"learning_rate": 4.952272274661637e-06,
"loss": 0.0608,
"step": 2470
},
{
"epoch": 0.5302544366046611,
"grad_norm": 0.09208445250988007,
"learning_rate": 4.916920965580052e-06,
"loss": 0.0652,
"step": 2480
},
{
"epoch": 0.5323925593329057,
"grad_norm": 0.11165319383144379,
"learning_rate": 4.881573810213989e-06,
"loss": 0.0615,
"step": 2490
},
{
"epoch": 0.5345306820611503,
"grad_norm": 0.18430490791797638,
"learning_rate": 4.8462325758207304e-06,
"loss": 0.0657,
"step": 2500
},
{
"epoch": 0.5366688047893949,
"grad_norm": 0.157784566283226,
"learning_rate": 4.810899029361515e-06,
"loss": 0.0653,
"step": 2510
},
{
"epoch": 0.5388069275176395,
"grad_norm": 0.13744202256202698,
"learning_rate": 4.775574937413211e-06,
"loss": 0.0618,
"step": 2520
},
{
"epoch": 0.5409450502458841,
"grad_norm": 0.13207334280014038,
"learning_rate": 4.740262066079994e-06,
"loss": 0.0644,
"step": 2530
},
{
"epoch": 0.5430831729741287,
"grad_norm": 0.16908520460128784,
"learning_rate": 4.70496218090503e-06,
"loss": 0.0642,
"step": 2540
},
{
"epoch": 0.5452212957023733,
"grad_norm": 0.13128970563411713,
"learning_rate": 4.669677046782221e-06,
"loss": 0.0652,
"step": 2550
},
{
"epoch": 0.547359418430618,
"grad_norm": 0.08551183342933655,
"learning_rate": 4.6344084278679574e-06,
"loss": 0.065,
"step": 2560
},
{
"epoch": 0.5494975411588625,
"grad_norm": 0.1189018115401268,
"learning_rate": 4.599158087492913e-06,
"loss": 0.0619,
"step": 2570
},
{
"epoch": 0.5516356638871072,
"grad_norm": 0.24343594908714294,
"learning_rate": 4.563927788073893e-06,
"loss": 0.0625,
"step": 2580
},
{
"epoch": 0.5537737866153517,
"grad_norm": 0.30038872361183167,
"learning_rate": 4.528719291025706e-06,
"loss": 0.062,
"step": 2590
},
{
"epoch": 0.5559119093435964,
"grad_norm": 0.08746038377285004,
"learning_rate": 4.493534356673102e-06,
"loss": 0.0638,
"step": 2600
},
{
"epoch": 0.5580500320718409,
"grad_norm": 0.09681444615125656,
"learning_rate": 4.458374744162773e-06,
"loss": 0.0647,
"step": 2610
},
{
"epoch": 0.5601881548000855,
"grad_norm": 0.11077167838811874,
"learning_rate": 4.423242211375381e-06,
"loss": 0.0643,
"step": 2620
},
{
"epoch": 0.5623262775283301,
"grad_norm": 0.08863001316785812,
"learning_rate": 4.388138514837685e-06,
"loss": 0.0627,
"step": 2630
},
{
"epoch": 0.5644644002565747,
"grad_norm": 0.13838346302509308,
"learning_rate": 4.35306540963471e-06,
"loss": 0.0622,
"step": 2640
},
{
"epoch": 0.5666025229848193,
"grad_norm": 0.09143807739019394,
"learning_rate": 4.318024649322001e-06,
"loss": 0.0627,
"step": 2650
},
{
"epoch": 0.5687406457130639,
"grad_norm": 0.19630184769630432,
"learning_rate": 4.283017985837955e-06,
"loss": 0.0626,
"step": 2660
},
{
"epoch": 0.5708787684413086,
"grad_norm": 0.10313283652067184,
"learning_rate": 4.248047169416221e-06,
"loss": 0.062,
"step": 2670
},
{
"epoch": 0.5730168911695531,
"grad_norm": 0.08923624455928802,
"learning_rate": 4.213113948498194e-06,
"loss": 0.0626,
"step": 2680
},
{
"epoch": 0.5751550138977978,
"grad_norm": 0.13680312037467957,
"learning_rate": 4.178220069645608e-06,
"loss": 0.0648,
"step": 2690
},
{
"epoch": 0.5772931366260423,
"grad_norm": 0.164349764585495,
"learning_rate": 4.143367277453197e-06,
"loss": 0.0622,
"step": 2700
},
{
"epoch": 0.579431259354287,
"grad_norm": 0.09581846743822098,
"learning_rate": 4.10855731446149e-06,
"loss": 0.0637,
"step": 2710
},
{
"epoch": 0.5815693820825315,
"grad_norm": 0.1474573314189911,
"learning_rate": 4.073791921069664e-06,
"loss": 0.0611,
"step": 2720
},
{
"epoch": 0.5837075048107762,
"grad_norm": 0.09582812339067459,
"learning_rate": 4.039072835448553e-06,
"loss": 0.0615,
"step": 2730
},
{
"epoch": 0.5858456275390207,
"grad_norm": 0.101468525826931,
"learning_rate": 4.004401793453731e-06,
"loss": 0.061,
"step": 2740
},
{
"epoch": 0.5879837502672653,
"grad_norm": 0.10648108273744583,
"learning_rate": 3.969780528538726e-06,
"loss": 0.0642,
"step": 2750
},
{
"epoch": 0.5901218729955099,
"grad_norm": 0.09285979717969894,
"learning_rate": 3.935210771668357e-06,
"loss": 0.062,
"step": 2760
},
{
"epoch": 0.5922599957237545,
"grad_norm": 0.09097687900066376,
"learning_rate": 3.900694251232182e-06,
"loss": 0.0608,
"step": 2770
},
{
"epoch": 0.5943981184519992,
"grad_norm": 0.12016556411981583,
"learning_rate": 3.8662326929580925e-06,
"loss": 0.0644,
"step": 2780
},
{
"epoch": 0.5965362411802437,
"grad_norm": 0.10592561960220337,
"learning_rate": 3.831827819826027e-06,
"loss": 0.0619,
"step": 2790
},
{
"epoch": 0.5986743639084884,
"grad_norm": 0.09551785886287689,
"learning_rate": 3.7974813519818288e-06,
"loss": 0.0629,
"step": 2800
},
{
"epoch": 0.6008124866367329,
"grad_norm": 0.13584552705287933,
"learning_rate": 3.7631950066512423e-06,
"loss": 0.0652,
"step": 2810
},
{
"epoch": 0.6029506093649776,
"grad_norm": 0.10060502588748932,
"learning_rate": 3.7289704980540586e-06,
"loss": 0.0602,
"step": 2820
},
{
"epoch": 0.6050887320932221,
"grad_norm": 0.10792689025402069,
"learning_rate": 3.694809537318402e-06,
"loss": 0.0635,
"step": 2830
},
{
"epoch": 0.6072268548214668,
"grad_norm": 0.1079002246260643,
"learning_rate": 3.660713832395193e-06,
"loss": 0.0646,
"step": 2840
},
{
"epoch": 0.6093649775497113,
"grad_norm": 0.12554524838924408,
"learning_rate": 3.626685087972743e-06,
"loss": 0.0607,
"step": 2850
},
{
"epoch": 0.611503100277956,
"grad_norm": 0.09405695647001266,
"learning_rate": 3.592725005391524e-06,
"loss": 0.065,
"step": 2860
},
{
"epoch": 0.6136412230062006,
"grad_norm": 0.09625021368265152,
"learning_rate": 3.55883528255912e-06,
"loss": 0.0621,
"step": 2870
},
{
"epoch": 0.6157793457344451,
"grad_norm": 0.1028503030538559,
"learning_rate": 3.525017613865321e-06,
"loss": 0.0628,
"step": 2880
},
{
"epoch": 0.6179174684626898,
"grad_norm": 0.08910300582647324,
"learning_rate": 3.491273690097421e-06,
"loss": 0.0599,
"step": 2890
},
{
"epoch": 0.6200555911909343,
"grad_norm": 0.1026788130402565,
"learning_rate": 3.45760519835567e-06,
"loss": 0.0612,
"step": 2900
},
{
"epoch": 0.622193713919179,
"grad_norm": 0.09384245425462723,
"learning_rate": 3.4240138219689343e-06,
"loss": 0.0625,
"step": 2910
},
{
"epoch": 0.6243318366474235,
"grad_norm": 0.09324868768453598,
"learning_rate": 3.390501240410535e-06,
"loss": 0.0611,
"step": 2920
},
{
"epoch": 0.6264699593756682,
"grad_norm": 0.1346089392900467,
"learning_rate": 3.3570691292142694e-06,
"loss": 0.0644,
"step": 2930
},
{
"epoch": 0.6286080821039127,
"grad_norm": 0.13233442604541779,
"learning_rate": 3.3237191598906536e-06,
"loss": 0.0634,
"step": 2940
},
{
"epoch": 0.6307462048321574,
"grad_norm": 0.11263269186019897,
"learning_rate": 3.2904529998433356e-06,
"loss": 0.0658,
"step": 2950
},
{
"epoch": 0.6328843275604019,
"grad_norm": 0.32706591486930847,
"learning_rate": 3.2572723122857416e-06,
"loss": 0.0656,
"step": 2960
},
{
"epoch": 0.6350224502886466,
"grad_norm": 0.2275686264038086,
"learning_rate": 3.224178756157918e-06,
"loss": 0.0614,
"step": 2970
},
{
"epoch": 0.6371605730168912,
"grad_norm": 0.09637604653835297,
"learning_rate": 3.191173986043583e-06,
"loss": 0.0607,
"step": 2980
},
{
"epoch": 0.6392986957451358,
"grad_norm": 0.10239467024803162,
"learning_rate": 3.1582596520874096e-06,
"loss": 0.0623,
"step": 2990
},
{
"epoch": 0.6414368184733804,
"grad_norm": 0.08615203946828842,
"learning_rate": 3.125437399912521e-06,
"loss": 0.0613,
"step": 3000
},
{
"epoch": 0.6414368184733804,
"eval_loss": 0.06311963498592377,
"eval_runtime": 471.8096,
"eval_samples_per_second": 4.905,
"eval_steps_per_second": 0.307,
"step": 3000
},
{
"epoch": 0.643574941201625,
"grad_norm": 0.08130071312189102,
"learning_rate": 3.0927088705382092e-06,
"loss": 0.0637,
"step": 3010
},
{
"epoch": 0.6457130639298696,
"grad_norm": 0.09418642520904541,
"learning_rate": 3.060075700297896e-06,
"loss": 0.061,
"step": 3020
},
{
"epoch": 0.6478511866581141,
"grad_norm": 0.09334340691566467,
"learning_rate": 3.0275395207573178e-06,
"loss": 0.0598,
"step": 3030
},
{
"epoch": 0.6499893093863588,
"grad_norm": 0.10348668694496155,
"learning_rate": 2.9951019586329467e-06,
"loss": 0.0613,
"step": 3040
},
{
"epoch": 0.6521274321146033,
"grad_norm": 0.09085489809513092,
"learning_rate": 2.962764635710672e-06,
"loss": 0.0619,
"step": 3050
},
{
"epoch": 0.654265554842848,
"grad_norm": 0.09327159821987152,
"learning_rate": 2.930529168764702e-06,
"loss": 0.0635,
"step": 3060
},
{
"epoch": 0.6564036775710925,
"grad_norm": 0.0930228903889656,
"learning_rate": 2.89839716947674e-06,
"loss": 0.0643,
"step": 3070
},
{
"epoch": 0.6585418002993372,
"grad_norm": 0.1704426109790802,
"learning_rate": 2.8663702443553967e-06,
"loss": 0.0633,
"step": 3080
},
{
"epoch": 0.6606799230275818,
"grad_norm": 0.09673333913087845,
"learning_rate": 2.8344499946558714e-06,
"loss": 0.0606,
"step": 3090
},
{
"epoch": 0.6628180457558264,
"grad_norm": 0.07926999032497406,
"learning_rate": 2.8026380162999055e-06,
"loss": 0.0614,
"step": 3100
},
{
"epoch": 0.664956168484071,
"grad_norm": 0.09460759162902832,
"learning_rate": 2.7709358997959724e-06,
"loss": 0.0622,
"step": 3110
},
{
"epoch": 0.6670942912123156,
"grad_norm": 0.09549721330404282,
"learning_rate": 2.7393452301597645e-06,
"loss": 0.0618,
"step": 3120
},
{
"epoch": 0.6692324139405602,
"grad_norm": 0.09777417033910751,
"learning_rate": 2.7078675868349546e-06,
"loss": 0.0602,
"step": 3130
},
{
"epoch": 0.6713705366688048,
"grad_norm": 0.08188968896865845,
"learning_rate": 2.676504543614214e-06,
"loss": 0.0624,
"step": 3140
},
{
"epoch": 0.6735086593970494,
"grad_norm": 0.10578031837940216,
"learning_rate": 2.6452576685605385e-06,
"loss": 0.0608,
"step": 3150
},
{
"epoch": 0.6756467821252939,
"grad_norm": 0.09571292251348495,
"learning_rate": 2.614128523928848e-06,
"loss": 0.0613,
"step": 3160
},
{
"epoch": 0.6777849048535386,
"grad_norm": 0.09241370856761932,
"learning_rate": 2.583118666087869e-06,
"loss": 0.0615,
"step": 3170
},
{
"epoch": 0.6799230275817832,
"grad_norm": 0.09565988928079605,
"learning_rate": 2.552229645442337e-06,
"loss": 0.0605,
"step": 3180
},
{
"epoch": 0.6820611503100278,
"grad_norm": 0.1339564025402069,
"learning_rate": 2.5214630063554597e-06,
"loss": 0.0614,
"step": 3190
},
{
"epoch": 0.6841992730382724,
"grad_norm": 0.21678894758224487,
"learning_rate": 2.4908202870717267e-06,
"loss": 0.0631,
"step": 3200
},
{
"epoch": 0.686337395766517,
"grad_norm": 0.1025332510471344,
"learning_rate": 2.4603030196399796e-06,
"loss": 0.0612,
"step": 3210
},
{
"epoch": 0.6884755184947616,
"grad_norm": 0.09821213781833649,
"learning_rate": 2.4299127298368314e-06,
"loss": 0.0606,
"step": 3220
},
{
"epoch": 0.6906136412230062,
"grad_norm": 0.11024197936058044,
"learning_rate": 2.399650937090373e-06,
"loss": 0.0618,
"step": 3230
},
{
"epoch": 0.6927517639512508,
"grad_norm": 0.09742552042007446,
"learning_rate": 2.369519154404205e-06,
"loss": 0.0602,
"step": 3240
},
{
"epoch": 0.6948898866794954,
"grad_norm": 0.11659736931324005,
"learning_rate": 2.339518888281795e-06,
"loss": 0.0599,
"step": 3250
},
{
"epoch": 0.69702800940774,
"grad_norm": 0.12253785133361816,
"learning_rate": 2.3096516386511585e-06,
"loss": 0.062,
"step": 3260
},
{
"epoch": 0.6991661321359846,
"grad_norm": 0.10601403564214706,
"learning_rate": 2.279918898789865e-06,
"loss": 0.0603,
"step": 3270
},
{
"epoch": 0.7013042548642292,
"grad_norm": 0.1589890867471695,
"learning_rate": 2.2503221552503777e-06,
"loss": 0.0617,
"step": 3280
},
{
"epoch": 0.7034423775924739,
"grad_norm": 0.10667730122804642,
"learning_rate": 2.2208628877857276e-06,
"loss": 0.0595,
"step": 3290
},
{
"epoch": 0.7055805003207184,
"grad_norm": 0.09270080178976059,
"learning_rate": 2.1915425692755325e-06,
"loss": 0.0613,
"step": 3300
},
{
"epoch": 0.707718623048963,
"grad_norm": 0.08238282054662704,
"learning_rate": 2.162362665652364e-06,
"loss": 0.0593,
"step": 3310
},
{
"epoch": 0.7098567457772076,
"grad_norm": 0.08994623273611069,
"learning_rate": 2.1333246358284394e-06,
"loss": 0.0602,
"step": 3320
},
{
"epoch": 0.7119948685054522,
"grad_norm": 0.08377353101968765,
"learning_rate": 2.1044299316226962e-06,
"loss": 0.0639,
"step": 3330
},
{
"epoch": 0.7141329912336968,
"grad_norm": 0.11232832074165344,
"learning_rate": 2.0756799976881987e-06,
"loss": 0.0633,
"step": 3340
},
{
"epoch": 0.7162711139619414,
"grad_norm": 0.10576164722442627,
"learning_rate": 2.047076271439903e-06,
"loss": 0.0621,
"step": 3350
},
{
"epoch": 0.718409236690186,
"grad_norm": 0.10784109681844711,
"learning_rate": 2.018620182982803e-06,
"loss": 0.0633,
"step": 3360
},
{
"epoch": 0.7205473594184306,
"grad_norm": 0.10578976571559906,
"learning_rate": 1.9903131550404185e-06,
"loss": 0.0619,
"step": 3370
},
{
"epoch": 0.7226854821466753,
"grad_norm": 0.13373176753520966,
"learning_rate": 1.9621566028836717e-06,
"loss": 0.0589,
"step": 3380
},
{
"epoch": 0.7248236048749198,
"grad_norm": 0.09690658748149872,
"learning_rate": 1.9341519342601166e-06,
"loss": 0.0606,
"step": 3390
},
{
"epoch": 0.7269617276031645,
"grad_norm": 0.09626404196023941,
"learning_rate": 1.9063005493235692e-06,
"loss": 0.0597,
"step": 3400
},
{
"epoch": 0.729099850331409,
"grad_norm": 0.08039379119873047,
"learning_rate": 1.8786038405640954e-06,
"loss": 0.0619,
"step": 3410
},
{
"epoch": 0.7312379730596537,
"grad_norm": 0.08523211628198624,
"learning_rate": 1.8510631927383887e-06,
"loss": 0.0601,
"step": 3420
},
{
"epoch": 0.7333760957878982,
"grad_norm": 0.10265989601612091,
"learning_rate": 1.8236799828005402e-06,
"loss": 0.0602,
"step": 3430
},
{
"epoch": 0.7355142185161428,
"grad_norm": 0.0833195149898529,
"learning_rate": 1.796455579833198e-06,
"loss": 0.0613,
"step": 3440
},
{
"epoch": 0.7376523412443874,
"grad_norm": 0.08251874148845673,
"learning_rate": 1.7693913449791094e-06,
"loss": 0.061,
"step": 3450
},
{
"epoch": 0.739790463972632,
"grad_norm": 0.0832567885518074,
"learning_rate": 1.7424886313730765e-06,
"loss": 0.0607,
"step": 3460
},
{
"epoch": 0.7419285867008766,
"grad_norm": 0.09145036339759827,
"learning_rate": 1.7157487840742908e-06,
"loss": 0.0625,
"step": 3470
},
{
"epoch": 0.7440667094291212,
"grad_norm": 0.10999724268913269,
"learning_rate": 1.6891731399990952e-06,
"loss": 0.0618,
"step": 3480
},
{
"epoch": 0.7462048321573659,
"grad_norm": 0.08439410477876663,
"learning_rate": 1.6627630278541406e-06,
"loss": 0.062,
"step": 3490
},
{
"epoch": 0.7483429548856104,
"grad_norm": 0.09440149366855621,
"learning_rate": 1.6365197680699468e-06,
"loss": 0.0635,
"step": 3500
},
{
"epoch": 0.7504810776138551,
"grad_norm": 0.07940148562192917,
"learning_rate": 1.6104446727348944e-06,
"loss": 0.0594,
"step": 3510
},
{
"epoch": 0.7526192003420996,
"grad_norm": 0.174238920211792,
"learning_rate": 1.5845390455296195e-06,
"loss": 0.0602,
"step": 3520
},
{
"epoch": 0.7547573230703443,
"grad_norm": 0.08501884341239929,
"learning_rate": 1.5588041816618288e-06,
"loss": 0.0636,
"step": 3530
},
{
"epoch": 0.7568954457985888,
"grad_norm": 0.10274173319339752,
"learning_rate": 1.533241367801554e-06,
"loss": 0.0596,
"step": 3540
},
{
"epoch": 0.7590335685268335,
"grad_norm": 0.11729396134614944,
"learning_rate": 1.5078518820168097e-06,
"loss": 0.0587,
"step": 3550
},
{
"epoch": 0.761171691255078,
"grad_norm": 0.11430974304676056,
"learning_rate": 1.482636993709703e-06,
"loss": 0.0603,
"step": 3560
},
{
"epoch": 0.7633098139833226,
"grad_norm": 0.08141325414180756,
"learning_rate": 1.4575979635529653e-06,
"loss": 0.061,
"step": 3570
},
{
"epoch": 0.7654479367115672,
"grad_norm": 0.13267385959625244,
"learning_rate": 1.4327360434269138e-06,
"loss": 0.0621,
"step": 3580
},
{
"epoch": 0.7675860594398118,
"grad_norm": 0.18512435257434845,
"learning_rate": 1.4080524763568754e-06,
"loss": 0.0599,
"step": 3590
},
{
"epoch": 0.7697241821680565,
"grad_norm": 0.10558240115642548,
"learning_rate": 1.383548496451026e-06,
"loss": 0.0604,
"step": 3600
},
{
"epoch": 0.7697241821680565,
"eval_loss": 0.06212155520915985,
"eval_runtime": 472.0918,
"eval_samples_per_second": 4.902,
"eval_steps_per_second": 0.307,
"step": 3600
},
{
"epoch": 0.771862304896301,
"grad_norm": 0.08125459402799606,
"learning_rate": 1.3592253288386937e-06,
"loss": 0.0569,
"step": 3610
},
{
"epoch": 0.7740004276245457,
"grad_norm": 0.10237967222929001,
"learning_rate": 1.33508418960911e-06,
"loss": 0.0634,
"step": 3620
},
{
"epoch": 0.7761385503527902,
"grad_norm": 0.13348835706710815,
"learning_rate": 1.3111262857506018e-06,
"loss": 0.0622,
"step": 3630
},
{
"epoch": 0.7782766730810349,
"grad_norm": 0.0952489823102951,
"learning_rate": 1.287352815090251e-06,
"loss": 0.0624,
"step": 3640
},
{
"epoch": 0.7804147958092794,
"grad_norm": 0.08191373199224472,
"learning_rate": 1.263764966234e-06,
"loss": 0.0608,
"step": 3650
},
{
"epoch": 0.7825529185375241,
"grad_norm": 0.1063610091805458,
"learning_rate": 1.2403639185072298e-06,
"loss": 0.0606,
"step": 3660
},
{
"epoch": 0.7846910412657686,
"grad_norm": 0.10587465018033981,
"learning_rate": 1.2171508418958005e-06,
"loss": 0.061,
"step": 3670
},
{
"epoch": 0.7868291639940133,
"grad_norm": 0.08978404104709625,
"learning_rate": 1.194126896987543e-06,
"loss": 0.0604,
"step": 3680
},
{
"epoch": 0.7889672867222579,
"grad_norm": 0.08431018143892288,
"learning_rate": 1.1712932349142481e-06,
"loss": 0.0587,
"step": 3690
},
{
"epoch": 0.7911054094505025,
"grad_norm": 0.09140188992023468,
"learning_rate": 1.1486509972941029e-06,
"loss": 0.059,
"step": 3700
},
{
"epoch": 0.7932435321787471,
"grad_norm": 0.21178825199604034,
"learning_rate": 1.1262013161746144e-06,
"loss": 0.0589,
"step": 3710
},
{
"epoch": 0.7953816549069916,
"grad_norm": 0.09437291324138641,
"learning_rate": 1.1039453139760154e-06,
"loss": 0.059,
"step": 3720
},
{
"epoch": 0.7975197776352363,
"grad_norm": 0.08276817947626114,
"learning_rate": 1.081884103435139e-06,
"loss": 0.062,
"step": 3730
},
{
"epoch": 0.7996579003634808,
"grad_norm": 0.08296237885951996,
"learning_rate": 1.060018787549793e-06,
"loss": 0.0595,
"step": 3740
},
{
"epoch": 0.8017960230917255,
"grad_norm": 0.09400928020477295,
"learning_rate": 1.03835045952361e-06,
"loss": 0.0586,
"step": 3750
},
{
"epoch": 0.80393414581997,
"grad_norm": 0.09929320216178894,
"learning_rate": 1.016880202711384e-06,
"loss": 0.0603,
"step": 3760
},
{
"epoch": 0.8060722685482147,
"grad_norm": 0.1045432910323143,
"learning_rate": 9.956090905649184e-07,
"loss": 0.0591,
"step": 3770
},
{
"epoch": 0.8082103912764592,
"grad_norm": 0.08326222002506256,
"learning_rate": 9.74538186579345e-07,
"loss": 0.0596,
"step": 3780
},
{
"epoch": 0.8103485140047039,
"grad_norm": 0.08769119530916214,
"learning_rate": 9.536685442399568e-07,
"loss": 0.0594,
"step": 3790
},
{
"epoch": 0.8124866367329485,
"grad_norm": 0.09135784208774567,
"learning_rate": 9.330012069695387e-07,
"loss": 0.059,
"step": 3800
},
{
"epoch": 0.8146247594611931,
"grad_norm": 0.09643464535474777,
"learning_rate": 9.125372080761985e-07,
"loss": 0.0584,
"step": 3810
},
{
"epoch": 0.8167628821894377,
"grad_norm": 0.08774056285619736,
"learning_rate": 8.922775707016973e-07,
"loss": 0.0617,
"step": 3820
},
{
"epoch": 0.8189010049176823,
"grad_norm": 0.1310407817363739,
"learning_rate": 8.722233077703096e-07,
"loss": 0.0618,
"step": 3830
},
{
"epoch": 0.8210391276459269,
"grad_norm": 0.09406092017889023,
"learning_rate": 8.523754219381631e-07,
"loss": 0.0581,
"step": 3840
},
{
"epoch": 0.8231772503741714,
"grad_norm": 0.111025370657444,
"learning_rate": 8.327349055431233e-07,
"loss": 0.061,
"step": 3850
},
{
"epoch": 0.8253153731024161,
"grad_norm": 0.08444110304117203,
"learning_rate": 8.13302740555173e-07,
"loss": 0.0613,
"step": 3860
},
{
"epoch": 0.8274534958306606,
"grad_norm": 0.16973550617694855,
"learning_rate": 7.940798985273124e-07,
"loss": 0.0622,
"step": 3870
},
{
"epoch": 0.8295916185589053,
"grad_norm": 0.14076325297355652,
"learning_rate": 7.750673405469949e-07,
"loss": 0.0622,
"step": 3880
},
{
"epoch": 0.8317297412871499,
"grad_norm": 0.12866050004959106,
"learning_rate": 7.562660171880632e-07,
"loss": 0.0623,
"step": 3890
},
{
"epoch": 0.8338678640153945,
"grad_norm": 0.08956257998943329,
"learning_rate": 7.376768684632357e-07,
"loss": 0.0589,
"step": 3900
},
{
"epoch": 0.8360059867436391,
"grad_norm": 0.09152022004127502,
"learning_rate": 7.193008237770971e-07,
"loss": 0.0615,
"step": 3910
},
{
"epoch": 0.8381441094718837,
"grad_norm": 0.0782172754406929,
"learning_rate": 7.011388018796389e-07,
"loss": 0.0611,
"step": 3920
},
{
"epoch": 0.8402822322001283,
"grad_norm": 0.0877286046743393,
"learning_rate": 6.831917108203217e-07,
"loss": 0.0597,
"step": 3930
},
{
"epoch": 0.8424203549283729,
"grad_norm": 0.08773230016231537,
"learning_rate": 6.654604479026728e-07,
"loss": 0.0601,
"step": 3940
},
{
"epoch": 0.8445584776566175,
"grad_norm": 0.08106860518455505,
"learning_rate": 6.479458996394294e-07,
"loss": 0.0633,
"step": 3950
},
{
"epoch": 0.846696600384862,
"grad_norm": 0.09297432750463486,
"learning_rate": 6.306489417082096e-07,
"loss": 0.0621,
"step": 3960
},
{
"epoch": 0.8488347231131067,
"grad_norm": 0.09097818285226822,
"learning_rate": 6.135704389077335e-07,
"loss": 0.0609,
"step": 3970
},
{
"epoch": 0.8509728458413512,
"grad_norm": 0.10589548945426941,
"learning_rate": 5.967112451145868e-07,
"loss": 0.0605,
"step": 3980
},
{
"epoch": 0.8531109685695959,
"grad_norm": 0.08168578892946243,
"learning_rate": 5.800722032405304e-07,
"loss": 0.0591,
"step": 3990
},
{
"epoch": 0.8552490912978405,
"grad_norm": 0.09122731536626816,
"learning_rate": 5.636541451903494e-07,
"loss": 0.0586,
"step": 4000
},
{
"epoch": 0.8573872140260851,
"grad_norm": 0.10044734925031662,
"learning_rate": 5.474578918202717e-07,
"loss": 0.0608,
"step": 4010
},
{
"epoch": 0.8595253367543297,
"grad_norm": 0.07848547399044037,
"learning_rate": 5.314842528969177e-07,
"loss": 0.0609,
"step": 4020
},
{
"epoch": 0.8616634594825743,
"grad_norm": 0.09690708667039871,
"learning_rate": 5.157340270568212e-07,
"loss": 0.0629,
"step": 4030
},
{
"epoch": 0.8638015822108189,
"grad_norm": 0.09269597381353378,
"learning_rate": 5.002080017664973e-07,
"loss": 0.0587,
"step": 4040
},
{
"epoch": 0.8659397049390635,
"grad_norm": 0.08430242538452148,
"learning_rate": 4.849069532830669e-07,
"loss": 0.0616,
"step": 4050
},
{
"epoch": 0.8680778276673081,
"grad_norm": 0.09234491735696793,
"learning_rate": 4.698316466154551e-07,
"loss": 0.0613,
"step": 4060
},
{
"epoch": 0.8702159503955527,
"grad_norm": 0.11234113574028015,
"learning_rate": 4.549828354861341e-07,
"loss": 0.06,
"step": 4070
},
{
"epoch": 0.8723540731237973,
"grad_norm": 0.08569945394992828,
"learning_rate": 4.4036126229344613e-07,
"loss": 0.0592,
"step": 4080
},
{
"epoch": 0.8744921958520419,
"grad_norm": 0.085124172270298,
"learning_rate": 4.2596765807448037e-07,
"loss": 0.0599,
"step": 4090
},
{
"epoch": 0.8766303185802865,
"grad_norm": 0.08620309084653854,
"learning_rate": 4.1180274246852724e-07,
"loss": 0.0644,
"step": 4100
},
{
"epoch": 0.8787684413085312,
"grad_norm": 0.08025231957435608,
"learning_rate": 3.97867223681096e-07,
"loss": 0.0607,
"step": 4110
},
{
"epoch": 0.8809065640367757,
"grad_norm": 0.08026058226823807,
"learning_rate": 3.841617984485069e-07,
"loss": 0.0585,
"step": 4120
},
{
"epoch": 0.8830446867650203,
"grad_norm": 0.07990364730358124,
"learning_rate": 3.706871520030553e-07,
"loss": 0.0622,
"step": 4130
},
{
"epoch": 0.8851828094932649,
"grad_norm": 0.11747777462005615,
"learning_rate": 3.574439580387562e-07,
"loss": 0.0624,
"step": 4140
},
{
"epoch": 0.8873209322215095,
"grad_norm": 0.08300217986106873,
"learning_rate": 3.444328786776557e-07,
"loss": 0.0617,
"step": 4150
},
{
"epoch": 0.8894590549497541,
"grad_norm": 0.11143102496862411,
"learning_rate": 3.3165456443673307e-07,
"loss": 0.061,
"step": 4160
},
{
"epoch": 0.8915971776779987,
"grad_norm": 0.0875178873538971,
"learning_rate": 3.1910965419537087e-07,
"loss": 0.062,
"step": 4170
},
{
"epoch": 0.8937353004062433,
"grad_norm": 0.07958797365427017,
"learning_rate": 3.0679877516341386e-07,
"loss": 0.0607,
"step": 4180
},
{
"epoch": 0.8958734231344879,
"grad_norm": 0.07875709980726242,
"learning_rate": 2.947225428498152e-07,
"loss": 0.0602,
"step": 4190
},
{
"epoch": 0.8980115458627326,
"grad_norm": 0.08480419218540192,
"learning_rate": 2.828815610318569e-07,
"loss": 0.0625,
"step": 4200
},
{
"epoch": 0.8980115458627326,
"eval_loss": 0.061606768518686295,
"eval_runtime": 472.6015,
"eval_samples_per_second": 4.896,
"eval_steps_per_second": 0.307,
"step": 4200
},
{
"epoch": 0.9001496685909771,
"grad_norm": 0.08824723958969116,
"learning_rate": 2.7127642172496583e-07,
"loss": 0.0595,
"step": 4210
},
{
"epoch": 0.9022877913192218,
"grad_norm": 0.10775342583656311,
"learning_rate": 2.59907705153114e-07,
"loss": 0.0587,
"step": 4220
},
{
"epoch": 0.9044259140474663,
"grad_norm": 0.08595039695501328,
"learning_rate": 2.487759797198075e-07,
"loss": 0.0603,
"step": 4230
},
{
"epoch": 0.906564036775711,
"grad_norm": 0.09018037468194962,
"learning_rate": 2.3788180197967193e-07,
"loss": 0.061,
"step": 4240
},
{
"epoch": 0.9087021595039555,
"grad_norm": 0.10949289798736572,
"learning_rate": 2.272257166106201e-07,
"loss": 0.059,
"step": 4250
},
{
"epoch": 0.9108402822322001,
"grad_norm": 0.0874534547328949,
"learning_rate": 2.1680825638662527e-07,
"loss": 0.0607,
"step": 4260
},
{
"epoch": 0.9129784049604447,
"grad_norm": 0.09561596810817719,
"learning_rate": 2.06629942151082e-07,
"loss": 0.0605,
"step": 4270
},
{
"epoch": 0.9151165276886893,
"grad_norm": 0.09017440676689148,
"learning_rate": 1.9669128279076522e-07,
"loss": 0.0603,
"step": 4280
},
{
"epoch": 0.9172546504169339,
"grad_norm": 0.08165230602025986,
"learning_rate": 1.8699277521038672e-07,
"loss": 0.0607,
"step": 4290
},
{
"epoch": 0.9193927731451785,
"grad_norm": 0.09331604838371277,
"learning_rate": 1.7753490430775288e-07,
"loss": 0.0597,
"step": 4300
},
{
"epoch": 0.9215308958734232,
"grad_norm": 0.08846830576658249,
"learning_rate": 1.6831814294951843e-07,
"loss": 0.0612,
"step": 4310
},
{
"epoch": 0.9236690186016677,
"grad_norm": 0.1046655997633934,
"learning_rate": 1.5934295194754924e-07,
"loss": 0.0593,
"step": 4320
},
{
"epoch": 0.9258071413299124,
"grad_norm": 0.0815853402018547,
"learning_rate": 1.5060978003587745e-07,
"loss": 0.0614,
"step": 4330
},
{
"epoch": 0.9279452640581569,
"grad_norm": 0.08241681009531021,
"learning_rate": 1.4211906384827223e-07,
"loss": 0.0614,
"step": 4340
},
{
"epoch": 0.9300833867864016,
"grad_norm": 0.07734426110982895,
"learning_rate": 1.3387122789640163e-07,
"loss": 0.0597,
"step": 4350
},
{
"epoch": 0.9322215095146461,
"grad_norm": 0.09207015484571457,
"learning_rate": 1.2586668454861505e-07,
"loss": 0.0612,
"step": 4360
},
{
"epoch": 0.9343596322428908,
"grad_norm": 0.09312504529953003,
"learning_rate": 1.181058340093233e-07,
"loss": 0.0602,
"step": 4370
},
{
"epoch": 0.9364977549711353,
"grad_norm": 0.08721912652254105,
"learning_rate": 1.1058906429898764e-07,
"loss": 0.0627,
"step": 4380
},
{
"epoch": 0.93863587769938,
"grad_norm": 0.08252057433128357,
"learning_rate": 1.033167512347244e-07,
"loss": 0.0591,
"step": 4390
},
{
"epoch": 0.9407740004276245,
"grad_norm": 0.07943445444107056,
"learning_rate": 9.62892584115116e-08,
"loss": 0.0586,
"step": 4400
},
{
"epoch": 0.9429121231558691,
"grad_norm": 0.1239377111196518,
"learning_rate": 8.950693718401016e-08,
"loss": 0.0596,
"step": 4410
},
{
"epoch": 0.9450502458841138,
"grad_norm": 0.08284857869148254,
"learning_rate": 8.297012664900017e-08,
"loss": 0.0581,
"step": 4420
},
{
"epoch": 0.9471883686123583,
"grad_norm": 0.11521401256322861,
"learning_rate": 7.667915362842337e-08,
"loss": 0.0619,
"step": 4430
},
{
"epoch": 0.949326491340603,
"grad_norm": 0.0899544283747673,
"learning_rate": 7.063433265304509e-08,
"loss": 0.0645,
"step": 4440
},
{
"epoch": 0.9514646140688475,
"grad_norm": 0.07524080574512482,
"learning_rate": 6.483596594672959e-08,
"loss": 0.0597,
"step": 4450
},
{
"epoch": 0.9536027367970922,
"grad_norm": 0.10981776565313339,
"learning_rate": 5.928434341132605e-08,
"loss": 0.0582,
"step": 4460
},
{
"epoch": 0.9557408595253367,
"grad_norm": 0.09796682000160217,
"learning_rate": 5.397974261217909e-08,
"loss": 0.0568,
"step": 4470
},
{
"epoch": 0.9578789822535814,
"grad_norm": 0.0810341015458107,
"learning_rate": 4.892242876424702e-08,
"loss": 0.0612,
"step": 4480
},
{
"epoch": 0.9600171049818259,
"grad_norm": 0.08066795766353607,
"learning_rate": 4.411265471884363e-08,
"loss": 0.0629,
"step": 4490
},
{
"epoch": 0.9621552277100706,
"grad_norm": 0.08418738096952438,
"learning_rate": 3.955066095099769e-08,
"loss": 0.0584,
"step": 4500
},
{
"epoch": 0.9642933504383152,
"grad_norm": 0.07691395282745361,
"learning_rate": 3.523667554742704e-08,
"loss": 0.06,
"step": 4510
},
{
"epoch": 0.9664314731665598,
"grad_norm": 0.08319604396820068,
"learning_rate": 3.117091419513829e-08,
"loss": 0.0597,
"step": 4520
},
{
"epoch": 0.9685695958948044,
"grad_norm": 0.09783016890287399,
"learning_rate": 2.7353580170638714e-08,
"loss": 0.0605,
"step": 4530
},
{
"epoch": 0.9707077186230489,
"grad_norm": 0.10302453488111496,
"learning_rate": 2.3784864329777224e-08,
"loss": 0.0594,
"step": 4540
},
{
"epoch": 0.9728458413512936,
"grad_norm": 0.0973035991191864,
"learning_rate": 2.0464945098200296e-08,
"loss": 0.0634,
"step": 4550
},
{
"epoch": 0.9749839640795381,
"grad_norm": 0.07076684385538101,
"learning_rate": 1.739398846242968e-08,
"loss": 0.0618,
"step": 4560
},
{
"epoch": 0.9771220868077828,
"grad_norm": 0.1335037350654602,
"learning_rate": 1.4572147961567917e-08,
"loss": 0.062,
"step": 4570
},
{
"epoch": 0.9792602095360273,
"grad_norm": 0.08357635885477066,
"learning_rate": 1.1999564679616715e-08,
"loss": 0.0618,
"step": 4580
},
{
"epoch": 0.981398332264272,
"grad_norm": 0.08964463323354721,
"learning_rate": 9.67636723842591e-09,
"loss": 0.0604,
"step": 4590
},
{
"epoch": 0.9835364549925165,
"grad_norm": 0.08757588267326355,
"learning_rate": 7.602671791263616e-09,
"loss": 0.0619,
"step": 4600
},
{
"epoch": 0.9856745777207612,
"grad_norm": 0.08129626512527466,
"learning_rate": 5.778582017005874e-09,
"loss": 0.0591,
"step": 4610
},
{
"epoch": 0.9878127004490058,
"grad_norm": 0.08929581940174103,
"learning_rate": 4.204189114955793e-09,
"loss": 0.0581,
"step": 4620
},
{
"epoch": 0.9899508231772504,
"grad_norm": 0.09756463766098022,
"learning_rate": 2.8795718002821993e-09,
"loss": 0.0603,
"step": 4630
},
{
"epoch": 0.992088945905495,
"grad_norm": 0.09502363204956055,
"learning_rate": 1.80479630008501e-09,
"loss": 0.062,
"step": 4640
},
{
"epoch": 0.9942270686337396,
"grad_norm": 0.0853857547044754,
"learning_rate": 9.799163500834319e-10,
"loss": 0.0616,
"step": 4650
},
{
"epoch": 0.9963651913619842,
"grad_norm": 0.0833888053894043,
"learning_rate": 4.049731919303357e-10,
"loss": 0.0587,
"step": 4660
},
{
"epoch": 0.9985033140902287,
"grad_norm": 0.089773990213871,
"learning_rate": 7.999557114835022e-11,
"loss": 0.0608,
"step": 4670
},
{
"epoch": 1.0,
"step": 4677,
"total_flos": 9.549834298038052e+19,
"train_loss": 0.10822304976860733,
"train_runtime": 69525.0456,
"train_samples_per_second": 1.076,
"train_steps_per_second": 0.067
}
],
"logging_steps": 10,
"max_steps": 4677,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.549834298038052e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}