Files
Qwen3-8B-fim-v2v3pt/trainer_state.json
ModelHub XC e14892afa8 初始化项目,由ModelHub XC社区提供模型
Model: lllqaq/Qwen3-8B-fim-v2v3pt
Source: Original Platform
2026-04-11 23:26:02 +08:00

3117 lines
76 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 4394,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0022762839664248113,
"grad_norm": 10.595879842373915,
"learning_rate": 2.0454545454545456e-07,
"loss": 0.9592,
"step": 10
},
{
"epoch": 0.004552567932849623,
"grad_norm": 8.506899480641875,
"learning_rate": 4.3181818181818187e-07,
"loss": 0.9341,
"step": 20
},
{
"epoch": 0.006828851899274435,
"grad_norm": 4.048540973587426,
"learning_rate": 6.590909090909091e-07,
"loss": 0.8698,
"step": 30
},
{
"epoch": 0.009105135865699245,
"grad_norm": 1.7983922183043597,
"learning_rate": 8.863636363636364e-07,
"loss": 0.7762,
"step": 40
},
{
"epoch": 0.011381419832124057,
"grad_norm": 1.301643851792194,
"learning_rate": 1.1136363636363637e-06,
"loss": 0.7056,
"step": 50
},
{
"epoch": 0.01365770379854887,
"grad_norm": 0.8116123655731647,
"learning_rate": 1.3409090909090911e-06,
"loss": 0.636,
"step": 60
},
{
"epoch": 0.015933987764973682,
"grad_norm": 0.7273739176052791,
"learning_rate": 1.5681818181818184e-06,
"loss": 0.6355,
"step": 70
},
{
"epoch": 0.01821027173139849,
"grad_norm": 0.7380885266969118,
"learning_rate": 1.7954545454545456e-06,
"loss": 0.6222,
"step": 80
},
{
"epoch": 0.020486555697823303,
"grad_norm": 0.6975134603454084,
"learning_rate": 2.022727272727273e-06,
"loss": 0.5979,
"step": 90
},
{
"epoch": 0.022762839664248115,
"grad_norm": 0.7291820146919704,
"learning_rate": 2.25e-06,
"loss": 0.5945,
"step": 100
},
{
"epoch": 0.025039123630672927,
"grad_norm": 0.7008207721708937,
"learning_rate": 2.4772727272727275e-06,
"loss": 0.5705,
"step": 110
},
{
"epoch": 0.02731540759709774,
"grad_norm": 0.6449537277719501,
"learning_rate": 2.7045454545454545e-06,
"loss": 0.5718,
"step": 120
},
{
"epoch": 0.029591691563522548,
"grad_norm": 0.6058495976490267,
"learning_rate": 2.931818181818182e-06,
"loss": 0.5594,
"step": 130
},
{
"epoch": 0.031867975529947364,
"grad_norm": 0.681338176363733,
"learning_rate": 3.1590909090909094e-06,
"loss": 0.564,
"step": 140
},
{
"epoch": 0.034144259496372176,
"grad_norm": 0.7273342639401195,
"learning_rate": 3.3863636363636364e-06,
"loss": 0.5657,
"step": 150
},
{
"epoch": 0.03642054346279698,
"grad_norm": 0.6489466886504726,
"learning_rate": 3.6136363636363643e-06,
"loss": 0.5484,
"step": 160
},
{
"epoch": 0.03869682742922179,
"grad_norm": 0.6973554298115421,
"learning_rate": 3.840909090909091e-06,
"loss": 0.5445,
"step": 170
},
{
"epoch": 0.040973111395646605,
"grad_norm": 0.6394889437481598,
"learning_rate": 4.068181818181818e-06,
"loss": 0.548,
"step": 180
},
{
"epoch": 0.04324939536207142,
"grad_norm": 0.6430348845599891,
"learning_rate": 4.295454545454546e-06,
"loss": 0.5481,
"step": 190
},
{
"epoch": 0.04552567932849623,
"grad_norm": 0.7221066666159676,
"learning_rate": 4.522727272727273e-06,
"loss": 0.5492,
"step": 200
},
{
"epoch": 0.04780196329492104,
"grad_norm": 0.6814488809728508,
"learning_rate": 4.75e-06,
"loss": 0.5429,
"step": 210
},
{
"epoch": 0.050078247261345854,
"grad_norm": 0.6336335480126656,
"learning_rate": 4.977272727272728e-06,
"loss": 0.5323,
"step": 220
},
{
"epoch": 0.052354531227770666,
"grad_norm": 0.7343888412812761,
"learning_rate": 5.204545454545455e-06,
"loss": 0.5483,
"step": 230
},
{
"epoch": 0.05463081519419548,
"grad_norm": 0.8008844122645815,
"learning_rate": 5.431818181818182e-06,
"loss": 0.5286,
"step": 240
},
{
"epoch": 0.05690709916062029,
"grad_norm": 0.7508926850083371,
"learning_rate": 5.65909090909091e-06,
"loss": 0.5387,
"step": 250
},
{
"epoch": 0.059183383127045096,
"grad_norm": 0.6226438437116334,
"learning_rate": 5.886363636363637e-06,
"loss": 0.5311,
"step": 260
},
{
"epoch": 0.06145966709346991,
"grad_norm": 0.6829241845904538,
"learning_rate": 6.113636363636364e-06,
"loss": 0.5325,
"step": 270
},
{
"epoch": 0.06373595105989473,
"grad_norm": 0.7361115947795334,
"learning_rate": 6.340909090909091e-06,
"loss": 0.5344,
"step": 280
},
{
"epoch": 0.06601223502631953,
"grad_norm": 0.7763867544917724,
"learning_rate": 6.568181818181819e-06,
"loss": 0.5255,
"step": 290
},
{
"epoch": 0.06828851899274435,
"grad_norm": 0.7805934170558461,
"learning_rate": 6.795454545454546e-06,
"loss": 0.5237,
"step": 300
},
{
"epoch": 0.07056480295916916,
"grad_norm": 0.8252464031314231,
"learning_rate": 7.022727272727273e-06,
"loss": 0.5242,
"step": 310
},
{
"epoch": 0.07284108692559396,
"grad_norm": 0.7334894747403866,
"learning_rate": 7.25e-06,
"loss": 0.521,
"step": 320
},
{
"epoch": 0.07511737089201878,
"grad_norm": 0.6846204422508828,
"learning_rate": 7.477272727272727e-06,
"loss": 0.5229,
"step": 330
},
{
"epoch": 0.07739365485844359,
"grad_norm": 0.8052410281091429,
"learning_rate": 7.704545454545456e-06,
"loss": 0.5262,
"step": 340
},
{
"epoch": 0.0796699388248684,
"grad_norm": 0.6991957963444353,
"learning_rate": 7.931818181818182e-06,
"loss": 0.5335,
"step": 350
},
{
"epoch": 0.08194622279129321,
"grad_norm": 0.7125373155196583,
"learning_rate": 8.15909090909091e-06,
"loss": 0.5125,
"step": 360
},
{
"epoch": 0.08422250675771803,
"grad_norm": 0.7997954899533661,
"learning_rate": 8.386363636363638e-06,
"loss": 0.5121,
"step": 370
},
{
"epoch": 0.08649879072414283,
"grad_norm": 0.7683875875855587,
"learning_rate": 8.613636363636364e-06,
"loss": 0.5121,
"step": 380
},
{
"epoch": 0.08877507469056765,
"grad_norm": 0.67604209965691,
"learning_rate": 8.840909090909092e-06,
"loss": 0.5124,
"step": 390
},
{
"epoch": 0.09105135865699246,
"grad_norm": 0.8371326669517628,
"learning_rate": 9.06818181818182e-06,
"loss": 0.5253,
"step": 400
},
{
"epoch": 0.09332764262341726,
"grad_norm": 0.8604199705110958,
"learning_rate": 9.295454545454546e-06,
"loss": 0.5057,
"step": 410
},
{
"epoch": 0.09560392658984208,
"grad_norm": 0.7001685508024218,
"learning_rate": 9.522727272727274e-06,
"loss": 0.512,
"step": 420
},
{
"epoch": 0.09788021055626689,
"grad_norm": 0.746358066866932,
"learning_rate": 9.75e-06,
"loss": 0.5152,
"step": 430
},
{
"epoch": 0.10015649452269171,
"grad_norm": 0.7485703568589076,
"learning_rate": 9.977272727272728e-06,
"loss": 0.51,
"step": 440
},
{
"epoch": 0.10243277848911651,
"grad_norm": 0.9902309913638483,
"learning_rate": 9.999872165053986e-06,
"loss": 0.5156,
"step": 450
},
{
"epoch": 0.10470906245554133,
"grad_norm": 0.9153541257496008,
"learning_rate": 9.999430274867309e-06,
"loss": 0.5059,
"step": 460
},
{
"epoch": 0.10698534642196614,
"grad_norm": 0.6739494150459265,
"learning_rate": 9.998672779119897e-06,
"loss": 0.4995,
"step": 470
},
{
"epoch": 0.10926163038839096,
"grad_norm": 0.7147898292959126,
"learning_rate": 9.997599725631174e-06,
"loss": 0.5021,
"step": 480
},
{
"epoch": 0.11153791435481576,
"grad_norm": 0.7526289283194069,
"learning_rate": 9.996211182141184e-06,
"loss": 0.5113,
"step": 490
},
{
"epoch": 0.11381419832124058,
"grad_norm": 0.8898690822057496,
"learning_rate": 9.994507236306327e-06,
"loss": 0.5081,
"step": 500
},
{
"epoch": 0.11609048228766539,
"grad_norm": 0.8227051185656225,
"learning_rate": 9.99248799569382e-06,
"loss": 0.5034,
"step": 510
},
{
"epoch": 0.11836676625409019,
"grad_norm": 0.6375442627559045,
"learning_rate": 9.990153587774895e-06,
"loss": 0.5021,
"step": 520
},
{
"epoch": 0.12064305022051501,
"grad_norm": 0.6931479406599156,
"learning_rate": 9.98750415991677e-06,
"loss": 0.5044,
"step": 530
},
{
"epoch": 0.12291933418693982,
"grad_norm": 0.8097527394823517,
"learning_rate": 9.984539879373335e-06,
"loss": 0.5088,
"step": 540
},
{
"epoch": 0.12519561815336464,
"grad_norm": 0.8559434520312957,
"learning_rate": 9.981260933274597e-06,
"loss": 0.5111,
"step": 550
},
{
"epoch": 0.12747190211978945,
"grad_norm": 0.6767290786737241,
"learning_rate": 9.977667528614869e-06,
"loss": 0.4952,
"step": 560
},
{
"epoch": 0.12974818608621425,
"grad_norm": 0.6584609354184026,
"learning_rate": 9.973759892239696e-06,
"loss": 0.4916,
"step": 570
},
{
"epoch": 0.13202447005263906,
"grad_norm": 1.0067522800390891,
"learning_rate": 9.969538270831538e-06,
"loss": 0.4938,
"step": 580
},
{
"epoch": 0.13430075401906388,
"grad_norm": 0.7842725791926335,
"learning_rate": 9.9650029308942e-06,
"loss": 0.5031,
"step": 590
},
{
"epoch": 0.1365770379854887,
"grad_norm": 0.7213504076724643,
"learning_rate": 9.960154158736011e-06,
"loss": 0.4987,
"step": 600
},
{
"epoch": 0.1388533219519135,
"grad_norm": 0.7639458215235444,
"learning_rate": 9.954992260451737e-06,
"loss": 0.4916,
"step": 610
},
{
"epoch": 0.1411296059183383,
"grad_norm": 0.6736103272836254,
"learning_rate": 9.949517561903268e-06,
"loss": 0.4953,
"step": 620
},
{
"epoch": 0.14340588988476313,
"grad_norm": 0.7206570170845565,
"learning_rate": 9.943730408699047e-06,
"loss": 0.4949,
"step": 630
},
{
"epoch": 0.14568217385118792,
"grad_norm": 0.7325803766368787,
"learning_rate": 9.937631166172248e-06,
"loss": 0.5015,
"step": 640
},
{
"epoch": 0.14795845781761274,
"grad_norm": 0.7277234400694914,
"learning_rate": 9.931220219357714e-06,
"loss": 0.5065,
"step": 650
},
{
"epoch": 0.15023474178403756,
"grad_norm": 6.746710290505448,
"learning_rate": 9.924497972967652e-06,
"loss": 0.4918,
"step": 660
},
{
"epoch": 0.15251102575046238,
"grad_norm": 0.6381821044001871,
"learning_rate": 9.91746485136609e-06,
"loss": 0.4842,
"step": 670
},
{
"epoch": 0.15478730971688717,
"grad_norm": 0.6894734847882242,
"learning_rate": 9.91012129854207e-06,
"loss": 0.4874,
"step": 680
},
{
"epoch": 0.157063593683312,
"grad_norm": 0.7239229961207453,
"learning_rate": 9.90246777808164e-06,
"loss": 0.4817,
"step": 690
},
{
"epoch": 0.1593398776497368,
"grad_norm": 0.6946825414002048,
"learning_rate": 9.894504773138573e-06,
"loss": 0.5014,
"step": 700
},
{
"epoch": 0.16161616161616163,
"grad_norm": 0.7364735544657125,
"learning_rate": 9.88623278640388e-06,
"loss": 0.4861,
"step": 710
},
{
"epoch": 0.16389244558258642,
"grad_norm": 0.6496771859407187,
"learning_rate": 9.877652340074063e-06,
"loss": 0.4892,
"step": 720
},
{
"epoch": 0.16616872954901124,
"grad_norm": 0.6578218852464074,
"learning_rate": 9.868763975818156e-06,
"loss": 0.4866,
"step": 730
},
{
"epoch": 0.16844501351543606,
"grad_norm": 0.8508879887579466,
"learning_rate": 9.859568254743535e-06,
"loss": 0.4986,
"step": 740
},
{
"epoch": 0.17072129748186085,
"grad_norm": 0.783429883650226,
"learning_rate": 9.850065757360485e-06,
"loss": 0.4988,
"step": 750
},
{
"epoch": 0.17299758144828567,
"grad_norm": 0.8186071721692408,
"learning_rate": 9.840257083545562e-06,
"loss": 0.4818,
"step": 760
},
{
"epoch": 0.1752738654147105,
"grad_norm": 0.710423604497616,
"learning_rate": 9.83014285250372e-06,
"loss": 0.4888,
"step": 770
},
{
"epoch": 0.1775501493811353,
"grad_norm": 0.6800493756535392,
"learning_rate": 9.81972370272923e-06,
"loss": 0.4863,
"step": 780
},
{
"epoch": 0.1798264333475601,
"grad_norm": 0.7168752838596176,
"learning_rate": 9.809000291965354e-06,
"loss": 0.4911,
"step": 790
},
{
"epoch": 0.18210271731398492,
"grad_norm": 0.7743139671538918,
"learning_rate": 9.797973297162842e-06,
"loss": 0.4967,
"step": 800
},
{
"epoch": 0.18437900128040974,
"grad_norm": 0.6820565783740362,
"learning_rate": 9.78664341443719e-06,
"loss": 0.4766,
"step": 810
},
{
"epoch": 0.18665528524683453,
"grad_norm": 0.745509395049587,
"learning_rate": 9.775011359024692e-06,
"loss": 0.4831,
"step": 820
},
{
"epoch": 0.18893156921325935,
"grad_norm": 0.6587611779174343,
"learning_rate": 9.763077865237293e-06,
"loss": 0.486,
"step": 830
},
{
"epoch": 0.19120785317968417,
"grad_norm": 0.7570644152194894,
"learning_rate": 9.750843686416233e-06,
"loss": 0.4876,
"step": 840
},
{
"epoch": 0.193484137146109,
"grad_norm": 0.7438169073992149,
"learning_rate": 9.738309594884489e-06,
"loss": 0.498,
"step": 850
},
{
"epoch": 0.19576042111253378,
"grad_norm": 0.8075454984835392,
"learning_rate": 9.725476381898018e-06,
"loss": 0.4761,
"step": 860
},
{
"epoch": 0.1980367050789586,
"grad_norm": 0.7687120756908921,
"learning_rate": 9.712344857595804e-06,
"loss": 0.4735,
"step": 870
},
{
"epoch": 0.20031298904538342,
"grad_norm": 0.6692318540483727,
"learning_rate": 9.698915850948725e-06,
"loss": 0.4796,
"step": 880
},
{
"epoch": 0.20258927301180824,
"grad_norm": 0.7357514918311532,
"learning_rate": 9.685190209707214e-06,
"loss": 0.4881,
"step": 890
},
{
"epoch": 0.20486555697823303,
"grad_norm": 0.7672275789241998,
"learning_rate": 9.67116880034774e-06,
"loss": 0.4857,
"step": 900
},
{
"epoch": 0.20714184094465785,
"grad_norm": 0.7135300642695123,
"learning_rate": 9.656852508018111e-06,
"loss": 0.4791,
"step": 910
},
{
"epoch": 0.20941812491108266,
"grad_norm": 0.8360299716625731,
"learning_rate": 9.642242236481604e-06,
"loss": 0.4849,
"step": 920
},
{
"epoch": 0.21169440887750746,
"grad_norm": 0.8029294625940537,
"learning_rate": 9.6273389080599e-06,
"loss": 0.4797,
"step": 930
},
{
"epoch": 0.21397069284393228,
"grad_norm": 0.7293278863253679,
"learning_rate": 9.612143463574866e-06,
"loss": 0.4822,
"step": 940
},
{
"epoch": 0.2162469768103571,
"grad_norm": 0.7388020334147711,
"learning_rate": 9.596656862289158e-06,
"loss": 0.4918,
"step": 950
},
{
"epoch": 0.2185232607767819,
"grad_norm": 0.6641555373860104,
"learning_rate": 9.580880081845674e-06,
"loss": 0.4776,
"step": 960
},
{
"epoch": 0.2207995447432067,
"grad_norm": 0.8539043098487583,
"learning_rate": 9.564814118205825e-06,
"loss": 0.48,
"step": 970
},
{
"epoch": 0.22307582870963152,
"grad_norm": 0.7736828871091971,
"learning_rate": 9.548459985586668e-06,
"loss": 0.4819,
"step": 980
},
{
"epoch": 0.22535211267605634,
"grad_norm": 0.7761396197941033,
"learning_rate": 9.531818716396879e-06,
"loss": 0.4874,
"step": 990
},
{
"epoch": 0.22762839664248116,
"grad_norm": 0.8766761552909957,
"learning_rate": 9.514891361171584e-06,
"loss": 0.477,
"step": 1000
},
{
"epoch": 0.22990468060890595,
"grad_norm": 0.850710977709625,
"learning_rate": 9.497678988506027e-06,
"loss": 0.4809,
"step": 1010
},
{
"epoch": 0.23218096457533077,
"grad_norm": 0.7569136767466481,
"learning_rate": 9.480182684988128e-06,
"loss": 0.4798,
"step": 1020
},
{
"epoch": 0.2344572485417556,
"grad_norm": 1.0189056868147057,
"learning_rate": 9.462403555129875e-06,
"loss": 0.4799,
"step": 1030
},
{
"epoch": 0.23673353250818038,
"grad_norm": 0.7004478108183485,
"learning_rate": 9.444342721297607e-06,
"loss": 0.4786,
"step": 1040
},
{
"epoch": 0.2390098164746052,
"grad_norm": 0.8359707826058439,
"learning_rate": 9.426001323641156e-06,
"loss": 0.4715,
"step": 1050
},
{
"epoch": 0.24128610044103002,
"grad_norm": 0.6888590261970808,
"learning_rate": 9.40738052002187e-06,
"loss": 0.4682,
"step": 1060
},
{
"epoch": 0.24356238440745484,
"grad_norm": 0.7568931250342633,
"learning_rate": 9.388481485939532e-06,
"loss": 0.4746,
"step": 1070
},
{
"epoch": 0.24583866837387963,
"grad_norm": 0.7349584513921489,
"learning_rate": 9.369305414458128e-06,
"loss": 0.4763,
"step": 1080
},
{
"epoch": 0.24811495234030445,
"grad_norm": 0.8343501997237606,
"learning_rate": 9.349853516130556e-06,
"loss": 0.484,
"step": 1090
},
{
"epoch": 0.25039123630672927,
"grad_norm": 0.7193295893335135,
"learning_rate": 9.330127018922195e-06,
"loss": 0.4834,
"step": 1100
},
{
"epoch": 0.2526675202731541,
"grad_norm": 0.7580309140105336,
"learning_rate": 9.310127168133378e-06,
"loss": 0.4812,
"step": 1110
},
{
"epoch": 0.2549438042395789,
"grad_norm": 0.7613918214537019,
"learning_rate": 9.289855226320796e-06,
"loss": 0.4727,
"step": 1120
},
{
"epoch": 0.25722008820600367,
"grad_norm": 0.725239008063755,
"learning_rate": 9.269312473217777e-06,
"loss": 0.4803,
"step": 1130
},
{
"epoch": 0.2594963721724285,
"grad_norm": 0.8482520214554938,
"learning_rate": 9.248500205653518e-06,
"loss": 0.4745,
"step": 1140
},
{
"epoch": 0.2617726561388533,
"grad_norm": 0.7908430286997805,
"learning_rate": 9.22741973747119e-06,
"loss": 0.471,
"step": 1150
},
{
"epoch": 0.26404894010527813,
"grad_norm": 0.7036666630426738,
"learning_rate": 9.20607239944503e-06,
"loss": 0.4676,
"step": 1160
},
{
"epoch": 0.26632522407170295,
"grad_norm": 0.7263755674894325,
"learning_rate": 9.18445953919631e-06,
"loss": 0.4738,
"step": 1170
},
{
"epoch": 0.26860150803812777,
"grad_norm": 0.7487742930473097,
"learning_rate": 9.16258252110827e-06,
"loss": 0.4749,
"step": 1180
},
{
"epoch": 0.2708777920045526,
"grad_norm": 0.7931436305160696,
"learning_rate": 9.140442726239986e-06,
"loss": 0.4739,
"step": 1190
},
{
"epoch": 0.2731540759709774,
"grad_norm": 0.7971178665228899,
"learning_rate": 9.118041552239187e-06,
"loss": 0.4715,
"step": 1200
},
{
"epoch": 0.27543035993740217,
"grad_norm": 0.7352453908333937,
"learning_rate": 9.095380413254029e-06,
"loss": 0.4735,
"step": 1210
},
{
"epoch": 0.277706643903827,
"grad_norm": 0.6686848966506087,
"learning_rate": 9.072460739843807e-06,
"loss": 0.4701,
"step": 1220
},
{
"epoch": 0.2799829278702518,
"grad_norm": 0.7632913563075477,
"learning_rate": 9.049283978888665e-06,
"loss": 0.4709,
"step": 1230
},
{
"epoch": 0.2822592118366766,
"grad_norm": 0.9126191629516692,
"learning_rate": 9.025851593498245e-06,
"loss": 0.4812,
"step": 1240
},
{
"epoch": 0.28453549580310145,
"grad_norm": 0.7586654591086177,
"learning_rate": 9.002165062919321e-06,
"loss": 0.4759,
"step": 1250
},
{
"epoch": 0.28681177976952626,
"grad_norm": 0.6020016779620824,
"learning_rate": 8.978225882442431e-06,
"loss": 0.4585,
"step": 1260
},
{
"epoch": 0.2890880637359511,
"grad_norm": 0.6918094336131744,
"learning_rate": 8.95403556330747e-06,
"loss": 0.4734,
"step": 1270
},
{
"epoch": 0.29136434770237585,
"grad_norm": 0.8432310682535402,
"learning_rate": 8.929595632608286e-06,
"loss": 0.4657,
"step": 1280
},
{
"epoch": 0.29364063166880067,
"grad_norm": 0.7968867596780567,
"learning_rate": 8.904907633196287e-06,
"loss": 0.4689,
"step": 1290
},
{
"epoch": 0.2959169156352255,
"grad_norm": 0.8975405798229933,
"learning_rate": 8.879973123583041e-06,
"loss": 0.4742,
"step": 1300
},
{
"epoch": 0.2981931996016503,
"grad_norm": 0.8375787090323347,
"learning_rate": 8.854793677841878e-06,
"loss": 0.4679,
"step": 1310
},
{
"epoch": 0.3004694835680751,
"grad_norm": 0.7641222076424216,
"learning_rate": 8.829370885508538e-06,
"loss": 0.4668,
"step": 1320
},
{
"epoch": 0.30274576753449994,
"grad_norm": 0.7968284482238742,
"learning_rate": 8.803706351480819e-06,
"loss": 0.4621,
"step": 1330
},
{
"epoch": 0.30502205150092476,
"grad_norm": 0.7504856443013833,
"learning_rate": 8.777801695917257e-06,
"loss": 0.4638,
"step": 1340
},
{
"epoch": 0.3072983354673495,
"grad_norm": 0.7386127839372575,
"learning_rate": 8.751658554134861e-06,
"loss": 0.472,
"step": 1350
},
{
"epoch": 0.30957461943377435,
"grad_norm": 0.7519795926557973,
"learning_rate": 8.725278576505865e-06,
"loss": 0.463,
"step": 1360
},
{
"epoch": 0.31185090340019916,
"grad_norm": 0.6860694902316768,
"learning_rate": 8.698663428353551e-06,
"loss": 0.469,
"step": 1370
},
{
"epoch": 0.314127187366624,
"grad_norm": 0.84686290617533,
"learning_rate": 8.671814789847116e-06,
"loss": 0.4727,
"step": 1380
},
{
"epoch": 0.3164034713330488,
"grad_norm": 0.7119241552397302,
"learning_rate": 8.64473435589561e-06,
"loss": 0.4706,
"step": 1390
},
{
"epoch": 0.3186797552994736,
"grad_norm": 0.7745989228725879,
"learning_rate": 8.617423836040937e-06,
"loss": 0.4679,
"step": 1400
},
{
"epoch": 0.32095603926589844,
"grad_norm": 0.7227304748917153,
"learning_rate": 8.589884954349928e-06,
"loss": 0.4738,
"step": 1410
},
{
"epoch": 0.32323232323232326,
"grad_norm": 1.228623655278939,
"learning_rate": 8.562119449305517e-06,
"loss": 0.4648,
"step": 1420
},
{
"epoch": 0.325508607198748,
"grad_norm": 0.6887447415599827,
"learning_rate": 8.534129073696984e-06,
"loss": 0.4707,
"step": 1430
},
{
"epoch": 0.32778489116517284,
"grad_norm": 0.7108435007854059,
"learning_rate": 8.505915594509304e-06,
"loss": 0.4633,
"step": 1440
},
{
"epoch": 0.33006117513159766,
"grad_norm": 0.7768742017040517,
"learning_rate": 8.477480792811607e-06,
"loss": 0.466,
"step": 1450
},
{
"epoch": 0.3323374590980225,
"grad_norm": 0.9552934171248785,
"learning_rate": 8.448826463644733e-06,
"loss": 0.4615,
"step": 1460
},
{
"epoch": 0.3346137430644473,
"grad_norm": 0.7969014798994358,
"learning_rate": 8.419954415907925e-06,
"loss": 0.4685,
"step": 1470
},
{
"epoch": 0.3368900270308721,
"grad_norm": 0.8468048307712729,
"learning_rate": 8.390866472244624e-06,
"loss": 0.4599,
"step": 1480
},
{
"epoch": 0.33916631099729694,
"grad_norm": 0.8691044067047762,
"learning_rate": 8.36156446892742e-06,
"loss": 0.4722,
"step": 1490
},
{
"epoch": 0.3414425949637217,
"grad_norm": 0.9380457631209852,
"learning_rate": 8.332050255742126e-06,
"loss": 0.4741,
"step": 1500
},
{
"epoch": 0.3437188789301465,
"grad_norm": 0.8859250973757697,
"learning_rate": 8.302325695871e-06,
"loss": 0.4621,
"step": 1510
},
{
"epoch": 0.34599516289657134,
"grad_norm": 0.7488919862243867,
"learning_rate": 8.272392665775132e-06,
"loss": 0.4604,
"step": 1520
},
{
"epoch": 0.34827144686299616,
"grad_norm": 0.9715182317776861,
"learning_rate": 8.242253055075989e-06,
"loss": 0.463,
"step": 1530
},
{
"epoch": 0.350547730829421,
"grad_norm": 0.9442966262864598,
"learning_rate": 8.211908766436114e-06,
"loss": 0.4599,
"step": 1540
},
{
"epoch": 0.3528240147958458,
"grad_norm": 0.885636304605366,
"learning_rate": 8.181361715439023e-06,
"loss": 0.4753,
"step": 1550
},
{
"epoch": 0.3551002987622706,
"grad_norm": 0.8187381110333617,
"learning_rate": 8.15061383046828e-06,
"loss": 0.468,
"step": 1560
},
{
"epoch": 0.3573765827286954,
"grad_norm": 0.7537652164973274,
"learning_rate": 8.119667052585753e-06,
"loss": 0.4591,
"step": 1570
},
{
"epoch": 0.3596528666951202,
"grad_norm": 0.8597815188650263,
"learning_rate": 8.088523335409086e-06,
"loss": 0.4562,
"step": 1580
},
{
"epoch": 0.361929150661545,
"grad_norm": 0.745664972159197,
"learning_rate": 8.057184644988363e-06,
"loss": 0.4603,
"step": 1590
},
{
"epoch": 0.36420543462796984,
"grad_norm": 0.8238816020218861,
"learning_rate": 8.025652959682004e-06,
"loss": 0.4677,
"step": 1600
},
{
"epoch": 0.36648171859439466,
"grad_norm": 0.7416721413551182,
"learning_rate": 7.993930270031863e-06,
"loss": 0.4619,
"step": 1610
},
{
"epoch": 0.3687580025608195,
"grad_norm": 0.7143674127418036,
"learning_rate": 7.962018578637578e-06,
"loss": 0.4629,
"step": 1620
},
{
"epoch": 0.3710342865272443,
"grad_norm": 0.7454781957945812,
"learning_rate": 7.929919900030147e-06,
"loss": 0.4645,
"step": 1630
},
{
"epoch": 0.37331057049366906,
"grad_norm": 0.763351971097948,
"learning_rate": 7.897636260544752e-06,
"loss": 0.4619,
"step": 1640
},
{
"epoch": 0.3755868544600939,
"grad_norm": 0.7405410645904156,
"learning_rate": 7.865169698192842e-06,
"loss": 0.4628,
"step": 1650
},
{
"epoch": 0.3778631384265187,
"grad_norm": 0.7347060655230769,
"learning_rate": 7.832522262533481e-06,
"loss": 0.4649,
"step": 1660
},
{
"epoch": 0.3801394223929435,
"grad_norm": 0.7023247967552008,
"learning_rate": 7.799696014543949e-06,
"loss": 0.4593,
"step": 1670
},
{
"epoch": 0.38241570635936833,
"grad_norm": 0.9339187019341834,
"learning_rate": 7.766693026489655e-06,
"loss": 0.4541,
"step": 1680
},
{
"epoch": 0.38469199032579315,
"grad_norm": 0.7809209531769312,
"learning_rate": 7.733515381793305e-06,
"loss": 0.4653,
"step": 1690
},
{
"epoch": 0.386968274292218,
"grad_norm": 0.8161647769578234,
"learning_rate": 7.70016517490338e-06,
"loss": 0.4643,
"step": 1700
},
{
"epoch": 0.3892445582586428,
"grad_norm": 0.8219287065471175,
"learning_rate": 7.666644511161925e-06,
"loss": 0.4573,
"step": 1710
},
{
"epoch": 0.39152084222506756,
"grad_norm": 0.730307940342026,
"learning_rate": 7.632955506671633e-06,
"loss": 0.4587,
"step": 1720
},
{
"epoch": 0.3937971261914924,
"grad_norm": 0.7616849447054718,
"learning_rate": 7.599100288162267e-06,
"loss": 0.462,
"step": 1730
},
{
"epoch": 0.3960734101579172,
"grad_norm": 0.8087396986104316,
"learning_rate": 7.565080992856393e-06,
"loss": 0.4626,
"step": 1740
},
{
"epoch": 0.398349694124342,
"grad_norm": 0.725916588847792,
"learning_rate": 7.530899768334476e-06,
"loss": 0.4679,
"step": 1750
},
{
"epoch": 0.40062597809076683,
"grad_norm": 0.8356414931081636,
"learning_rate": 7.496558772399289e-06,
"loss": 0.4562,
"step": 1760
},
{
"epoch": 0.40290226205719165,
"grad_norm": 0.8552053129727125,
"learning_rate": 7.462060172939711e-06,
"loss": 0.4593,
"step": 1770
},
{
"epoch": 0.40517854602361647,
"grad_norm": 0.7085366390905654,
"learning_rate": 7.427406147793861e-06,
"loss": 0.4641,
"step": 1780
},
{
"epoch": 0.40745482999004123,
"grad_norm": 0.858819978106255,
"learning_rate": 7.392598884611617e-06,
"loss": 0.4595,
"step": 1790
},
{
"epoch": 0.40973111395646605,
"grad_norm": 0.8915575128268939,
"learning_rate": 7.357640580716516e-06,
"loss": 0.4609,
"step": 1800
},
{
"epoch": 0.4120073979228909,
"grad_norm": 0.8410874583168452,
"learning_rate": 7.32253344296704e-06,
"loss": 0.4519,
"step": 1810
},
{
"epoch": 0.4142836818893157,
"grad_norm": 0.7553762047323214,
"learning_rate": 7.2872796876173e-06,
"loss": 0.4509,
"step": 1820
},
{
"epoch": 0.4165599658557405,
"grad_norm": 0.7579109862136422,
"learning_rate": 7.251881540177125e-06,
"loss": 0.4639,
"step": 1830
},
{
"epoch": 0.41883624982216533,
"grad_norm": 0.8596157431482481,
"learning_rate": 7.2163412352715745e-06,
"loss": 0.4665,
"step": 1840
},
{
"epoch": 0.42111253378859015,
"grad_norm": 0.8318429397770328,
"learning_rate": 7.180661016499868e-06,
"loss": 0.46,
"step": 1850
},
{
"epoch": 0.4233888177550149,
"grad_norm": 0.6921114545354697,
"learning_rate": 7.144843136293746e-06,
"loss": 0.4578,
"step": 1860
},
{
"epoch": 0.42566510172143973,
"grad_norm": 0.6714500243037865,
"learning_rate": 7.108889855775289e-06,
"loss": 0.4507,
"step": 1870
},
{
"epoch": 0.42794138568786455,
"grad_norm": 0.7710332319490364,
"learning_rate": 7.0728034446141654e-06,
"loss": 0.4571,
"step": 1880
},
{
"epoch": 0.43021766965428937,
"grad_norm": 0.8446602836780411,
"learning_rate": 7.036586180884357e-06,
"loss": 0.4707,
"step": 1890
},
{
"epoch": 0.4324939536207142,
"grad_norm": 0.7674212691654148,
"learning_rate": 7.000240350920344e-06,
"loss": 0.462,
"step": 1900
},
{
"epoch": 0.434770237587139,
"grad_norm": 0.8556511437021528,
"learning_rate": 6.96376824917278e-06,
"loss": 0.4533,
"step": 1910
},
{
"epoch": 0.4370465215535638,
"grad_norm": 0.8078502008317322,
"learning_rate": 6.927172178063636e-06,
"loss": 0.4608,
"step": 1920
},
{
"epoch": 0.43932280551998865,
"grad_norm": 0.7150517188869988,
"learning_rate": 6.890454447840862e-06,
"loss": 0.4535,
"step": 1930
},
{
"epoch": 0.4415990894864134,
"grad_norm": 0.7523733013230715,
"learning_rate": 6.853617376432542e-06,
"loss": 0.4624,
"step": 1940
},
{
"epoch": 0.44387537345283823,
"grad_norm": 0.9728144367431893,
"learning_rate": 6.816663289300567e-06,
"loss": 0.4597,
"step": 1950
},
{
"epoch": 0.44615165741926305,
"grad_norm": 0.7177459968030785,
"learning_rate": 6.779594519293833e-06,
"loss": 0.4607,
"step": 1960
},
{
"epoch": 0.44842794138568787,
"grad_norm": 0.8088528853225262,
"learning_rate": 6.742413406500967e-06,
"loss": 0.4629,
"step": 1970
},
{
"epoch": 0.4507042253521127,
"grad_norm": 0.9175593683030001,
"learning_rate": 6.705122298102611e-06,
"loss": 0.4636,
"step": 1980
},
{
"epoch": 0.4529805093185375,
"grad_norm": 0.7621550740057926,
"learning_rate": 6.667723548223241e-06,
"loss": 0.4704,
"step": 1990
},
{
"epoch": 0.4552567932849623,
"grad_norm": 0.7894831335092004,
"learning_rate": 6.630219517782557e-06,
"loss": 0.4665,
"step": 2000
},
{
"epoch": 0.4575330772513871,
"grad_norm": 0.7289053283342039,
"learning_rate": 6.592612574346442e-06,
"loss": 0.4496,
"step": 2010
},
{
"epoch": 0.4598093612178119,
"grad_norm": 0.7268790659257058,
"learning_rate": 6.554905091977506e-06,
"loss": 0.4529,
"step": 2020
},
{
"epoch": 0.4620856451842367,
"grad_norm": 0.7436765967755353,
"learning_rate": 6.5170994510852035e-06,
"loss": 0.4548,
"step": 2030
},
{
"epoch": 0.46436192915066155,
"grad_norm": 0.7695864257948672,
"learning_rate": 6.479198038275578e-06,
"loss": 0.4539,
"step": 2040
},
{
"epoch": 0.46663821311708636,
"grad_norm": 0.8295487862322445,
"learning_rate": 6.441203246200587e-06,
"loss": 0.4634,
"step": 2050
},
{
"epoch": 0.4689144970835112,
"grad_norm": 0.8241912559218441,
"learning_rate": 6.403117473407065e-06,
"loss": 0.4496,
"step": 2060
},
{
"epoch": 0.471190781049936,
"grad_norm": 0.7076343515274734,
"learning_rate": 6.364943124185308e-06,
"loss": 0.4497,
"step": 2070
},
{
"epoch": 0.47346706501636077,
"grad_norm": 0.8310935730693106,
"learning_rate": 6.3266826084172835e-06,
"loss": 0.4648,
"step": 2080
},
{
"epoch": 0.4757433489827856,
"grad_norm": 0.7704280812181517,
"learning_rate": 6.288338341424515e-06,
"loss": 0.455,
"step": 2090
},
{
"epoch": 0.4780196329492104,
"grad_norm": 0.8181658490570038,
"learning_rate": 6.249912743815595e-06,
"loss": 0.4596,
"step": 2100
},
{
"epoch": 0.4802959169156352,
"grad_norm": 0.7431508637124334,
"learning_rate": 6.211408241333379e-06,
"loss": 0.4538,
"step": 2110
},
{
"epoch": 0.48257220088206004,
"grad_norm": 0.709619776250267,
"learning_rate": 6.172827264701857e-06,
"loss": 0.4537,
"step": 2120
},
{
"epoch": 0.48484848484848486,
"grad_norm": 0.8223387777060188,
"learning_rate": 6.134172249472702e-06,
"loss": 0.4551,
"step": 2130
},
{
"epoch": 0.4871247688149097,
"grad_norm": 0.7768854003850929,
"learning_rate": 6.095445635871516e-06,
"loss": 0.4657,
"step": 2140
},
{
"epoch": 0.48940105278133444,
"grad_norm": 0.769025632798714,
"learning_rate": 6.0566498686437855e-06,
"loss": 0.4557,
"step": 2150
},
{
"epoch": 0.49167733674775926,
"grad_norm": 0.6692137861485721,
"learning_rate": 6.0177873969005475e-06,
"loss": 0.4563,
"step": 2160
},
{
"epoch": 0.4939536207141841,
"grad_norm": 0.8907000112703206,
"learning_rate": 5.978860673963784e-06,
"loss": 0.4548,
"step": 2170
},
{
"epoch": 0.4962299046806089,
"grad_norm": 0.8129551036581305,
"learning_rate": 5.939872157211545e-06,
"loss": 0.4501,
"step": 2180
},
{
"epoch": 0.4985061886470337,
"grad_norm": 0.9050604723863666,
"learning_rate": 5.900824307922819e-06,
"loss": 0.4529,
"step": 2190
},
{
"epoch": 0.5007824726134585,
"grad_norm": 0.9464291168776866,
"learning_rate": 5.861719591122158e-06,
"loss": 0.4597,
"step": 2200
},
{
"epoch": 0.5030587565798833,
"grad_norm": 0.7629842734151758,
"learning_rate": 5.8225604754240635e-06,
"loss": 0.4547,
"step": 2210
},
{
"epoch": 0.5053350405463082,
"grad_norm": 0.737677886868225,
"learning_rate": 5.783349432877146e-06,
"loss": 0.4568,
"step": 2220
},
{
"epoch": 0.5076113245127329,
"grad_norm": 0.7440218389833005,
"learning_rate": 5.744088938808068e-06,
"loss": 0.4554,
"step": 2230
},
{
"epoch": 0.5098876084791578,
"grad_norm": 0.8000074715652351,
"learning_rate": 5.70478147166529e-06,
"loss": 0.4671,
"step": 2240
},
{
"epoch": 0.5121638924455826,
"grad_norm": 0.9238644399016241,
"learning_rate": 5.665429512862597e-06,
"loss": 0.4574,
"step": 2250
},
{
"epoch": 0.5144401764120073,
"grad_norm": 0.758841769369074,
"learning_rate": 5.626035546622457e-06,
"loss": 0.4558,
"step": 2260
},
{
"epoch": 0.5167164603784322,
"grad_norm": 0.7971224800656472,
"learning_rate": 5.586602059819199e-06,
"loss": 0.4496,
"step": 2270
},
{
"epoch": 0.518992744344857,
"grad_norm": 0.9171620412959115,
"learning_rate": 5.547131541822018e-06,
"loss": 0.4558,
"step": 2280
},
{
"epoch": 0.5212690283112819,
"grad_norm": 0.7842020256066858,
"learning_rate": 5.5076264843378225e-06,
"loss": 0.4527,
"step": 2290
},
{
"epoch": 0.5235453122777066,
"grad_norm": 0.9188201380044063,
"learning_rate": 5.4680893812539436e-06,
"loss": 0.4608,
"step": 2300
},
{
"epoch": 0.5258215962441315,
"grad_norm": 0.7861154037939578,
"learning_rate": 5.428522728480697e-06,
"loss": 0.4523,
"step": 2310
},
{
"epoch": 0.5280978802105563,
"grad_norm": 0.7920300857523709,
"learning_rate": 5.388929023793817e-06,
"loss": 0.4568,
"step": 2320
},
{
"epoch": 0.5303741641769811,
"grad_norm": 0.7612825596142501,
"learning_rate": 5.349310766676781e-06,
"loss": 0.4483,
"step": 2330
},
{
"epoch": 0.5326504481434059,
"grad_norm": 0.7537687250775554,
"learning_rate": 5.3096704581630195e-06,
"loss": 0.4563,
"step": 2340
},
{
"epoch": 0.5349267321098307,
"grad_norm": 0.751390092998076,
"learning_rate": 5.270010600678034e-06,
"loss": 0.4578,
"step": 2350
},
{
"epoch": 0.5372030160762555,
"grad_norm": 0.8063126059500658,
"learning_rate": 5.230333697881413e-06,
"loss": 0.4424,
"step": 2360
},
{
"epoch": 0.5394793000426803,
"grad_norm": 0.7268784420755078,
"learning_rate": 5.190642254508789e-06,
"loss": 0.4488,
"step": 2370
},
{
"epoch": 0.5417555840091052,
"grad_norm": 0.8219467384704279,
"learning_rate": 5.15093877621372e-06,
"loss": 0.4443,
"step": 2380
},
{
"epoch": 0.5440318679755299,
"grad_norm": 0.9341715266763854,
"learning_rate": 5.111225769409505e-06,
"loss": 0.4563,
"step": 2390
},
{
"epoch": 0.5463081519419548,
"grad_norm": 0.8890086015346076,
"learning_rate": 5.071505741110958e-06,
"loss": 0.4531,
"step": 2400
},
{
"epoch": 0.5485844359083796,
"grad_norm": 0.7859192247678671,
"learning_rate": 5.031781198776157e-06,
"loss": 0.4448,
"step": 2410
},
{
"epoch": 0.5508607198748043,
"grad_norm": 0.8457709944734434,
"learning_rate": 4.9920546501481355e-06,
"loss": 0.4502,
"step": 2420
},
{
"epoch": 0.5531370038412292,
"grad_norm": 0.777261473128808,
"learning_rate": 4.952328603096588e-06,
"loss": 0.4493,
"step": 2430
},
{
"epoch": 0.555413287807654,
"grad_norm": 0.7489538278905294,
"learning_rate": 4.912605565459537e-06,
"loss": 0.4532,
"step": 2440
},
{
"epoch": 0.5576895717740789,
"grad_norm": 0.7471858030987701,
"learning_rate": 4.872888044885031e-06,
"loss": 0.4662,
"step": 2450
},
{
"epoch": 0.5599658557405036,
"grad_norm": 0.76997073617317,
"learning_rate": 4.833178548672836e-06,
"loss": 0.449,
"step": 2460
},
{
"epoch": 0.5622421397069285,
"grad_norm": 0.8114769999661829,
"learning_rate": 4.793479583616152e-06,
"loss": 0.4511,
"step": 2470
},
{
"epoch": 0.5645184236733533,
"grad_norm": 0.7887812825481647,
"learning_rate": 4.753793655843362e-06,
"loss": 0.4531,
"step": 2480
},
{
"epoch": 0.566794707639778,
"grad_norm": 0.8266580905214915,
"learning_rate": 4.714123270659836e-06,
"loss": 0.4499,
"step": 2490
},
{
"epoch": 0.5690709916062029,
"grad_norm": 0.9772993025496673,
"learning_rate": 4.674470932389759e-06,
"loss": 0.462,
"step": 2500
},
{
"epoch": 0.5713472755726277,
"grad_norm": 0.7550741577854698,
"learning_rate": 4.634839144218047e-06,
"loss": 0.4424,
"step": 2510
},
{
"epoch": 0.5736235595390525,
"grad_norm": 0.8265913157075914,
"learning_rate": 4.595230408032324e-06,
"loss": 0.4468,
"step": 2520
},
{
"epoch": 0.5758998435054773,
"grad_norm": 0.7723721397391996,
"learning_rate": 4.555647224264978e-06,
"loss": 0.4448,
"step": 2530
},
{
"epoch": 0.5781761274719022,
"grad_norm": 0.7438678281440869,
"learning_rate": 4.516092091735324e-06,
"loss": 0.4537,
"step": 2540
},
{
"epoch": 0.5804524114383269,
"grad_norm": 0.827117915360568,
"learning_rate": 4.47656750749184e-06,
"loss": 0.4558,
"step": 2550
},
{
"epoch": 0.5827286954047517,
"grad_norm": 0.7273943037424042,
"learning_rate": 4.4370759666545495e-06,
"loss": 0.4444,
"step": 2560
},
{
"epoch": 0.5850049793711766,
"grad_norm": 0.7703519904997088,
"learning_rate": 4.397619962257498e-06,
"loss": 0.4481,
"step": 2570
},
{
"epoch": 0.5872812633376013,
"grad_norm": 0.803886348953792,
"learning_rate": 4.3582019850913796e-06,
"loss": 0.4487,
"step": 2580
},
{
"epoch": 0.5895575473040262,
"grad_norm": 0.7300835048061479,
"learning_rate": 4.3188245235462865e-06,
"loss": 0.4446,
"step": 2590
},
{
"epoch": 0.591833831270451,
"grad_norm": 0.8101791772935961,
"learning_rate": 4.2794900634546385e-06,
"loss": 0.4553,
"step": 2600
},
{
"epoch": 0.5941101152368758,
"grad_norm": 0.8207823650264575,
"learning_rate": 4.240201087934238e-06,
"loss": 0.4511,
"step": 2610
},
{
"epoch": 0.5963863992033006,
"grad_norm": 0.8176342960234186,
"learning_rate": 4.200960077231528e-06,
"loss": 0.4425,
"step": 2620
},
{
"epoch": 0.5986626831697254,
"grad_norm": 0.916935818899542,
"learning_rate": 4.161769508565012e-06,
"loss": 0.4442,
"step": 2630
},
{
"epoch": 0.6009389671361502,
"grad_norm": 0.7421424964297176,
"learning_rate": 4.122631855968873e-06,
"loss": 0.4509,
"step": 2640
},
{
"epoch": 0.603215251102575,
"grad_norm": 0.9115282778731496,
"learning_rate": 4.0835495901367955e-06,
"loss": 0.455,
"step": 2650
},
{
"epoch": 0.6054915350689999,
"grad_norm": 0.7816827264699414,
"learning_rate": 4.0445251782659875e-06,
"loss": 0.4381,
"step": 2660
},
{
"epoch": 0.6077678190354247,
"grad_norm": 2.194076497168195,
"learning_rate": 4.005561083901434e-06,
"loss": 0.4447,
"step": 2670
},
{
"epoch": 0.6100441030018495,
"grad_norm": 0.7573191196887984,
"learning_rate": 3.966659766780383e-06,
"loss": 0.4446,
"step": 2680
},
{
"epoch": 0.6123203869682743,
"grad_norm": 0.7687574770334604,
"learning_rate": 3.927823682677057e-06,
"loss": 0.4496,
"step": 2690
},
{
"epoch": 0.614596670934699,
"grad_norm": 0.9358386363240805,
"learning_rate": 3.889055283247628e-06,
"loss": 0.4568,
"step": 2700
},
{
"epoch": 0.6168729549011239,
"grad_norm": 0.7314708296261042,
"learning_rate": 3.850357015875456e-06,
"loss": 0.4446,
"step": 2710
},
{
"epoch": 0.6191492388675487,
"grad_norm": 0.8070457903866413,
"learning_rate": 3.8117313235165754e-06,
"loss": 0.4521,
"step": 2720
},
{
"epoch": 0.6214255228339736,
"grad_norm": 0.7257119700751845,
"learning_rate": 3.7731806445454856e-06,
"loss": 0.4427,
"step": 2730
},
{
"epoch": 0.6237018068003983,
"grad_norm": 0.7975212623445046,
"learning_rate": 3.7347074126012195e-06,
"loss": 0.4477,
"step": 2740
},
{
"epoch": 0.6259780907668232,
"grad_norm": 0.7714694828216863,
"learning_rate": 3.6963140564337074e-06,
"loss": 0.4538,
"step": 2750
},
{
"epoch": 0.628254374733248,
"grad_norm": 0.800630933191912,
"learning_rate": 3.658002999750462e-06,
"loss": 0.446,
"step": 2760
},
{
"epoch": 0.6305306586996727,
"grad_norm": 0.7408683914326505,
"learning_rate": 3.6197766610635656e-06,
"loss": 0.446,
"step": 2770
},
{
"epoch": 0.6328069426660976,
"grad_norm": 0.7609135405893662,
"learning_rate": 3.5816374535369934e-06,
"loss": 0.4416,
"step": 2780
},
{
"epoch": 0.6350832266325224,
"grad_norm": 0.8061635946819576,
"learning_rate": 3.543587784834288e-06,
"loss": 0.4385,
"step": 2790
},
{
"epoch": 0.6373595105989472,
"grad_norm": 0.8207487764566586,
"learning_rate": 3.5056300569665503e-06,
"loss": 0.4443,
"step": 2800
},
{
"epoch": 0.639635794565372,
"grad_norm": 0.7780495002255654,
"learning_rate": 3.4677666661408096e-06,
"loss": 0.4393,
"step": 2810
},
{
"epoch": 0.6419120785317969,
"grad_norm": 0.820529174759461,
"learning_rate": 3.4300000026087664e-06,
"loss": 0.448,
"step": 2820
},
{
"epoch": 0.6441883624982216,
"grad_norm": 0.7890561874485035,
"learning_rate": 3.392332450515886e-06,
"loss": 0.4489,
"step": 2830
},
{
"epoch": 0.6464646464646465,
"grad_norm": 0.8198574552313013,
"learning_rate": 3.3547663877508928e-06,
"loss": 0.4496,
"step": 2840
},
{
"epoch": 0.6487409304310713,
"grad_norm": 0.8209595374102931,
"learning_rate": 3.3173041857956716e-06,
"loss": 0.442,
"step": 2850
},
{
"epoch": 0.651017214397496,
"grad_norm": 0.8174067144646198,
"learning_rate": 3.2799482095755424e-06,
"loss": 0.4447,
"step": 2860
},
{
"epoch": 0.6532934983639209,
"grad_norm": 0.7098988026439182,
"learning_rate": 3.242700817309976e-06,
"loss": 0.4429,
"step": 2870
},
{
"epoch": 0.6555697823303457,
"grad_norm": 0.9250724312431224,
"learning_rate": 3.205564360363724e-06,
"loss": 0.4508,
"step": 2880
},
{
"epoch": 0.6578460662967706,
"grad_norm": 0.9050078757133033,
"learning_rate": 3.168541183098378e-06,
"loss": 0.447,
"step": 2890
},
{
"epoch": 0.6601223502631953,
"grad_norm": 0.7647575780260846,
"learning_rate": 3.131633622724377e-06,
"loss": 0.4521,
"step": 2900
},
{
"epoch": 0.6623986342296202,
"grad_norm": 0.749840221225747,
"learning_rate": 3.0948440091534594e-06,
"loss": 0.438,
"step": 2910
},
{
"epoch": 0.664674918196045,
"grad_norm": 0.7184423492925078,
"learning_rate": 3.058174664851582e-06,
"loss": 0.4465,
"step": 2920
},
{
"epoch": 0.6669512021624697,
"grad_norm": 0.7788843341529811,
"learning_rate": 3.0216279046923084e-06,
"loss": 0.4427,
"step": 2930
},
{
"epoch": 0.6692274861288946,
"grad_norm": 0.795166169097631,
"learning_rate": 2.9852060358106717e-06,
"loss": 0.4438,
"step": 2940
},
{
"epoch": 0.6715037700953194,
"grad_norm": 0.8307138251372255,
"learning_rate": 2.9489113574575272e-06,
"loss": 0.4467,
"step": 2950
},
{
"epoch": 0.6737800540617442,
"grad_norm": 0.7663349953487524,
"learning_rate": 2.912746160854417e-06,
"loss": 0.4491,
"step": 2960
},
{
"epoch": 0.676056338028169,
"grad_norm": 0.7838493815092107,
"learning_rate": 2.8767127290489084e-06,
"loss": 0.438,
"step": 2970
},
{
"epoch": 0.6783326219945939,
"grad_norm": 0.791824009698266,
"learning_rate": 2.840813336770487e-06,
"loss": 0.4372,
"step": 2980
},
{
"epoch": 0.6806089059610186,
"grad_norm": 0.73879969079544,
"learning_rate": 2.805050250286949e-06,
"loss": 0.4514,
"step": 2990
},
{
"epoch": 0.6828851899274434,
"grad_norm": 0.8099909989978515,
"learning_rate": 2.769425727261339e-06,
"loss": 0.4537,
"step": 3000
},
{
"epoch": 0.6851614738938683,
"grad_norm": 0.7590973946118406,
"learning_rate": 2.7339420166094183e-06,
"loss": 0.4463,
"step": 3010
},
{
"epoch": 0.687437757860293,
"grad_norm": 0.796201063821709,
"learning_rate": 2.6986013583577083e-06,
"loss": 0.4397,
"step": 3020
},
{
"epoch": 0.6897140418267179,
"grad_norm": 0.7050687084934512,
"learning_rate": 2.6634059835020733e-06,
"loss": 0.4268,
"step": 3030
},
{
"epoch": 0.6919903257931427,
"grad_norm": 1.0187421881981598,
"learning_rate": 2.628358113866881e-06,
"loss": 0.4438,
"step": 3040
},
{
"epoch": 0.6942666097595676,
"grad_norm": 0.8443744661543358,
"learning_rate": 2.5934599619647495e-06,
"loss": 0.4512,
"step": 3050
},
{
"epoch": 0.6965428937259923,
"grad_norm": 0.7864339330637931,
"learning_rate": 2.558713730856862e-06,
"loss": 0.4372,
"step": 3060
},
{
"epoch": 0.6988191776924171,
"grad_norm": 0.8039029994902843,
"learning_rate": 2.524121614013906e-06,
"loss": 0.447,
"step": 3070
},
{
"epoch": 0.701095461658842,
"grad_norm": 0.8554560338951394,
"learning_rate": 2.4896857951775973e-06,
"loss": 0.4418,
"step": 3080
},
{
"epoch": 0.7033717456252667,
"grad_norm": 0.7789235728757384,
"learning_rate": 2.455408448222814e-06,
"loss": 0.4428,
"step": 3090
},
{
"epoch": 0.7056480295916916,
"grad_norm": 0.8390767483792194,
"learning_rate": 2.4212917370203877e-06,
"loss": 0.4513,
"step": 3100
},
{
"epoch": 0.7079243135581164,
"grad_norm": 0.7423511083655429,
"learning_rate": 2.3873378153004736e-06,
"loss": 0.4415,
"step": 3110
},
{
"epoch": 0.7102005975245412,
"grad_norm": 0.7424046071658116,
"learning_rate": 2.3535488265166095e-06,
"loss": 0.4293,
"step": 3120
},
{
"epoch": 0.712476881490966,
"grad_norm": 0.7456496661301177,
"learning_rate": 2.319926903710398e-06,
"loss": 0.4438,
"step": 3130
},
{
"epoch": 0.7147531654573908,
"grad_norm": 0.8278781199522129,
"learning_rate": 2.2864741693768423e-06,
"loss": 0.4387,
"step": 3140
},
{
"epoch": 0.7170294494238156,
"grad_norm": 0.8116784082715538,
"learning_rate": 2.253192735330371e-06,
"loss": 0.4462,
"step": 3150
},
{
"epoch": 0.7193057333902404,
"grad_norm": 0.7004955096514237,
"learning_rate": 2.2200847025715142e-06,
"loss": 0.4398,
"step": 3160
},
{
"epoch": 0.7215820173566653,
"grad_norm": 0.7367447404574639,
"learning_rate": 2.1871521611542705e-06,
"loss": 0.4475,
"step": 3170
},
{
"epoch": 0.72385830132309,
"grad_norm": 0.7799543504096647,
"learning_rate": 2.1543971900541722e-06,
"loss": 0.443,
"step": 3180
},
{
"epoch": 0.7261345852895149,
"grad_norm": 0.8090558026400204,
"learning_rate": 2.1218218570370303e-06,
"loss": 0.4449,
"step": 3190
},
{
"epoch": 0.7284108692559397,
"grad_norm": 0.8879224467129067,
"learning_rate": 2.0894282185284147e-06,
"loss": 0.4484,
"step": 3200
},
{
"epoch": 0.7306871532223644,
"grad_norm": 0.7328095140462628,
"learning_rate": 2.057218319483828e-06,
"loss": 0.4414,
"step": 3210
},
{
"epoch": 0.7329634371887893,
"grad_norm": 0.7572339184999409,
"learning_rate": 2.0251941932596115e-06,
"loss": 0.4372,
"step": 3220
},
{
"epoch": 0.7352397211552141,
"grad_norm": 0.7491266549650365,
"learning_rate": 1.9933578614845784e-06,
"loss": 0.4393,
"step": 3230
},
{
"epoch": 0.737516005121639,
"grad_norm": 0.821203801856231,
"learning_rate": 1.961711333932407e-06,
"loss": 0.4507,
"step": 3240
},
{
"epoch": 0.7397922890880637,
"grad_norm": 0.8213618929354572,
"learning_rate": 1.930256608394747e-06,
"loss": 0.4404,
"step": 3250
},
{
"epoch": 0.7420685730544886,
"grad_norm": 0.7809128990538117,
"learning_rate": 1.898995670555112e-06,
"loss": 0.4338,
"step": 3260
},
{
"epoch": 0.7443448570209134,
"grad_norm": 0.7867605315635634,
"learning_rate": 1.8679304938635373e-06,
"loss": 0.4481,
"step": 3270
},
{
"epoch": 0.7466211409873381,
"grad_norm": 0.7482999924845694,
"learning_rate": 1.8370630394119742e-06,
"loss": 0.4343,
"step": 3280
},
{
"epoch": 0.748897424953763,
"grad_norm": 0.8060085030283564,
"learning_rate": 1.806395255810518e-06,
"loss": 0.4377,
"step": 3290
},
{
"epoch": 0.7511737089201878,
"grad_norm": 0.8755665289100689,
"learning_rate": 1.7759290790643696e-06,
"loss": 0.4451,
"step": 3300
},
{
"epoch": 0.7534499928866126,
"grad_norm": 0.8120416476683848,
"learning_rate": 1.745666432451638e-06,
"loss": 0.4387,
"step": 3310
},
{
"epoch": 0.7557262768530374,
"grad_norm": 0.8156630900998141,
"learning_rate": 1.7156092264019198e-06,
"loss": 0.4361,
"step": 3320
},
{
"epoch": 0.7580025608194623,
"grad_norm": 0.7998270545611499,
"learning_rate": 1.6857593583756915e-06,
"loss": 0.448,
"step": 3330
},
{
"epoch": 0.760278844785887,
"grad_norm": 0.8207266172010161,
"learning_rate": 1.6561187127445367e-06,
"loss": 0.4417,
"step": 3340
},
{
"epoch": 0.7625551287523119,
"grad_norm": 0.8067253437276407,
"learning_rate": 1.626689160672182e-06,
"loss": 0.4476,
"step": 3350
},
{
"epoch": 0.7648314127187367,
"grad_norm": 0.8853629851991137,
"learning_rate": 1.5974725599963776e-06,
"loss": 0.4325,
"step": 3360
},
{
"epoch": 0.7671076966851614,
"grad_norm": 0.921124886395691,
"learning_rate": 1.5684707551116074e-06,
"loss": 0.4385,
"step": 3370
},
{
"epoch": 0.7693839806515863,
"grad_norm": 0.9059423563712878,
"learning_rate": 1.5396855768526664e-06,
"loss": 0.4441,
"step": 3380
},
{
"epoch": 0.7716602646180111,
"grad_norm": 0.7726314288351178,
"learning_rate": 1.5111188423790773e-06,
"loss": 0.4367,
"step": 3390
},
{
"epoch": 0.773936548584436,
"grad_norm": 0.8371458674118885,
"learning_rate": 1.4827723550603706e-06,
"loss": 0.4494,
"step": 3400
},
{
"epoch": 0.7762128325508607,
"grad_norm": 0.9280929650984211,
"learning_rate": 1.4546479043622592e-06,
"loss": 0.4363,
"step": 3410
},
{
"epoch": 0.7784891165172856,
"grad_norm": 0.8794307651372741,
"learning_rate": 1.4267472657336473e-06,
"loss": 0.4398,
"step": 3420
},
{
"epoch": 0.7807654004837103,
"grad_norm": 0.8239130127708325,
"learning_rate": 1.3990722004945705e-06,
"loss": 0.4418,
"step": 3430
},
{
"epoch": 0.7830416844501351,
"grad_norm": 0.790534002165955,
"learning_rate": 1.371624455724998e-06,
"loss": 0.4457,
"step": 3440
},
{
"epoch": 0.78531796841656,
"grad_norm": 0.821515070101423,
"learning_rate": 1.3444057641545377e-06,
"loss": 0.4446,
"step": 3450
},
{
"epoch": 0.7875942523829847,
"grad_norm": 0.7821344445262979,
"learning_rate": 1.317417844053066e-06,
"loss": 0.4326,
"step": 3460
},
{
"epoch": 0.7898705363494096,
"grad_norm": 0.7672443240766755,
"learning_rate": 1.2906623991222384e-06,
"loss": 0.4392,
"step": 3470
},
{
"epoch": 0.7921468203158344,
"grad_norm": 0.8085724525323776,
"learning_rate": 1.2641411183879527e-06,
"loss": 0.4368,
"step": 3480
},
{
"epoch": 0.7944231042822593,
"grad_norm": 0.8245898899691535,
"learning_rate": 1.2378556760937172e-06,
"loss": 0.4383,
"step": 3490
},
{
"epoch": 0.796699388248684,
"grad_norm": 0.9491912069759774,
"learning_rate": 1.2118077315949555e-06,
"loss": 0.4433,
"step": 3500
},
{
"epoch": 0.7989756722151088,
"grad_norm": 0.766723308181965,
"learning_rate": 1.1859989292542617e-06,
"loss": 0.4391,
"step": 3510
},
{
"epoch": 0.8012519561815337,
"grad_norm": 0.9223119446048714,
"learning_rate": 1.16043089833759e-06,
"loss": 0.4353,
"step": 3520
},
{
"epoch": 0.8035282401479584,
"grad_norm": 0.8683331011804727,
"learning_rate": 1.1351052529114031e-06,
"loss": 0.4481,
"step": 3530
},
{
"epoch": 0.8058045241143833,
"grad_norm": 0.8456488216541104,
"learning_rate": 1.1100235917407749e-06,
"loss": 0.4423,
"step": 3540
},
{
"epoch": 0.8080808080808081,
"grad_norm": 0.8435393816637614,
"learning_rate": 1.0851874981884703e-06,
"loss": 0.4392,
"step": 3550
},
{
"epoch": 0.8103570920472329,
"grad_norm": 0.8551429696631416,
"learning_rate": 1.0605985401149854e-06,
"loss": 0.4373,
"step": 3560
},
{
"epoch": 0.8126333760136577,
"grad_norm": 0.8326619193733407,
"learning_rate": 1.0362582697795736e-06,
"loss": 0.4403,
"step": 3570
},
{
"epoch": 0.8149096599800825,
"grad_norm": 0.7750783611846149,
"learning_rate": 1.012168223742252e-06,
"loss": 0.4358,
"step": 3580
},
{
"epoch": 0.8171859439465073,
"grad_norm": 0.74086430166713,
"learning_rate": 9.883299227667997e-07,
"loss": 0.4376,
"step": 3590
},
{
"epoch": 0.8194622279129321,
"grad_norm": 0.8021836339934405,
"learning_rate": 9.647448717247598e-07,
"loss": 0.446,
"step": 3600
},
{
"epoch": 0.821738511879357,
"grad_norm": 0.9512917372490248,
"learning_rate": 9.414145595004365e-07,
"loss": 0.4342,
"step": 3610
},
{
"epoch": 0.8240147958457817,
"grad_norm": 0.8218941611192476,
"learning_rate": 9.183404588968981e-07,
"loss": 0.4389,
"step": 3620
},
{
"epoch": 0.8262910798122066,
"grad_norm": 0.8452290036325213,
"learning_rate": 8.955240265430182e-07,
"loss": 0.4352,
"step": 3630
},
{
"epoch": 0.8285673637786314,
"grad_norm": 0.8478009707657702,
"learning_rate": 8.729667028014999e-07,
"loss": 0.4512,
"step": 3640
},
{
"epoch": 0.8308436477450561,
"grad_norm": 0.9736536719337533,
"learning_rate": 8.506699116779643e-07,
"loss": 0.4359,
"step": 3650
},
{
"epoch": 0.833119931711481,
"grad_norm": 0.7223652613461555,
"learning_rate": 8.286350607310506e-07,
"loss": 0.434,
"step": 3660
},
{
"epoch": 0.8353962156779058,
"grad_norm": 0.7773708358623372,
"learning_rate": 8.068635409835541e-07,
"loss": 0.4367,
"step": 3670
},
{
"epoch": 0.8376724996443307,
"grad_norm": 0.8191260289584852,
"learning_rate": 7.853567268346212e-07,
"loss": 0.4427,
"step": 3680
},
{
"epoch": 0.8399487836107554,
"grad_norm": 0.8476046937498816,
"learning_rate": 7.641159759729821e-07,
"loss": 0.439,
"step": 3690
},
{
"epoch": 0.8422250675771803,
"grad_norm": 0.854034845886361,
"learning_rate": 7.431426292912414e-07,
"loss": 0.4458,
"step": 3700
},
{
"epoch": 0.8445013515436051,
"grad_norm": 0.8815331913866332,
"learning_rate": 7.224380108012325e-07,
"loss": 0.4299,
"step": 3710
},
{
"epoch": 0.8467776355100298,
"grad_norm": 0.7558898242505027,
"learning_rate": 7.020034275504329e-07,
"loss": 0.4363,
"step": 3720
},
{
"epoch": 0.8490539194764547,
"grad_norm": 0.8092910834412402,
"learning_rate": 6.81840169539451e-07,
"loss": 0.4374,
"step": 3730
},
{
"epoch": 0.8513302034428795,
"grad_norm": 0.7669287363471868,
"learning_rate": 6.619495096405959e-07,
"loss": 0.4405,
"step": 3740
},
{
"epoch": 0.8536064874093043,
"grad_norm": 0.794026413586518,
"learning_rate": 6.423327035175186e-07,
"loss": 0.447,
"step": 3750
},
{
"epoch": 0.8558827713757291,
"grad_norm": 0.7408546983770723,
"learning_rate": 6.229909895459429e-07,
"loss": 0.4315,
"step": 3760
},
{
"epoch": 0.858159055342154,
"grad_norm": 0.7237210059574698,
"learning_rate": 6.039255887354966e-07,
"loss": 0.4391,
"step": 3770
},
{
"epoch": 0.8604353393085787,
"grad_norm": 0.9595044766938607,
"learning_rate": 5.851377046526208e-07,
"loss": 0.4427,
"step": 3780
},
{
"epoch": 0.8627116232750035,
"grad_norm": 0.8802588662536729,
"learning_rate": 5.666285233445978e-07,
"loss": 0.4447,
"step": 3790
},
{
"epoch": 0.8649879072414284,
"grad_norm": 0.8249439161204124,
"learning_rate": 5.483992132646781e-07,
"loss": 0.4433,
"step": 3800
},
{
"epoch": 0.8672641912078531,
"grad_norm": 0.7832517731345501,
"learning_rate": 5.304509251983103e-07,
"loss": 0.4358,
"step": 3810
},
{
"epoch": 0.869540475174278,
"grad_norm": 0.7248468716835277,
"learning_rate": 5.127847921905076e-07,
"loss": 0.4449,
"step": 3820
},
{
"epoch": 0.8718167591407028,
"grad_norm": 0.7513659638517941,
"learning_rate": 4.954019294743045e-07,
"loss": 0.4448,
"step": 3830
},
{
"epoch": 0.8740930431071277,
"grad_norm": 0.9357495352200004,
"learning_rate": 4.783034344003673e-07,
"loss": 0.4398,
"step": 3840
},
{
"epoch": 0.8763693270735524,
"grad_norm": 0.7702794579988504,
"learning_rate": 4.6149038636771337e-07,
"loss": 0.4396,
"step": 3850
},
{
"epoch": 0.8786456110399773,
"grad_norm": 0.7237585600700893,
"learning_rate": 4.449638467555706e-07,
"loss": 0.4369,
"step": 3860
},
{
"epoch": 0.8809218950064021,
"grad_norm": 0.7689094322016036,
"learning_rate": 4.2872485885637803e-07,
"loss": 0.4419,
"step": 3870
},
{
"epoch": 0.8831981789728268,
"grad_norm": 0.6768771986551186,
"learning_rate": 4.1277444780992215e-07,
"loss": 0.4337,
"step": 3880
},
{
"epoch": 0.8854744629392517,
"grad_norm": 0.805045121729404,
"learning_rate": 3.9711362053862115e-07,
"loss": 0.4284,
"step": 3890
},
{
"epoch": 0.8877507469056765,
"grad_norm": 0.9172147345939642,
"learning_rate": 3.817433656839586e-07,
"loss": 0.4446,
"step": 3900
},
{
"epoch": 0.8900270308721013,
"grad_norm": 0.817219220444952,
"learning_rate": 3.6666465354407766e-07,
"loss": 0.4378,
"step": 3910
},
{
"epoch": 0.8923033148385261,
"grad_norm": 0.8368161427901388,
"learning_rate": 3.5187843601252157e-07,
"loss": 0.4396,
"step": 3920
},
{
"epoch": 0.894579598804951,
"grad_norm": 0.8185450548355138,
"learning_rate": 3.373856465181424e-07,
"loss": 0.4364,
"step": 3930
},
{
"epoch": 0.8968558827713757,
"grad_norm": 0.7618894593947647,
"learning_rate": 3.231871999661845e-07,
"loss": 0.4383,
"step": 3940
},
{
"epoch": 0.8991321667378005,
"grad_norm": 0.956580014268259,
"learning_rate": 3.0928399268051247e-07,
"loss": 0.442,
"step": 3950
},
{
"epoch": 0.9014084507042254,
"grad_norm": 0.800968878170199,
"learning_rate": 2.9567690234704295e-07,
"loss": 0.4395,
"step": 3960
},
{
"epoch": 0.9036847346706501,
"grad_norm": 0.8086633377655379,
"learning_rate": 2.8236678795832863e-07,
"loss": 0.4347,
"step": 3970
},
{
"epoch": 0.905961018637075,
"grad_norm": 0.7394969702194686,
"learning_rate": 2.693544897593325e-07,
"loss": 0.4359,
"step": 3980
},
{
"epoch": 0.9082373026034998,
"grad_norm": 0.786442599221493,
"learning_rate": 2.566408291943906e-07,
"loss": 0.4483,
"step": 3990
},
{
"epoch": 0.9105135865699246,
"grad_norm": 0.7730250654140295,
"learning_rate": 2.4422660885534635e-07,
"loss": 0.4506,
"step": 4000
},
{
"epoch": 0.9127898705363494,
"grad_norm": 0.7971290026825717,
"learning_rate": 2.3211261243089255e-07,
"loss": 0.4351,
"step": 4010
},
{
"epoch": 0.9150661545027742,
"grad_norm": 0.7956171103006121,
"learning_rate": 2.2029960465709433e-07,
"loss": 0.4358,
"step": 4020
},
{
"epoch": 0.917342438469199,
"grad_norm": 0.7975596769757733,
"learning_rate": 2.0878833126911135e-07,
"loss": 0.4429,
"step": 4030
},
{
"epoch": 0.9196187224356238,
"grad_norm": 0.7720277466201635,
"learning_rate": 1.9757951895412576e-07,
"loss": 0.4352,
"step": 4040
},
{
"epoch": 0.9218950064020487,
"grad_norm": 0.7853839927041535,
"learning_rate": 1.866738753054631e-07,
"loss": 0.4551,
"step": 4050
},
{
"epoch": 0.9241712903684735,
"grad_norm": 0.7825373294798991,
"learning_rate": 1.7607208877792604e-07,
"loss": 0.4417,
"step": 4060
},
{
"epoch": 0.9264475743348983,
"grad_norm": 0.8184549696249286,
"learning_rate": 1.6577482864432946e-07,
"loss": 0.4399,
"step": 4070
},
{
"epoch": 0.9287238583013231,
"grad_norm": 0.7713360616227353,
"learning_rate": 1.5578274495325618e-07,
"loss": 0.4329,
"step": 4080
},
{
"epoch": 0.9310001422677479,
"grad_norm": 0.7499304967763705,
"learning_rate": 1.4609646848801561e-07,
"loss": 0.4378,
"step": 4090
},
{
"epoch": 0.9332764262341727,
"grad_norm": 0.8556735326094547,
"learning_rate": 1.3671661072682585e-07,
"loss": 0.4463,
"step": 4100
},
{
"epoch": 0.9355527102005975,
"grad_norm": 0.7950794399913543,
"learning_rate": 1.276437638042116e-07,
"loss": 0.4324,
"step": 4110
},
{
"epoch": 0.9378289941670224,
"grad_norm": 0.8029386901058022,
"learning_rate": 1.1887850047362315e-07,
"loss": 0.4418,
"step": 4120
},
{
"epoch": 0.9401052781334471,
"grad_norm": 0.8040559392859772,
"learning_rate": 1.104213740712795e-07,
"loss": 0.4432,
"step": 4130
},
{
"epoch": 0.942381562099872,
"grad_norm": 0.7956315398376556,
"learning_rate": 1.0227291848123932e-07,
"loss": 0.4443,
"step": 4140
},
{
"epoch": 0.9446578460662968,
"grad_norm": 0.8316381693144498,
"learning_rate": 9.443364810169331e-08,
"loss": 0.4479,
"step": 4150
},
{
"epoch": 0.9469341300327215,
"grad_norm": 0.7160374953454977,
"learning_rate": 8.690405781249745e-08,
"loss": 0.4394,
"step": 4160
},
{
"epoch": 0.9492104139991464,
"grad_norm": 1.1619693324211688,
"learning_rate": 7.96846229439241e-08,
"loss": 0.4391,
"step": 4170
},
{
"epoch": 0.9514866979655712,
"grad_norm": 0.7807355700189923,
"learning_rate": 7.277579924666322e-08,
"loss": 0.4431,
"step": 4180
},
{
"epoch": 0.953762981931996,
"grad_norm": 0.799461080581998,
"learning_rate": 6.617802286304597e-08,
"loss": 0.4465,
"step": 4190
},
{
"epoch": 0.9560392658984208,
"grad_norm": 0.8615155608414043,
"learning_rate": 5.989171029951446e-08,
"loss": 0.4545,
"step": 4200
},
{
"epoch": 0.9583155498648457,
"grad_norm": 1.7829375204373485,
"learning_rate": 5.391725840032724e-08,
"loss": 0.4361,
"step": 4210
},
{
"epoch": 0.9605918338312704,
"grad_norm": 0.8158290333293733,
"learning_rate": 4.8255044322507714e-08,
"loss": 0.4319,
"step": 4220
},
{
"epoch": 0.9628681177976952,
"grad_norm": 0.7479808786166098,
"learning_rate": 4.290542551203536e-08,
"loss": 0.4452,
"step": 4230
},
{
"epoch": 0.9651444017641201,
"grad_norm": 0.823075917374607,
"learning_rate": 3.7868739681278796e-08,
"loss": 0.4395,
"step": 4240
},
{
"epoch": 0.9674206857305448,
"grad_norm": 0.8193285422862143,
"learning_rate": 3.314530478768008e-08,
"loss": 0.4378,
"step": 4250
},
{
"epoch": 0.9696969696969697,
"grad_norm": 0.7169988301586265,
"learning_rate": 2.8735419013677934e-08,
"loss": 0.4368,
"step": 4260
},
{
"epoch": 0.9719732536633945,
"grad_norm": 0.867983394784437,
"learning_rate": 2.4639360747888974e-08,
"loss": 0.4433,
"step": 4270
},
{
"epoch": 0.9742495376298194,
"grad_norm": 0.8926574534891935,
"learning_rate": 2.0857388567529502e-08,
"loss": 0.4366,
"step": 4280
},
{
"epoch": 0.9765258215962441,
"grad_norm": 0.8314945073143262,
"learning_rate": 1.738974122209358e-08,
"loss": 0.4472,
"step": 4290
},
{
"epoch": 0.9788021055626689,
"grad_norm": 0.8241979804740377,
"learning_rate": 1.4236637618282312e-08,
"loss": 0.4496,
"step": 4300
},
{
"epoch": 0.9810783895290938,
"grad_norm": 0.7393166267679743,
"learning_rate": 1.1398276806182107e-08,
"loss": 0.4315,
"step": 4310
},
{
"epoch": 0.9833546734955185,
"grad_norm": 0.7415102824589282,
"learning_rate": 8.874837966700855e-09,
"loss": 0.433,
"step": 4320
},
{
"epoch": 0.9856309574619434,
"grad_norm": 0.7946710429947246,
"learning_rate": 6.6664804002564145e-09,
"loss": 0.4364,
"step": 4330
},
{
"epoch": 0.9879072414283682,
"grad_norm": 0.8084200476976278,
"learning_rate": 4.773343516718543e-09,
"loss": 0.4312,
"step": 4340
},
{
"epoch": 0.990183525394793,
"grad_norm": 0.8535152486368869,
"learning_rate": 3.1955468266120505e-09,
"loss": 0.4462,
"step": 4350
},
{
"epoch": 0.9924598093612178,
"grad_norm": 0.7974643794010233,
"learning_rate": 1.9331899335661708e-09,
"loss": 0.4357,
"step": 4360
},
{
"epoch": 0.9947360933276427,
"grad_norm": 0.7710537576596679,
"learning_rate": 9.863525280340292e-10,
"loss": 0.4337,
"step": 4370
},
{
"epoch": 0.9970123772940674,
"grad_norm": 0.7708116142325829,
"learning_rate": 3.550943822550057e-10,
"loss": 0.4338,
"step": 4380
},
{
"epoch": 0.9992886612604922,
"grad_norm": 0.8061784317503751,
"learning_rate": 3.9455346487193846e-11,
"loss": 0.4427,
"step": 4390
},
{
"epoch": 1.0,
"step": 4394,
"total_flos": 9.741193804139987e+18,
"train_loss": 0.29473522613414266,
"train_runtime": 212859.8204,
"train_samples_per_second": 1.981,
"train_steps_per_second": 0.021
}
],
"logging_steps": 10,
"max_steps": 4394,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.741193804139987e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}