1954 lines
47 KiB
JSON
1954 lines
47 KiB
JSON
|
|
{
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 2.987419245154709,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 273,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.01088065283917035,
|
||
|
|
"grad_norm": 5.769476412265319,
|
||
|
|
"learning_rate": 2.8571428571428573e-06,
|
||
|
|
"loss": 0.8514,
|
||
|
|
"step": 1
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0217613056783407,
|
||
|
|
"grad_norm": 5.824165855722914,
|
||
|
|
"learning_rate": 5.7142857142857145e-06,
|
||
|
|
"loss": 0.8564,
|
||
|
|
"step": 2
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.032641958517511054,
|
||
|
|
"grad_norm": 5.385924326553742,
|
||
|
|
"learning_rate": 8.571428571428571e-06,
|
||
|
|
"loss": 0.8354,
|
||
|
|
"step": 3
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0435226113566814,
|
||
|
|
"grad_norm": 2.307280123324516,
|
||
|
|
"learning_rate": 1.1428571428571429e-05,
|
||
|
|
"loss": 0.771,
|
||
|
|
"step": 4
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05440326419585175,
|
||
|
|
"grad_norm": 3.8643476531037586,
|
||
|
|
"learning_rate": 1.4285714285714287e-05,
|
||
|
|
"loss": 0.7531,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06528391703502211,
|
||
|
|
"grad_norm": 4.185929449072588,
|
||
|
|
"learning_rate": 1.7142857142857142e-05,
|
||
|
|
"loss": 0.7605,
|
||
|
|
"step": 6
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07616456987419246,
|
||
|
|
"grad_norm": 4.238592120499111,
|
||
|
|
"learning_rate": 2e-05,
|
||
|
|
"loss": 0.7288,
|
||
|
|
"step": 7
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0870452227133628,
|
||
|
|
"grad_norm": 2.702549334633677,
|
||
|
|
"learning_rate": 2.2857142857142858e-05,
|
||
|
|
"loss": 0.7069,
|
||
|
|
"step": 8
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09792587555253315,
|
||
|
|
"grad_norm": 2.8265702023660415,
|
||
|
|
"learning_rate": 2.5714285714285718e-05,
|
||
|
|
"loss": 0.6744,
|
||
|
|
"step": 9
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1088065283917035,
|
||
|
|
"grad_norm": 2.1843367934470335,
|
||
|
|
"learning_rate": 2.8571428571428574e-05,
|
||
|
|
"loss": 0.6654,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11968718123087385,
|
||
|
|
"grad_norm": 1.5063626985021334,
|
||
|
|
"learning_rate": 3.142857142857143e-05,
|
||
|
|
"loss": 0.6374,
|
||
|
|
"step": 11
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13056783407004421,
|
||
|
|
"grad_norm": 1.3685321573654194,
|
||
|
|
"learning_rate": 3.4285714285714284e-05,
|
||
|
|
"loss": 0.6326,
|
||
|
|
"step": 12
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14144848690921455,
|
||
|
|
"grad_norm": 1.4488139563790012,
|
||
|
|
"learning_rate": 3.714285714285715e-05,
|
||
|
|
"loss": 0.6164,
|
||
|
|
"step": 13
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1523291397483849,
|
||
|
|
"grad_norm": 1.1107459336385455,
|
||
|
|
"learning_rate": 4e-05,
|
||
|
|
"loss": 0.6096,
|
||
|
|
"step": 14
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16320979258755525,
|
||
|
|
"grad_norm": 1.4034801732959803,
|
||
|
|
"learning_rate": 4.2857142857142856e-05,
|
||
|
|
"loss": 0.5988,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1740904454267256,
|
||
|
|
"grad_norm": 1.1173039797504758,
|
||
|
|
"learning_rate": 4.5714285714285716e-05,
|
||
|
|
"loss": 0.5979,
|
||
|
|
"step": 16
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18497109826589594,
|
||
|
|
"grad_norm": 1.3758839510979688,
|
||
|
|
"learning_rate": 4.857142857142857e-05,
|
||
|
|
"loss": 0.5824,
|
||
|
|
"step": 17
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1958517511050663,
|
||
|
|
"grad_norm": 0.9901489591062091,
|
||
|
|
"learning_rate": 5.1428571428571436e-05,
|
||
|
|
"loss": 0.5851,
|
||
|
|
"step": 18
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20673240394423664,
|
||
|
|
"grad_norm": 1.3298335358386597,
|
||
|
|
"learning_rate": 5.4285714285714295e-05,
|
||
|
|
"loss": 0.5798,
|
||
|
|
"step": 19
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.217613056783407,
|
||
|
|
"grad_norm": 1.1614826605670152,
|
||
|
|
"learning_rate": 5.714285714285715e-05,
|
||
|
|
"loss": 0.5623,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22849370962257737,
|
||
|
|
"grad_norm": 1.3114977506132772,
|
||
|
|
"learning_rate": 6.000000000000001e-05,
|
||
|
|
"loss": 0.5617,
|
||
|
|
"step": 21
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2393743624617477,
|
||
|
|
"grad_norm": 0.9156072730087671,
|
||
|
|
"learning_rate": 6.285714285714286e-05,
|
||
|
|
"loss": 0.554,
|
||
|
|
"step": 22
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25025501530091804,
|
||
|
|
"grad_norm": 1.5985352795964587,
|
||
|
|
"learning_rate": 6.571428571428571e-05,
|
||
|
|
"loss": 0.5728,
|
||
|
|
"step": 23
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26113566814008843,
|
||
|
|
"grad_norm": 1.347383875892142,
|
||
|
|
"learning_rate": 6.857142857142857e-05,
|
||
|
|
"loss": 0.5663,
|
||
|
|
"step": 24
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27201632097925876,
|
||
|
|
"grad_norm": 0.8364475716969787,
|
||
|
|
"learning_rate": 7.142857142857143e-05,
|
||
|
|
"loss": 0.5589,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2828969738184291,
|
||
|
|
"grad_norm": 1.6877024422160516,
|
||
|
|
"learning_rate": 7.42857142857143e-05,
|
||
|
|
"loss": 0.5561,
|
||
|
|
"step": 26
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29377762665759943,
|
||
|
|
"grad_norm": 0.8544602518620728,
|
||
|
|
"learning_rate": 7.714285714285715e-05,
|
||
|
|
"loss": 0.5371,
|
||
|
|
"step": 27
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3046582794967698,
|
||
|
|
"grad_norm": 1.10746206935203,
|
||
|
|
"learning_rate": 8e-05,
|
||
|
|
"loss": 0.5482,
|
||
|
|
"step": 28
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31553893233594016,
|
||
|
|
"grad_norm": 0.8901965542271334,
|
||
|
|
"learning_rate": 7.999671154713278e-05,
|
||
|
|
"loss": 0.5468,
|
||
|
|
"step": 29
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3264195851751105,
|
||
|
|
"grad_norm": 1.3171275211566689,
|
||
|
|
"learning_rate": 7.99868467292272e-05,
|
||
|
|
"loss": 0.5484,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3373002380142809,
|
||
|
|
"grad_norm": 1.119133823137938,
|
||
|
|
"learning_rate": 7.997040716828271e-05,
|
||
|
|
"loss": 0.5391,
|
||
|
|
"step": 31
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3481808908534512,
|
||
|
|
"grad_norm": 766.2890164269288,
|
||
|
|
"learning_rate": 7.994739556733538e-05,
|
||
|
|
"loss": 7.0977,
|
||
|
|
"step": 32
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35906154369262155,
|
||
|
|
"grad_norm": 360.8841002867855,
|
||
|
|
"learning_rate": 7.991781571001347e-05,
|
||
|
|
"loss": 8.9951,
|
||
|
|
"step": 33
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3699421965317919,
|
||
|
|
"grad_norm": 378.0503338631233,
|
||
|
|
"learning_rate": 7.988167245991528e-05,
|
||
|
|
"loss": 6.6897,
|
||
|
|
"step": 34
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3808228493709623,
|
||
|
|
"grad_norm": 7.672002821902226,
|
||
|
|
"learning_rate": 7.983897175980957e-05,
|
||
|
|
"loss": 0.7981,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3917035022101326,
|
||
|
|
"grad_norm": 4.42825926220236,
|
||
|
|
"learning_rate": 7.97897206306583e-05,
|
||
|
|
"loss": 0.7036,
|
||
|
|
"step": 36
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40258415504930295,
|
||
|
|
"grad_norm": 13.3824847890278,
|
||
|
|
"learning_rate": 7.973392717046233e-05,
|
||
|
|
"loss": 0.6454,
|
||
|
|
"step": 37
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4134648078884733,
|
||
|
|
"grad_norm": 4.184331419144958,
|
||
|
|
"learning_rate": 7.967160055292984e-05,
|
||
|
|
"loss": 0.6386,
|
||
|
|
"step": 38
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4243454607276437,
|
||
|
|
"grad_norm": 2.5999303493722135,
|
||
|
|
"learning_rate": 7.960275102596809e-05,
|
||
|
|
"loss": 0.6128,
|
||
|
|
"step": 39
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.435226113566814,
|
||
|
|
"grad_norm": 5.547292352391667,
|
||
|
|
"learning_rate": 7.952738990999824e-05,
|
||
|
|
"loss": 0.5773,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44610676640598435,
|
||
|
|
"grad_norm": 2.1944412304611167,
|
||
|
|
"learning_rate": 7.94455295960942e-05,
|
||
|
|
"loss": 0.569,
|
||
|
|
"step": 41
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45698741924515474,
|
||
|
|
"grad_norm": 1.3921897468618165,
|
||
|
|
"learning_rate": 7.93571835439452e-05,
|
||
|
|
"loss": 0.5501,
|
||
|
|
"step": 42
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46786807208432507,
|
||
|
|
"grad_norm": 1.3666229580107998,
|
||
|
|
"learning_rate": 7.926236627964262e-05,
|
||
|
|
"loss": 0.5486,
|
||
|
|
"step": 43
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4787487249234954,
|
||
|
|
"grad_norm": 1.2707740638081064,
|
||
|
|
"learning_rate": 7.916109339329173e-05,
|
||
|
|
"loss": 0.5452,
|
||
|
|
"step": 44
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48962937776266574,
|
||
|
|
"grad_norm": 0.7728181787873867,
|
||
|
|
"learning_rate": 7.905338153644818e-05,
|
||
|
|
"loss": 0.5349,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5005100306018361,
|
||
|
|
"grad_norm": 1.081424627354813,
|
||
|
|
"learning_rate": 7.89392484193802e-05,
|
||
|
|
"loss": 0.5359,
|
||
|
|
"step": 46
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5113906834410065,
|
||
|
|
"grad_norm": 0.8451219565877732,
|
||
|
|
"learning_rate": 7.881871280815659e-05,
|
||
|
|
"loss": 0.5192,
|
||
|
|
"step": 47
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5222713362801769,
|
||
|
|
"grad_norm": 0.9838361376918019,
|
||
|
|
"learning_rate": 7.869179452156118e-05,
|
||
|
|
"loss": 0.523,
|
||
|
|
"step": 48
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5331519891193471,
|
||
|
|
"grad_norm": 0.7178144668508193,
|
||
|
|
"learning_rate": 7.855851442783414e-05,
|
||
|
|
"loss": 0.5277,
|
||
|
|
"step": 49
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5440326419585175,
|
||
|
|
"grad_norm": 0.8653884854595517,
|
||
|
|
"learning_rate": 7.841889444124078e-05,
|
||
|
|
"loss": 0.5171,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5549132947976878,
|
||
|
|
"grad_norm": 0.9615871234590915,
|
||
|
|
"learning_rate": 7.827295751846836e-05,
|
||
|
|
"loss": 0.5228,
|
||
|
|
"step": 51
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5657939476368582,
|
||
|
|
"grad_norm": 0.6109242817024244,
|
||
|
|
"learning_rate": 7.81207276548515e-05,
|
||
|
|
"loss": 0.5176,
|
||
|
|
"step": 52
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5766746004760286,
|
||
|
|
"grad_norm": 0.7262379927240948,
|
||
|
|
"learning_rate": 7.796222988042676e-05,
|
||
|
|
"loss": 0.5173,
|
||
|
|
"step": 53
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5875552533151989,
|
||
|
|
"grad_norm": 0.8000849152573422,
|
||
|
|
"learning_rate": 7.779749025581717e-05,
|
||
|
|
"loss": 0.5112,
|
||
|
|
"step": 54
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5984359061543693,
|
||
|
|
"grad_norm": 0.6837286870058386,
|
||
|
|
"learning_rate": 7.762653586794731e-05,
|
||
|
|
"loss": 0.5207,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6093165589935396,
|
||
|
|
"grad_norm": 0.502300421602433,
|
||
|
|
"learning_rate": 7.74493948255895e-05,
|
||
|
|
"loss": 0.5023,
|
||
|
|
"step": 56
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6201972118327099,
|
||
|
|
"grad_norm": 0.5742765191011999,
|
||
|
|
"learning_rate": 7.726609625474218e-05,
|
||
|
|
"loss": 0.493,
|
||
|
|
"step": 57
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6310778646718803,
|
||
|
|
"grad_norm": 0.7752392361844105,
|
||
|
|
"learning_rate": 7.707667029384088e-05,
|
||
|
|
"loss": 0.5002,
|
||
|
|
"step": 58
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6419585175110507,
|
||
|
|
"grad_norm": 0.8003464724621063,
|
||
|
|
"learning_rate": 7.688114808880283e-05,
|
||
|
|
"loss": 0.5014,
|
||
|
|
"step": 59
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.652839170350221,
|
||
|
|
"grad_norm": 0.4467202145391146,
|
||
|
|
"learning_rate": 7.667956178790582e-05,
|
||
|
|
"loss": 0.4932,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6637198231893914,
|
||
|
|
"grad_norm": 0.47406542774154264,
|
||
|
|
"learning_rate": 7.647194453650228e-05,
|
||
|
|
"loss": 0.5052,
|
||
|
|
"step": 61
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6746004760285618,
|
||
|
|
"grad_norm": 0.61628828690436,
|
||
|
|
"learning_rate": 7.625833047156953e-05,
|
||
|
|
"loss": 0.48,
|
||
|
|
"step": 62
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.685481128867732,
|
||
|
|
"grad_norm": 0.6613121807693437,
|
||
|
|
"learning_rate": 7.603875471609677e-05,
|
||
|
|
"loss": 0.496,
|
||
|
|
"step": 63
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6963617817069024,
|
||
|
|
"grad_norm": 0.5547782762077185,
|
||
|
|
"learning_rate": 7.581325337331013e-05,
|
||
|
|
"loss": 0.4961,
|
||
|
|
"step": 64
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7072424345460727,
|
||
|
|
"grad_norm": 0.4876382130153157,
|
||
|
|
"learning_rate": 7.558186352073648e-05,
|
||
|
|
"loss": 0.4835,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7181230873852431,
|
||
|
|
"grad_norm": 0.5304127406212,
|
||
|
|
"learning_rate": 7.534462320410702e-05,
|
||
|
|
"loss": 0.4935,
|
||
|
|
"step": 66
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7290037402244135,
|
||
|
|
"grad_norm": 0.41710180085319953,
|
||
|
|
"learning_rate": 7.510157143110172e-05,
|
||
|
|
"loss": 0.4906,
|
||
|
|
"step": 67
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7398843930635838,
|
||
|
|
"grad_norm": 0.48357676956883017,
|
||
|
|
"learning_rate": 7.485274816493558e-05,
|
||
|
|
"loss": 0.4924,
|
||
|
|
"step": 68
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7507650459027542,
|
||
|
|
"grad_norm": 0.5917098078277447,
|
||
|
|
"learning_rate": 7.459819431778775e-05,
|
||
|
|
"loss": 0.4947,
|
||
|
|
"step": 69
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7616456987419246,
|
||
|
|
"grad_norm": 0.7595830676091241,
|
||
|
|
"learning_rate": 7.433795174407465e-05,
|
||
|
|
"loss": 0.4817,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7725263515810948,
|
||
|
|
"grad_norm": 0.7936800209738104,
|
||
|
|
"learning_rate": 7.407206323356818e-05,
|
||
|
|
"loss": 0.4918,
|
||
|
|
"step": 71
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7834070044202652,
|
||
|
|
"grad_norm": 0.7864537130215999,
|
||
|
|
"learning_rate": 7.380057250436006e-05,
|
||
|
|
"loss": 0.4887,
|
||
|
|
"step": 72
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7942876572594356,
|
||
|
|
"grad_norm": 0.49916613247579816,
|
||
|
|
"learning_rate": 7.352352419567362e-05,
|
||
|
|
"loss": 0.4816,
|
||
|
|
"step": 73
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8051683100986059,
|
||
|
|
"grad_norm": 0.41101864085328815,
|
||
|
|
"learning_rate": 7.324096386052416e-05,
|
||
|
|
"loss": 0.485,
|
||
|
|
"step": 74
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8160489629377763,
|
||
|
|
"grad_norm": 0.656161004995155,
|
||
|
|
"learning_rate": 7.295293795822887e-05,
|
||
|
|
"loss": 0.4744,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8269296157769466,
|
||
|
|
"grad_norm": 0.5394326858320477,
|
||
|
|
"learning_rate": 7.265949384676795e-05,
|
||
|
|
"loss": 0.4722,
|
||
|
|
"step": 76
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.837810268616117,
|
||
|
|
"grad_norm": 0.2642185151465239,
|
||
|
|
"learning_rate": 7.236067977499791e-05,
|
||
|
|
"loss": 0.4807,
|
||
|
|
"step": 77
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8486909214552874,
|
||
|
|
"grad_norm": 0.5422041951823006,
|
||
|
|
"learning_rate": 7.205654487471826e-05,
|
||
|
|
"loss": 0.4797,
|
||
|
|
"step": 78
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8595715742944576,
|
||
|
|
"grad_norm": 0.5501247208421062,
|
||
|
|
"learning_rate": 7.174713915259331e-05,
|
||
|
|
"loss": 0.475,
|
||
|
|
"step": 79
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.870452227133628,
|
||
|
|
"grad_norm": 0.4027113374681404,
|
||
|
|
"learning_rate": 7.143251348192971e-05,
|
||
|
|
"loss": 0.4677,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8813328799727984,
|
||
|
|
"grad_norm": 0.46847225987486674,
|
||
|
|
"learning_rate": 7.111271959431189e-05,
|
||
|
|
"loss": 0.4793,
|
||
|
|
"step": 81
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8922135328119687,
|
||
|
|
"grad_norm": 0.37608598585235453,
|
||
|
|
"learning_rate": 7.078781007109625e-05,
|
||
|
|
"loss": 0.4695,
|
||
|
|
"step": 82
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9030941856511391,
|
||
|
|
"grad_norm": 0.3314250240053766,
|
||
|
|
"learning_rate": 7.045783833476538e-05,
|
||
|
|
"loss": 0.4794,
|
||
|
|
"step": 83
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9139748384903095,
|
||
|
|
"grad_norm": 0.35538493046088,
|
||
|
|
"learning_rate": 7.012285864014445e-05,
|
||
|
|
"loss": 0.4758,
|
||
|
|
"step": 84
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9248554913294798,
|
||
|
|
"grad_norm": 0.40899275410289987,
|
||
|
|
"learning_rate": 6.978292606548029e-05,
|
||
|
|
"loss": 0.4716,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9357361441686501,
|
||
|
|
"grad_norm": 0.39602646601922714,
|
||
|
|
"learning_rate": 6.943809650338541e-05,
|
||
|
|
"loss": 0.4703,
|
||
|
|
"step": 86
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9466167970078204,
|
||
|
|
"grad_norm": 0.32981646879813115,
|
||
|
|
"learning_rate": 6.908842665164789e-05,
|
||
|
|
"loss": 0.4665,
|
||
|
|
"step": 87
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9574974498469908,
|
||
|
|
"grad_norm": 0.45700638514003344,
|
||
|
|
"learning_rate": 6.873397400390911e-05,
|
||
|
|
"loss": 0.4718,
|
||
|
|
"step": 88
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9683781026861612,
|
||
|
|
"grad_norm": 0.4952520819569817,
|
||
|
|
"learning_rate": 6.837479684021032e-05,
|
||
|
|
"loss": 0.4726,
|
||
|
|
"step": 89
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9792587555253315,
|
||
|
|
"grad_norm": 0.480434270208596,
|
||
|
|
"learning_rate": 6.80109542174102e-05,
|
||
|
|
"loss": 0.4699,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9901394083645019,
|
||
|
|
"grad_norm": 0.593562930442879,
|
||
|
|
"learning_rate": 6.76425059594746e-05,
|
||
|
|
"loss": 0.4788,
|
||
|
|
"step": 91
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.009520571234274,
|
||
|
|
"grad_norm": 0.6664418776633385,
|
||
|
|
"learning_rate": 6.726951264763998e-05,
|
||
|
|
"loss": 0.456,
|
||
|
|
"step": 92
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0204012240734444,
|
||
|
|
"grad_norm": 0.6935514470470976,
|
||
|
|
"learning_rate": 6.689203561045268e-05,
|
||
|
|
"loss": 0.4545,
|
||
|
|
"step": 93
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0312818769126149,
|
||
|
|
"grad_norm": 0.8262173326620568,
|
||
|
|
"learning_rate": 6.651013691368492e-05,
|
||
|
|
"loss": 0.4589,
|
||
|
|
"step": 94
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0421625297517851,
|
||
|
|
"grad_norm": 0.9951587995620871,
|
||
|
|
"learning_rate": 6.612387935012995e-05,
|
||
|
|
"loss": 0.4594,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0530431825909554,
|
||
|
|
"grad_norm": 0.9844429089044019,
|
||
|
|
"learning_rate": 6.573332642927737e-05,
|
||
|
|
"loss": 0.4514,
|
||
|
|
"step": 96
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.063923835430126,
|
||
|
|
"grad_norm": 0.6991926155223892,
|
||
|
|
"learning_rate": 6.53385423668708e-05,
|
||
|
|
"loss": 0.4416,
|
||
|
|
"step": 97
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0748044882692962,
|
||
|
|
"grad_norm": 0.585309533517111,
|
||
|
|
"learning_rate": 6.493959207434934e-05,
|
||
|
|
"loss": 0.4497,
|
||
|
|
"step": 98
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0856851411084665,
|
||
|
|
"grad_norm": 0.7423320704031524,
|
||
|
|
"learning_rate": 6.453654114817467e-05,
|
||
|
|
"loss": 0.457,
|
||
|
|
"step": 99
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0965657939476368,
|
||
|
|
"grad_norm": 0.6185657582865267,
|
||
|
|
"learning_rate": 6.412945585904545e-05,
|
||
|
|
"loss": 0.4481,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1074464467868073,
|
||
|
|
"grad_norm": 0.45496645094782234,
|
||
|
|
"learning_rate": 6.371840314100104e-05,
|
||
|
|
"loss": 0.4514,
|
||
|
|
"step": 101
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1183270996259775,
|
||
|
|
"grad_norm": 0.6039336404495378,
|
||
|
|
"learning_rate": 6.330345058041585e-05,
|
||
|
|
"loss": 0.4583,
|
||
|
|
"step": 102
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1292077524651478,
|
||
|
|
"grad_norm": 0.4358480651096581,
|
||
|
|
"learning_rate": 6.288466640488679e-05,
|
||
|
|
"loss": 0.4374,
|
||
|
|
"step": 103
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1400884053043183,
|
||
|
|
"grad_norm": 0.43724639626696055,
|
||
|
|
"learning_rate": 6.2462119472015e-05,
|
||
|
|
"loss": 0.4428,
|
||
|
|
"step": 104
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1509690581434886,
|
||
|
|
"grad_norm": 0.5729837222594828,
|
||
|
|
"learning_rate": 6.20358792580841e-05,
|
||
|
|
"loss": 0.4471,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1618497109826589,
|
||
|
|
"grad_norm": 0.39024335737726285,
|
||
|
|
"learning_rate": 6.160601584663681e-05,
|
||
|
|
"loss": 0.4453,
|
||
|
|
"step": 106
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1727303638218294,
|
||
|
|
"grad_norm": 0.3646061376356188,
|
||
|
|
"learning_rate": 6.11725999169515e-05,
|
||
|
|
"loss": 0.4491,
|
||
|
|
"step": 107
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1836110166609997,
|
||
|
|
"grad_norm": 0.46982607381941577,
|
||
|
|
"learning_rate": 6.0735702732421015e-05,
|
||
|
|
"loss": 0.4443,
|
||
|
|
"step": 108
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.19449166950017,
|
||
|
|
"grad_norm": 0.29189646752552,
|
||
|
|
"learning_rate": 6.029539612883529e-05,
|
||
|
|
"loss": 0.4402,
|
||
|
|
"step": 109
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2053723223393404,
|
||
|
|
"grad_norm": 0.3520773279039917,
|
||
|
|
"learning_rate": 5.9851752502570015e-05,
|
||
|
|
"loss": 0.435,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2162529751785107,
|
||
|
|
"grad_norm": 0.32075025375118393,
|
||
|
|
"learning_rate": 5.940484479868288e-05,
|
||
|
|
"loss": 0.4462,
|
||
|
|
"step": 111
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.227133628017681,
|
||
|
|
"grad_norm": 0.2882478429202779,
|
||
|
|
"learning_rate": 5.895474649891995e-05,
|
||
|
|
"loss": 0.4421,
|
||
|
|
"step": 112
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2380142808568515,
|
||
|
|
"grad_norm": 0.31925565019590013,
|
||
|
|
"learning_rate": 5.8501531609633424e-05,
|
||
|
|
"loss": 0.446,
|
||
|
|
"step": 113
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2488949336960218,
|
||
|
|
"grad_norm": 0.21223179457423266,
|
||
|
|
"learning_rate": 5.8045274649613386e-05,
|
||
|
|
"loss": 0.4448,
|
||
|
|
"step": 114
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.259775586535192,
|
||
|
|
"grad_norm": 0.26953627133631786,
|
||
|
|
"learning_rate": 5.7586050637835295e-05,
|
||
|
|
"loss": 0.4374,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2706562393743623,
|
||
|
|
"grad_norm": 0.21178369672133335,
|
||
|
|
"learning_rate": 5.7123935081125034e-05,
|
||
|
|
"loss": 0.4458,
|
||
|
|
"step": 116
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2815368922135328,
|
||
|
|
"grad_norm": 0.21155767077821688,
|
||
|
|
"learning_rate": 5.6659003961743965e-05,
|
||
|
|
"loss": 0.4376,
|
||
|
|
"step": 117
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2924175450527031,
|
||
|
|
"grad_norm": 0.21347423268768667,
|
||
|
|
"learning_rate": 5.619133372489575e-05,
|
||
|
|
"loss": 0.4515,
|
||
|
|
"step": 118
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3032981978918734,
|
||
|
|
"grad_norm": 0.21576745513388285,
|
||
|
|
"learning_rate": 5.572100126615695e-05,
|
||
|
|
"loss": 0.4443,
|
||
|
|
"step": 119
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.314178850731044,
|
||
|
|
"grad_norm": 0.24546247378355868,
|
||
|
|
"learning_rate": 5.524808391883367e-05,
|
||
|
|
"loss": 0.441,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3250595035702142,
|
||
|
|
"grad_norm": 0.2207361069513953,
|
||
|
|
"learning_rate": 5.477265944124626e-05,
|
||
|
|
"loss": 0.4354,
|
||
|
|
"step": 121
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3359401564093845,
|
||
|
|
"grad_norm": 0.2444333045464523,
|
||
|
|
"learning_rate": 5.429480600394405e-05,
|
||
|
|
"loss": 0.4407,
|
||
|
|
"step": 122
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.346820809248555,
|
||
|
|
"grad_norm": 0.24749025049340784,
|
||
|
|
"learning_rate": 5.381460217685231e-05,
|
||
|
|
"loss": 0.4359,
|
||
|
|
"step": 123
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3577014620877252,
|
||
|
|
"grad_norm": 0.22381053800753611,
|
||
|
|
"learning_rate": 5.333212691635368e-05,
|
||
|
|
"loss": 0.4347,
|
||
|
|
"step": 124
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3685821149268955,
|
||
|
|
"grad_norm": 0.24301738733290523,
|
||
|
|
"learning_rate": 5.2847459552305834e-05,
|
||
|
|
"loss": 0.4337,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.379462767766066,
|
||
|
|
"grad_norm": 0.24671775018284153,
|
||
|
|
"learning_rate": 5.23606797749979e-05,
|
||
|
|
"loss": 0.4384,
|
||
|
|
"step": 126
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3903434206052363,
|
||
|
|
"grad_norm": 0.23843656791767695,
|
||
|
|
"learning_rate": 5.1871867622047624e-05,
|
||
|
|
"loss": 0.4444,
|
||
|
|
"step": 127
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4012240734444066,
|
||
|
|
"grad_norm": 0.18481585189252123,
|
||
|
|
"learning_rate": 5.13811034652413e-05,
|
||
|
|
"loss": 0.4371,
|
||
|
|
"step": 128
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.412104726283577,
|
||
|
|
"grad_norm": 0.18518123274811235,
|
||
|
|
"learning_rate": 5.088846799731885e-05,
|
||
|
|
"loss": 0.4342,
|
||
|
|
"step": 129
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4229853791227474,
|
||
|
|
"grad_norm": 0.2010634600287658,
|
||
|
|
"learning_rate": 5.039404221870612e-05,
|
||
|
|
"loss": 0.4296,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4338660319619176,
|
||
|
|
"grad_norm": 0.23061272903823807,
|
||
|
|
"learning_rate": 4.989790742419658e-05,
|
||
|
|
"loss": 0.4415,
|
||
|
|
"step": 131
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4447466848010881,
|
||
|
|
"grad_norm": 0.2507778238204534,
|
||
|
|
"learning_rate": 4.940014518958461e-05,
|
||
|
|
"loss": 0.4338,
|
||
|
|
"step": 132
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4556273376402584,
|
||
|
|
"grad_norm": 0.27522736481625426,
|
||
|
|
"learning_rate": 4.890083735825258e-05,
|
||
|
|
"loss": 0.4397,
|
||
|
|
"step": 133
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4665079904794287,
|
||
|
|
"grad_norm": 0.2145456230157738,
|
||
|
|
"learning_rate": 4.8400066027713974e-05,
|
||
|
|
"loss": 0.4271,
|
||
|
|
"step": 134
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4773886433185992,
|
||
|
|
"grad_norm": 0.20238631194211945,
|
||
|
|
"learning_rate": 4.789791353611469e-05,
|
||
|
|
"loss": 0.4229,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4882692961577695,
|
||
|
|
"grad_norm": 0.17925376141197102,
|
||
|
|
"learning_rate": 4.7394462448694756e-05,
|
||
|
|
"loss": 0.4383,
|
||
|
|
"step": 136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4991499489969398,
|
||
|
|
"grad_norm": 0.16639345435419553,
|
||
|
|
"learning_rate": 4.688979554421276e-05,
|
||
|
|
"loss": 0.4307,
|
||
|
|
"step": 137
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5100306018361103,
|
||
|
|
"grad_norm": 0.16546944794615542,
|
||
|
|
"learning_rate": 4.6383995801335176e-05,
|
||
|
|
"loss": 0.4413,
|
||
|
|
"step": 138
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5209112546752805,
|
||
|
|
"grad_norm": 0.22823688961718305,
|
||
|
|
"learning_rate": 4.5877146384992725e-05,
|
||
|
|
"loss": 0.4343,
|
||
|
|
"step": 139
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5317919075144508,
|
||
|
|
"grad_norm": 0.2077914352935504,
|
||
|
|
"learning_rate": 4.5369330632706223e-05,
|
||
|
|
"loss": 0.4304,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5426725603536213,
|
||
|
|
"grad_norm": 0.178090044195881,
|
||
|
|
"learning_rate": 4.486063204088402e-05,
|
||
|
|
"loss": 0.433,
|
||
|
|
"step": 141
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5535532131927916,
|
||
|
|
"grad_norm": 0.17467896850970988,
|
||
|
|
"learning_rate": 4.435113425109324e-05,
|
||
|
|
"loss": 0.4298,
|
||
|
|
"step": 142
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5644338660319619,
|
||
|
|
"grad_norm": 0.17004530557039635,
|
||
|
|
"learning_rate": 4.3840921036307274e-05,
|
||
|
|
"loss": 0.4348,
|
||
|
|
"step": 143
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5753145188711324,
|
||
|
|
"grad_norm": 0.1956158514979538,
|
||
|
|
"learning_rate": 4.333007628713158e-05,
|
||
|
|
"loss": 0.4384,
|
||
|
|
"step": 144
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5861951717103027,
|
||
|
|
"grad_norm": 0.16196619070156484,
|
||
|
|
"learning_rate": 4.281868399801016e-05,
|
||
|
|
"loss": 0.4362,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.597075824549473,
|
||
|
|
"grad_norm": 0.15774918769209736,
|
||
|
|
"learning_rate": 4.230682825341498e-05,
|
||
|
|
"loss": 0.4321,
|
||
|
|
"step": 146
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6079564773886434,
|
||
|
|
"grad_norm": 0.14677757963300414,
|
||
|
|
"learning_rate": 4.17945932140206e-05,
|
||
|
|
"loss": 0.4343,
|
||
|
|
"step": 147
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6188371302278135,
|
||
|
|
"grad_norm": 0.1586788180866913,
|
||
|
|
"learning_rate": 4.128206310286622e-05,
|
||
|
|
"loss": 0.4319,
|
||
|
|
"step": 148
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.629717783066984,
|
||
|
|
"grad_norm": 0.15657539290671857,
|
||
|
|
"learning_rate": 4.0769322191507485e-05,
|
||
|
|
"loss": 0.4349,
|
||
|
|
"step": 149
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6405984359061545,
|
||
|
|
"grad_norm": 0.13771725448321517,
|
||
|
|
"learning_rate": 4.025645478616045e-05,
|
||
|
|
"loss": 0.4286,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6514790887453246,
|
||
|
|
"grad_norm": 0.1792608152635085,
|
||
|
|
"learning_rate": 3.974354521383956e-05,
|
||
|
|
"loss": 0.4326,
|
||
|
|
"step": 151
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.662359741584495,
|
||
|
|
"grad_norm": 0.19697920039522113,
|
||
|
|
"learning_rate": 3.923067780849252e-05,
|
||
|
|
"loss": 0.4325,
|
||
|
|
"step": 152
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6732403944236656,
|
||
|
|
"grad_norm": 0.15631987727039992,
|
||
|
|
"learning_rate": 3.87179368971338e-05,
|
||
|
|
"loss": 0.4269,
|
||
|
|
"step": 153
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6841210472628356,
|
||
|
|
"grad_norm": 0.1789741559362441,
|
||
|
|
"learning_rate": 3.820540678597942e-05,
|
||
|
|
"loss": 0.4352,
|
||
|
|
"step": 154
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6950017001020061,
|
||
|
|
"grad_norm": 0.14589986696711105,
|
||
|
|
"learning_rate": 3.769317174658503e-05,
|
||
|
|
"loss": 0.4331,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7058823529411766,
|
||
|
|
"grad_norm": 0.16739347857930229,
|
||
|
|
"learning_rate": 3.718131600198984e-05,
|
||
|
|
"loss": 0.4385,
|
||
|
|
"step": 156
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7167630057803467,
|
||
|
|
"grad_norm": 0.15981306017402697,
|
||
|
|
"learning_rate": 3.666992371286843e-05,
|
||
|
|
"loss": 0.4313,
|
||
|
|
"step": 157
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7276436586195172,
|
||
|
|
"grad_norm": 0.15235627870495205,
|
||
|
|
"learning_rate": 3.615907896369273e-05,
|
||
|
|
"loss": 0.442,
|
||
|
|
"step": 158
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7385243114586875,
|
||
|
|
"grad_norm": 0.15094599321638194,
|
||
|
|
"learning_rate": 3.564886574890677e-05,
|
||
|
|
"loss": 0.4376,
|
||
|
|
"step": 159
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7494049642978577,
|
||
|
|
"grad_norm": 0.15338762976356135,
|
||
|
|
"learning_rate": 3.5139367959115986e-05,
|
||
|
|
"loss": 0.438,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7602856171370282,
|
||
|
|
"grad_norm": 0.13609508940355913,
|
||
|
|
"learning_rate": 3.4630669367293797e-05,
|
||
|
|
"loss": 0.4301,
|
||
|
|
"step": 161
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7711662699761985,
|
||
|
|
"grad_norm": 0.14408065680405582,
|
||
|
|
"learning_rate": 3.412285361500729e-05,
|
||
|
|
"loss": 0.4365,
|
||
|
|
"step": 162
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7820469228153688,
|
||
|
|
"grad_norm": 0.1523591488642166,
|
||
|
|
"learning_rate": 3.3616004198664845e-05,
|
||
|
|
"loss": 0.4261,
|
||
|
|
"step": 163
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7929275756545393,
|
||
|
|
"grad_norm": 0.16103662054938223,
|
||
|
|
"learning_rate": 3.311020445578725e-05,
|
||
|
|
"loss": 0.4358,
|
||
|
|
"step": 164
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8038082284937096,
|
||
|
|
"grad_norm": 0.1713601588521581,
|
||
|
|
"learning_rate": 3.260553755130525e-05,
|
||
|
|
"loss": 0.4317,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8146888813328799,
|
||
|
|
"grad_norm": 0.15595643791635083,
|
||
|
|
"learning_rate": 3.210208646388532e-05,
|
||
|
|
"loss": 0.4201,
|
||
|
|
"step": 166
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8255695341720504,
|
||
|
|
"grad_norm": 0.14708760810268764,
|
||
|
|
"learning_rate": 3.1599933972286026e-05,
|
||
|
|
"loss": 0.4296,
|
||
|
|
"step": 167
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8364501870112206,
|
||
|
|
"grad_norm": 0.13811567230657157,
|
||
|
|
"learning_rate": 3.109916264174743e-05,
|
||
|
|
"loss": 0.4252,
|
||
|
|
"step": 168
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.847330839850391,
|
||
|
|
"grad_norm": 0.14763414231896016,
|
||
|
|
"learning_rate": 3.0599854810415393e-05,
|
||
|
|
"loss": 0.4201,
|
||
|
|
"step": 169
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8582114926895614,
|
||
|
|
"grad_norm": 0.129668633060907,
|
||
|
|
"learning_rate": 3.0102092575803435e-05,
|
||
|
|
"loss": 0.4343,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8690921455287317,
|
||
|
|
"grad_norm": 0.13079136624725912,
|
||
|
|
"learning_rate": 2.9605957781293893e-05,
|
||
|
|
"loss": 0.4388,
|
||
|
|
"step": 171
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.879972798367902,
|
||
|
|
"grad_norm": 0.14118617253700078,
|
||
|
|
"learning_rate": 2.911153200268116e-05,
|
||
|
|
"loss": 0.4361,
|
||
|
|
"step": 172
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8908534512070725,
|
||
|
|
"grad_norm": 0.13385452899697334,
|
||
|
|
"learning_rate": 2.8618896534758707e-05,
|
||
|
|
"loss": 0.4303,
|
||
|
|
"step": 173
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9017341040462428,
|
||
|
|
"grad_norm": 0.14708401774021865,
|
||
|
|
"learning_rate": 2.8128132377952376e-05,
|
||
|
|
"loss": 0.4332,
|
||
|
|
"step": 174
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.912614756885413,
|
||
|
|
"grad_norm": 0.12272534113340453,
|
||
|
|
"learning_rate": 2.7639320225002108e-05,
|
||
|
|
"loss": 0.4228,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9234954097245835,
|
||
|
|
"grad_norm": 0.1138558266069995,
|
||
|
|
"learning_rate": 2.715254044769418e-05,
|
||
|
|
"loss": 0.4282,
|
||
|
|
"step": 176
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9343760625637538,
|
||
|
|
"grad_norm": 0.12343156100233148,
|
||
|
|
"learning_rate": 2.666787308364634e-05,
|
||
|
|
"loss": 0.4284,
|
||
|
|
"step": 177
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.945256715402924,
|
||
|
|
"grad_norm": 0.11224315249119837,
|
||
|
|
"learning_rate": 2.6185397823147703e-05,
|
||
|
|
"loss": 0.4265,
|
||
|
|
"step": 178
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9561373682420946,
|
||
|
|
"grad_norm": 0.10896139436338177,
|
||
|
|
"learning_rate": 2.5705193996055977e-05,
|
||
|
|
"loss": 0.4255,
|
||
|
|
"step": 179
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.967018021081265,
|
||
|
|
"grad_norm": 0.12908408500196814,
|
||
|
|
"learning_rate": 2.5227340558753755e-05,
|
||
|
|
"loss": 0.4267,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9778986739204352,
|
||
|
|
"grad_norm": 0.11019554007619542,
|
||
|
|
"learning_rate": 2.4751916081166336e-05,
|
||
|
|
"loss": 0.4322,
|
||
|
|
"step": 181
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9887793267596057,
|
||
|
|
"grad_norm": 0.1233996372590572,
|
||
|
|
"learning_rate": 2.427899873384306e-05,
|
||
|
|
"loss": 0.4237,
|
||
|
|
"step": 182
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0081604896293777,
|
||
|
|
"grad_norm": 0.15359648389426395,
|
||
|
|
"learning_rate": 2.3808666275104248e-05,
|
||
|
|
"loss": 0.4132,
|
||
|
|
"step": 183
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.019041142468548,
|
||
|
|
"grad_norm": 0.13345360808541243,
|
||
|
|
"learning_rate": 2.334099603825605e-05,
|
||
|
|
"loss": 0.4088,
|
||
|
|
"step": 184
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0299217953077187,
|
||
|
|
"grad_norm": 0.17411332047982292,
|
||
|
|
"learning_rate": 2.2876064918874993e-05,
|
||
|
|
"loss": 0.4049,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0408024481468887,
|
||
|
|
"grad_norm": 0.1422404399378093,
|
||
|
|
"learning_rate": 2.241394936216472e-05,
|
||
|
|
"loss": 0.3949,
|
||
|
|
"step": 186
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.051683100986059,
|
||
|
|
"grad_norm": 0.14694793749020102,
|
||
|
|
"learning_rate": 2.1954725350386614e-05,
|
||
|
|
"loss": 0.4004,
|
||
|
|
"step": 187
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0625637538252297,
|
||
|
|
"grad_norm": 0.1408354112834221,
|
||
|
|
"learning_rate": 2.14984683903666e-05,
|
||
|
|
"loss": 0.4047,
|
||
|
|
"step": 188
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0734444066643998,
|
||
|
|
"grad_norm": 0.13765680224710142,
|
||
|
|
"learning_rate": 2.1045253501080058e-05,
|
||
|
|
"loss": 0.4029,
|
||
|
|
"step": 189
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0843250595035703,
|
||
|
|
"grad_norm": 0.13251592143019264,
|
||
|
|
"learning_rate": 2.0595155201317115e-05,
|
||
|
|
"loss": 0.4033,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.095205712342741,
|
||
|
|
"grad_norm": 0.13758159475310247,
|
||
|
|
"learning_rate": 2.0148247497430012e-05,
|
||
|
|
"loss": 0.4035,
|
||
|
|
"step": 191
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.106086365181911,
|
||
|
|
"grad_norm": 0.12620065204508432,
|
||
|
|
"learning_rate": 1.970460387116472e-05,
|
||
|
|
"loss": 0.4003,
|
||
|
|
"step": 192
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1169670180210813,
|
||
|
|
"grad_norm": 0.13100037610614787,
|
||
|
|
"learning_rate": 1.9264297267579e-05,
|
||
|
|
"loss": 0.4,
|
||
|
|
"step": 193
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.127847670860252,
|
||
|
|
"grad_norm": 0.1342586056051866,
|
||
|
|
"learning_rate": 1.8827400083048503e-05,
|
||
|
|
"loss": 0.4032,
|
||
|
|
"step": 194
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.138728323699422,
|
||
|
|
"grad_norm": 0.1314403186569449,
|
||
|
|
"learning_rate": 1.8393984153363203e-05,
|
||
|
|
"loss": 0.4084,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1496089765385924,
|
||
|
|
"grad_norm": 0.1275916516562326,
|
||
|
|
"learning_rate": 1.7964120741915905e-05,
|
||
|
|
"loss": 0.4021,
|
||
|
|
"step": 196
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1604896293777625,
|
||
|
|
"grad_norm": 0.12979778271394132,
|
||
|
|
"learning_rate": 1.753788052798501e-05,
|
||
|
|
"loss": 0.4063,
|
||
|
|
"step": 197
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.171370282216933,
|
||
|
|
"grad_norm": 0.13548411500192925,
|
||
|
|
"learning_rate": 1.7115333595113225e-05,
|
||
|
|
"loss": 0.4093,
|
||
|
|
"step": 198
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1822509350561035,
|
||
|
|
"grad_norm": 0.11424114716502747,
|
||
|
|
"learning_rate": 1.669654941958416e-05,
|
||
|
|
"loss": 0.3998,
|
||
|
|
"step": 199
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1931315878952735,
|
||
|
|
"grad_norm": 0.14575557443845474,
|
||
|
|
"learning_rate": 1.628159685899897e-05,
|
||
|
|
"loss": 0.404,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.204012240734444,
|
||
|
|
"grad_norm": 0.12565905473323868,
|
||
|
|
"learning_rate": 1.5870544140954543e-05,
|
||
|
|
"loss": 0.4029,
|
||
|
|
"step": 201
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2148928935736145,
|
||
|
|
"grad_norm": 0.1322140532060304,
|
||
|
|
"learning_rate": 1.5463458851825345e-05,
|
||
|
|
"loss": 0.4034,
|
||
|
|
"step": 202
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2257735464127846,
|
||
|
|
"grad_norm": 0.11539342585698967,
|
||
|
|
"learning_rate": 1.5060407925650662e-05,
|
||
|
|
"loss": 0.4047,
|
||
|
|
"step": 203
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.236654199251955,
|
||
|
|
"grad_norm": 0.110043261615137,
|
||
|
|
"learning_rate": 1.466145763312922e-05,
|
||
|
|
"loss": 0.4028,
|
||
|
|
"step": 204
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2475348520911256,
|
||
|
|
"grad_norm": 0.12462346384772446,
|
||
|
|
"learning_rate": 1.426667357072265e-05,
|
||
|
|
"loss": 0.4057,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2584155049302956,
|
||
|
|
"grad_norm": 0.11246035528357057,
|
||
|
|
"learning_rate": 1.3876120649870051e-05,
|
||
|
|
"loss": 0.401,
|
||
|
|
"step": 206
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.269296157769466,
|
||
|
|
"grad_norm": 0.11025464415618055,
|
||
|
|
"learning_rate": 1.3489863086315085e-05,
|
||
|
|
"loss": 0.406,
|
||
|
|
"step": 207
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2801768106086366,
|
||
|
|
"grad_norm": 0.12202834262583902,
|
||
|
|
"learning_rate": 1.3107964389547326e-05,
|
||
|
|
"loss": 0.4105,
|
||
|
|
"step": 208
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2910574634478067,
|
||
|
|
"grad_norm": 0.10891021403569234,
|
||
|
|
"learning_rate": 1.2730487352360026e-05,
|
||
|
|
"loss": 0.3986,
|
||
|
|
"step": 209
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.301938116286977,
|
||
|
|
"grad_norm": 0.11067574316750495,
|
||
|
|
"learning_rate": 1.2357494040525416e-05,
|
||
|
|
"loss": 0.4026,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3128187691261477,
|
||
|
|
"grad_norm": 0.12013626538786122,
|
||
|
|
"learning_rate": 1.1989045782589815e-05,
|
||
|
|
"loss": 0.4019,
|
||
|
|
"step": 211
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3236994219653178,
|
||
|
|
"grad_norm": 0.11734274809139549,
|
||
|
|
"learning_rate": 1.1625203159789686e-05,
|
||
|
|
"loss": 0.404,
|
||
|
|
"step": 212
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3345800748044883,
|
||
|
|
"grad_norm": 0.13868399399365283,
|
||
|
|
"learning_rate": 1.1266025996090902e-05,
|
||
|
|
"loss": 0.3968,
|
||
|
|
"step": 213
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3454607276436588,
|
||
|
|
"grad_norm": 0.1021758536629292,
|
||
|
|
"learning_rate": 1.0911573348352107e-05,
|
||
|
|
"loss": 0.3997,
|
||
|
|
"step": 214
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.356341380482829,
|
||
|
|
"grad_norm": 0.11161156337434158,
|
||
|
|
"learning_rate": 1.0561903496614603e-05,
|
||
|
|
"loss": 0.4019,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3672220333219993,
|
||
|
|
"grad_norm": 0.11628871391170659,
|
||
|
|
"learning_rate": 1.0217073934519726e-05,
|
||
|
|
"loss": 0.3961,
|
||
|
|
"step": 216
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.37810268616117,
|
||
|
|
"grad_norm": 0.10053214623418112,
|
||
|
|
"learning_rate": 9.877141359855567e-06,
|
||
|
|
"loss": 0.4107,
|
||
|
|
"step": 217
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.38898333900034,
|
||
|
|
"grad_norm": 0.10333880551211917,
|
||
|
|
"learning_rate": 9.542161665234623e-06,
|
||
|
|
"loss": 0.4041,
|
||
|
|
"step": 218
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3998639918395104,
|
||
|
|
"grad_norm": 0.10378698888338332,
|
||
|
|
"learning_rate": 9.212189928903758e-06,
|
||
|
|
"loss": 0.4057,
|
||
|
|
"step": 219
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.410744644678681,
|
||
|
|
"grad_norm": 0.10095387706934292,
|
||
|
|
"learning_rate": 8.887280405688106e-06,
|
||
|
|
"loss": 0.4035,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.421625297517851,
|
||
|
|
"grad_norm": 0.09702668485060174,
|
||
|
|
"learning_rate": 8.567486518070306e-06,
|
||
|
|
"loss": 0.3979,
|
||
|
|
"step": 221
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4325059503570214,
|
||
|
|
"grad_norm": 0.0968990526407044,
|
||
|
|
"learning_rate": 8.252860847406712e-06,
|
||
|
|
"loss": 0.4007,
|
||
|
|
"step": 222
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.443386603196192,
|
||
|
|
"grad_norm": 0.09758367833082218,
|
||
|
|
"learning_rate": 7.943455125281741e-06,
|
||
|
|
"loss": 0.4038,
|
||
|
|
"step": 223
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.454267256035362,
|
||
|
|
"grad_norm": 0.09335426131514203,
|
||
|
|
"learning_rate": 7.639320225002106e-06,
|
||
|
|
"loss": 0.3987,
|
||
|
|
"step": 224
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4651479088745325,
|
||
|
|
"grad_norm": 0.09620155060950364,
|
||
|
|
"learning_rate": 7.340506153232052e-06,
|
||
|
|
"loss": 0.4051,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.476028561713703,
|
||
|
|
"grad_norm": 0.09238550728932193,
|
||
|
|
"learning_rate": 7.047062041771133e-06,
|
||
|
|
"loss": 0.4038,
|
||
|
|
"step": 226
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.486909214552873,
|
||
|
|
"grad_norm": 0.08653478796623289,
|
||
|
|
"learning_rate": 6.759036139475843e-06,
|
||
|
|
"loss": 0.4035,
|
||
|
|
"step": 227
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4977898673920436,
|
||
|
|
"grad_norm": 0.09462132238499406,
|
||
|
|
"learning_rate": 6.476475804326377e-06,
|
||
|
|
"loss": 0.3945,
|
||
|
|
"step": 228
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.508670520231214,
|
||
|
|
"grad_norm": 0.09364175398439001,
|
||
|
|
"learning_rate": 6.199427495639963e-06,
|
||
|
|
"loss": 0.3953,
|
||
|
|
"step": 229
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.519551173070384,
|
||
|
|
"grad_norm": 0.09533512853144702,
|
||
|
|
"learning_rate": 5.927936766431836e-06,
|
||
|
|
"loss": 0.3951,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5304318259095546,
|
||
|
|
"grad_norm": 0.08988259771164411,
|
||
|
|
"learning_rate": 5.662048255925357e-06,
|
||
|
|
"loss": 0.4007,
|
||
|
|
"step": 231
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5413124787487247,
|
||
|
|
"grad_norm": 0.08544449128239766,
|
||
|
|
"learning_rate": 5.40180568221226e-06,
|
||
|
|
"loss": 0.3956,
|
||
|
|
"step": 232
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.552193131587895,
|
||
|
|
"grad_norm": 0.08214234004836413,
|
||
|
|
"learning_rate": 5.147251835064424e-06,
|
||
|
|
"loss": 0.401,
|
||
|
|
"step": 233
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5630737844270657,
|
||
|
|
"grad_norm": 0.08978715763135,
|
||
|
|
"learning_rate": 4.898428568898288e-06,
|
||
|
|
"loss": 0.4018,
|
||
|
|
"step": 234
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.573954437266236,
|
||
|
|
"grad_norm": 0.09111850134473688,
|
||
|
|
"learning_rate": 4.65537679589299e-06,
|
||
|
|
"loss": 0.4033,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5848350901054062,
|
||
|
|
"grad_norm": 0.08281874038672186,
|
||
|
|
"learning_rate": 4.418136479263533e-06,
|
||
|
|
"loss": 0.3957,
|
||
|
|
"step": 236
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5957157429445767,
|
||
|
|
"grad_norm": 0.0875486805676823,
|
||
|
|
"learning_rate": 4.186746626689879e-06,
|
||
|
|
"loss": 0.3978,
|
||
|
|
"step": 237
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.606596395783747,
|
||
|
|
"grad_norm": 0.08327809325190605,
|
||
|
|
"learning_rate": 3.961245283903239e-06,
|
||
|
|
"loss": 0.4004,
|
||
|
|
"step": 238
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6174770486229173,
|
||
|
|
"grad_norm": 0.08331308067123125,
|
||
|
|
"learning_rate": 3.7416695284304737e-06,
|
||
|
|
"loss": 0.4062,
|
||
|
|
"step": 239
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.628357701462088,
|
||
|
|
"grad_norm": 0.08388783642844128,
|
||
|
|
"learning_rate": 3.5280554634977217e-06,
|
||
|
|
"loss": 0.3974,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6392383543012583,
|
||
|
|
"grad_norm": 0.08349690695940903,
|
||
|
|
"learning_rate": 3.320438212094197e-06,
|
||
|
|
"loss": 0.4021,
|
||
|
|
"step": 241
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6501190071404284,
|
||
|
|
"grad_norm": 0.08518098898214005,
|
||
|
|
"learning_rate": 3.1188519111971804e-06,
|
||
|
|
"loss": 0.4046,
|
||
|
|
"step": 242
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.660999659979599,
|
||
|
|
"grad_norm": 0.08404039117187238,
|
||
|
|
"learning_rate": 2.9233297061591346e-06,
|
||
|
|
"loss": 0.3986,
|
||
|
|
"step": 243
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.671880312818769,
|
||
|
|
"grad_norm": 0.08468392011098298,
|
||
|
|
"learning_rate": 2.733903745257838e-06,
|
||
|
|
"loss": 0.4143,
|
||
|
|
"step": 244
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6827609656579394,
|
||
|
|
"grad_norm": 0.08035934775357019,
|
||
|
|
"learning_rate": 2.550605174410512e-06,
|
||
|
|
"loss": 0.3982,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.69364161849711,
|
||
|
|
"grad_norm": 0.08100249829782087,
|
||
|
|
"learning_rate": 2.373464132052701e-06,
|
||
|
|
"loss": 0.3975,
|
||
|
|
"step": 246
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7045222713362804,
|
||
|
|
"grad_norm": 0.08405527434090163,
|
||
|
|
"learning_rate": 2.202509744182835e-06,
|
||
|
|
"loss": 0.3958,
|
||
|
|
"step": 247
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7154029241754505,
|
||
|
|
"grad_norm": 0.08458090883428918,
|
||
|
|
"learning_rate": 2.0377701195732545e-06,
|
||
|
|
"loss": 0.4094,
|
||
|
|
"step": 248
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.726283577014621,
|
||
|
|
"grad_norm": 0.08120194628003821,
|
||
|
|
"learning_rate": 1.879272345148513e-06,
|
||
|
|
"loss": 0.4071,
|
||
|
|
"step": 249
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.737164229853791,
|
||
|
|
"grad_norm": 0.08381423080148898,
|
||
|
|
"learning_rate": 1.727042481531651e-06,
|
||
|
|
"loss": 0.3997,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7480448826929615,
|
||
|
|
"grad_norm": 0.08003721871020845,
|
||
|
|
"learning_rate": 1.5811055587592283e-06,
|
||
|
|
"loss": 0.4032,
|
||
|
|
"step": 251
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.758925535532132,
|
||
|
|
"grad_norm": 0.07906608009605874,
|
||
|
|
"learning_rate": 1.4414855721658705e-06,
|
||
|
|
"loss": 0.4011,
|
||
|
|
"step": 252
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7698061883713025,
|
||
|
|
"grad_norm": 0.07989568893361568,
|
||
|
|
"learning_rate": 1.3082054784388221e-06,
|
||
|
|
"loss": 0.3938,
|
||
|
|
"step": 253
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7806868412104726,
|
||
|
|
"grad_norm": 0.07531713942027396,
|
||
|
|
"learning_rate": 1.1812871918434143e-06,
|
||
|
|
"loss": 0.4036,
|
||
|
|
"step": 254
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.791567494049643,
|
||
|
|
"grad_norm": 0.07349157785840128,
|
||
|
|
"learning_rate": 1.0607515806198142e-06,
|
||
|
|
"loss": 0.3975,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.802448146888813,
|
||
|
|
"grad_norm": 0.07248419100011405,
|
||
|
|
"learning_rate": 9.466184635518361e-07,
|
||
|
|
"loss": 0.397,
|
||
|
|
"step": 256
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8133287997279837,
|
||
|
|
"grad_norm": 0.07667024498059592,
|
||
|
|
"learning_rate": 8.389066067082852e-07,
|
||
|
|
"loss": 0.4011,
|
||
|
|
"step": 257
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.824209452567154,
|
||
|
|
"grad_norm": 0.07642598936379766,
|
||
|
|
"learning_rate": 7.376337203573824e-07,
|
||
|
|
"loss": 0.4,
|
||
|
|
"step": 258
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8350901054063242,
|
||
|
|
"grad_norm": 0.07461923023970361,
|
||
|
|
"learning_rate": 6.428164560548134e-07,
|
||
|
|
"loss": 0.4021,
|
||
|
|
"step": 259
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8459707582454947,
|
||
|
|
"grad_norm": 0.08011069712526726,
|
||
|
|
"learning_rate": 5.544704039058025e-07,
|
||
|
|
"loss": 0.4001,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8568514110846652,
|
||
|
|
"grad_norm": 0.0778371155244566,
|
||
|
|
"learning_rate": 4.7261009000177274e-07,
|
||
|
|
"loss": 0.4025,
|
||
|
|
"step": 261
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8677320639238353,
|
||
|
|
"grad_norm": 0.0756170688560216,
|
||
|
|
"learning_rate": 3.972489740319274e-07,
|
||
|
|
"loss": 0.4091,
|
||
|
|
"step": 262
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.878612716763006,
|
||
|
|
"grad_norm": 0.0747265642036147,
|
||
|
|
"learning_rate": 3.283994470701579e-07,
|
||
|
|
"loss": 0.4055,
|
||
|
|
"step": 263
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8894933696021763,
|
||
|
|
"grad_norm": 0.07119073646902688,
|
||
|
|
"learning_rate": 2.66072829537678e-07,
|
||
|
|
"loss": 0.408,
|
||
|
|
"step": 264
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9003740224413463,
|
||
|
|
"grad_norm": 0.07649012974948233,
|
||
|
|
"learning_rate": 2.102793693417038e-07,
|
||
|
|
"loss": 0.402,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.911254675280517,
|
||
|
|
"grad_norm": 0.07588643659629785,
|
||
|
|
"learning_rate": 1.6102824019043728e-07,
|
||
|
|
"loss": 0.3985,
|
||
|
|
"step": 266
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9221353281196873,
|
||
|
|
"grad_norm": 0.07431991189730484,
|
||
|
|
"learning_rate": 1.1832754008472614e-07,
|
||
|
|
"loss": 0.4025,
|
||
|
|
"step": 267
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9330159809588574,
|
||
|
|
"grad_norm": 0.0777086423840054,
|
||
|
|
"learning_rate": 8.21842899865466e-08,
|
||
|
|
"loss": 0.3962,
|
||
|
|
"step": 268
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.943896633798028,
|
||
|
|
"grad_norm": 0.07387694283814854,
|
||
|
|
"learning_rate": 5.260443266462467e-08,
|
||
|
|
"loss": 0.3982,
|
||
|
|
"step": 269
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9547772866371984,
|
||
|
|
"grad_norm": 0.07271598906861958,
|
||
|
|
"learning_rate": 2.9592831717293326e-08,
|
||
|
|
"loss": 0.3988,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9656579394763685,
|
||
|
|
"grad_norm": 0.07496499490626318,
|
||
|
|
"learning_rate": 1.3153270772807702e-08,
|
||
|
|
"loss": 0.4009,
|
||
|
|
"step": 271
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.976538592315539,
|
||
|
|
"grad_norm": 0.07330139638176505,
|
||
|
|
"learning_rate": 3.2884528672294523e-09,
|
||
|
|
"loss": 0.4001,
|
||
|
|
"step": 272
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.987419245154709,
|
||
|
|
"grad_norm": 0.07348531822682038,
|
||
|
|
"learning_rate": 0.0,
|
||
|
|
"loss": 0.4048,
|
||
|
|
"step": 273
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.987419245154709,
|
||
|
|
"step": 273,
|
||
|
|
"total_flos": 7.259120111838036e+18,
|
||
|
|
"train_loss": 0.5428441533675561,
|
||
|
|
"train_runtime": 82839.4057,
|
||
|
|
"train_samples_per_second": 1.704,
|
||
|
|
"train_steps_per_second": 0.003
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 1,
|
||
|
|
"max_steps": 273,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 3,
|
||
|
|
"save_steps": 10,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 7.259120111838036e+18,
|
||
|
|
"train_batch_size": 1,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|