7253 lines
175 KiB
JSON
7253 lines
175 KiB
JSON
|
|
{
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 4.999242079733212,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 1030,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.004850689707442777,
|
||
|
|
"grad_norm": 6.339065858846749,
|
||
|
|
"learning_rate": 7.766990291262136e-07,
|
||
|
|
"loss": 1.0147,
|
||
|
|
"step": 1
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.009701379414885554,
|
||
|
|
"grad_norm": 6.370901874406726,
|
||
|
|
"learning_rate": 1.5533980582524272e-06,
|
||
|
|
"loss": 1.0217,
|
||
|
|
"step": 2
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.01455206912232833,
|
||
|
|
"grad_norm": 6.253076167725343,
|
||
|
|
"learning_rate": 2.330097087378641e-06,
|
||
|
|
"loss": 1.0059,
|
||
|
|
"step": 3
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.01940275882977111,
|
||
|
|
"grad_norm": 5.8627331903052005,
|
||
|
|
"learning_rate": 3.1067961165048544e-06,
|
||
|
|
"loss": 0.9987,
|
||
|
|
"step": 4
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.024253448537213885,
|
||
|
|
"grad_norm": 4.624724616975588,
|
||
|
|
"learning_rate": 3.883495145631068e-06,
|
||
|
|
"loss": 0.9654,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02910413824465666,
|
||
|
|
"grad_norm": 2.701973440331149,
|
||
|
|
"learning_rate": 4.660194174757282e-06,
|
||
|
|
"loss": 0.9221,
|
||
|
|
"step": 6
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03395482795209944,
|
||
|
|
"grad_norm": 2.5821400220833683,
|
||
|
|
"learning_rate": 5.436893203883496e-06,
|
||
|
|
"loss": 0.9118,
|
||
|
|
"step": 7
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03880551765954222,
|
||
|
|
"grad_norm": 3.1473551148693146,
|
||
|
|
"learning_rate": 6.213592233009709e-06,
|
||
|
|
"loss": 0.8818,
|
||
|
|
"step": 8
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04365620736698499,
|
||
|
|
"grad_norm": 3.8230150062051638,
|
||
|
|
"learning_rate": 6.990291262135923e-06,
|
||
|
|
"loss": 0.8912,
|
||
|
|
"step": 9
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04850689707442777,
|
||
|
|
"grad_norm": 3.4669236063777715,
|
||
|
|
"learning_rate": 7.766990291262136e-06,
|
||
|
|
"loss": 0.8779,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.053357586781870546,
|
||
|
|
"grad_norm": 2.712055876575345,
|
||
|
|
"learning_rate": 8.54368932038835e-06,
|
||
|
|
"loss": 0.8328,
|
||
|
|
"step": 11
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05820827648931332,
|
||
|
|
"grad_norm": 2.574263966000136,
|
||
|
|
"learning_rate": 9.320388349514565e-06,
|
||
|
|
"loss": 0.8149,
|
||
|
|
"step": 12
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0630589661967561,
|
||
|
|
"grad_norm": 1.8016385817876701,
|
||
|
|
"learning_rate": 1.0097087378640778e-05,
|
||
|
|
"loss": 0.7941,
|
||
|
|
"step": 13
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06790965590419888,
|
||
|
|
"grad_norm": 1.16110696712433,
|
||
|
|
"learning_rate": 1.0873786407766991e-05,
|
||
|
|
"loss": 0.7751,
|
||
|
|
"step": 14
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07276034561164166,
|
||
|
|
"grad_norm": 1.5441894400920566,
|
||
|
|
"learning_rate": 1.1650485436893204e-05,
|
||
|
|
"loss": 0.7603,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07761103531908443,
|
||
|
|
"grad_norm": 1.3570174190036193,
|
||
|
|
"learning_rate": 1.2427184466019418e-05,
|
||
|
|
"loss": 0.7456,
|
||
|
|
"step": 16
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08246172502652721,
|
||
|
|
"grad_norm": 1.0160879152766609,
|
||
|
|
"learning_rate": 1.3203883495145633e-05,
|
||
|
|
"loss": 0.74,
|
||
|
|
"step": 17
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08731241473396999,
|
||
|
|
"grad_norm": 1.195923679791525,
|
||
|
|
"learning_rate": 1.3980582524271846e-05,
|
||
|
|
"loss": 0.7223,
|
||
|
|
"step": 18
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09216310444141276,
|
||
|
|
"grad_norm": 1.0381307779091873,
|
||
|
|
"learning_rate": 1.475728155339806e-05,
|
||
|
|
"loss": 0.7149,
|
||
|
|
"step": 19
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09701379414885554,
|
||
|
|
"grad_norm": 0.9191697728302082,
|
||
|
|
"learning_rate": 1.5533980582524273e-05,
|
||
|
|
"loss": 0.7032,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10186448385629832,
|
||
|
|
"grad_norm": 1.0389109685950821,
|
||
|
|
"learning_rate": 1.6310679611650486e-05,
|
||
|
|
"loss": 0.697,
|
||
|
|
"step": 21
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10671517356374109,
|
||
|
|
"grad_norm": 0.8528569833940303,
|
||
|
|
"learning_rate": 1.70873786407767e-05,
|
||
|
|
"loss": 0.6913,
|
||
|
|
"step": 22
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11156586327118387,
|
||
|
|
"grad_norm": 0.7397574673832126,
|
||
|
|
"learning_rate": 1.7864077669902916e-05,
|
||
|
|
"loss": 0.6844,
|
||
|
|
"step": 23
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11641655297862664,
|
||
|
|
"grad_norm": 0.6762376097915315,
|
||
|
|
"learning_rate": 1.864077669902913e-05,
|
||
|
|
"loss": 0.6807,
|
||
|
|
"step": 24
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12126724268606942,
|
||
|
|
"grad_norm": 0.6801312007046909,
|
||
|
|
"learning_rate": 1.9417475728155343e-05,
|
||
|
|
"loss": 0.6651,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1261179323935122,
|
||
|
|
"grad_norm": 0.5373129321939298,
|
||
|
|
"learning_rate": 2.0194174757281556e-05,
|
||
|
|
"loss": 0.6661,
|
||
|
|
"step": 26
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13096862210095497,
|
||
|
|
"grad_norm": 0.6155691125010336,
|
||
|
|
"learning_rate": 2.097087378640777e-05,
|
||
|
|
"loss": 0.6655,
|
||
|
|
"step": 27
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13581931180839776,
|
||
|
|
"grad_norm": 0.5373412410981904,
|
||
|
|
"learning_rate": 2.1747572815533982e-05,
|
||
|
|
"loss": 0.6541,
|
||
|
|
"step": 28
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14067000151584053,
|
||
|
|
"grad_norm": 0.5722094683121568,
|
||
|
|
"learning_rate": 2.2524271844660196e-05,
|
||
|
|
"loss": 0.6534,
|
||
|
|
"step": 29
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14552069122328332,
|
||
|
|
"grad_norm": 0.6369873796903149,
|
||
|
|
"learning_rate": 2.330097087378641e-05,
|
||
|
|
"loss": 0.6536,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15037138093072608,
|
||
|
|
"grad_norm": 0.5246684440675834,
|
||
|
|
"learning_rate": 2.4077669902912622e-05,
|
||
|
|
"loss": 0.6545,
|
||
|
|
"step": 31
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15522207063816887,
|
||
|
|
"grad_norm": 0.504247506683658,
|
||
|
|
"learning_rate": 2.4854368932038836e-05,
|
||
|
|
"loss": 0.6314,
|
||
|
|
"step": 32
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16007276034561163,
|
||
|
|
"grad_norm": 0.5566944063536889,
|
||
|
|
"learning_rate": 2.5631067961165052e-05,
|
||
|
|
"loss": 0.6373,
|
||
|
|
"step": 33
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16492345005305442,
|
||
|
|
"grad_norm": 0.8570989167580252,
|
||
|
|
"learning_rate": 2.6407766990291266e-05,
|
||
|
|
"loss": 0.63,
|
||
|
|
"step": 34
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16977413976049718,
|
||
|
|
"grad_norm": 1.8667995731915865,
|
||
|
|
"learning_rate": 2.718446601941748e-05,
|
||
|
|
"loss": 0.6456,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17462482946793997,
|
||
|
|
"grad_norm": 0.5164194413667431,
|
||
|
|
"learning_rate": 2.7961165048543692e-05,
|
||
|
|
"loss": 0.6308,
|
||
|
|
"step": 36
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17947551917538274,
|
||
|
|
"grad_norm": 1.6098330943831782,
|
||
|
|
"learning_rate": 2.8737864077669905e-05,
|
||
|
|
"loss": 0.6398,
|
||
|
|
"step": 37
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18432620888282553,
|
||
|
|
"grad_norm": 1.1492129920694993,
|
||
|
|
"learning_rate": 2.951456310679612e-05,
|
||
|
|
"loss": 0.6276,
|
||
|
|
"step": 38
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18917689859026832,
|
||
|
|
"grad_norm": 0.7127614761088336,
|
||
|
|
"learning_rate": 3.0291262135922332e-05,
|
||
|
|
"loss": 0.6218,
|
||
|
|
"step": 39
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19402758829771108,
|
||
|
|
"grad_norm": 1.0312378150228299,
|
||
|
|
"learning_rate": 3.1067961165048545e-05,
|
||
|
|
"loss": 0.6253,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19887827800515387,
|
||
|
|
"grad_norm": 1.7574387168351864,
|
||
|
|
"learning_rate": 3.184466019417476e-05,
|
||
|
|
"loss": 0.6281,
|
||
|
|
"step": 41
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20372896771259663,
|
||
|
|
"grad_norm": 0.9416599187328968,
|
||
|
|
"learning_rate": 3.262135922330097e-05,
|
||
|
|
"loss": 0.6235,
|
||
|
|
"step": 42
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20857965742003942,
|
||
|
|
"grad_norm": 1.8301787236679616,
|
||
|
|
"learning_rate": 3.339805825242719e-05,
|
||
|
|
"loss": 0.622,
|
||
|
|
"step": 43
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21343034712748218,
|
||
|
|
"grad_norm": 0.9292078210446757,
|
||
|
|
"learning_rate": 3.41747572815534e-05,
|
||
|
|
"loss": 0.6107,
|
||
|
|
"step": 44
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21828103683492497,
|
||
|
|
"grad_norm": 1.994214362456412,
|
||
|
|
"learning_rate": 3.4951456310679615e-05,
|
||
|
|
"loss": 0.6151,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22313172654236774,
|
||
|
|
"grad_norm": 1.3619344540131681,
|
||
|
|
"learning_rate": 3.572815533980583e-05,
|
||
|
|
"loss": 0.6089,
|
||
|
|
"step": 46
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22798241624981053,
|
||
|
|
"grad_norm": 1.6323821476629805,
|
||
|
|
"learning_rate": 3.650485436893204e-05,
|
||
|
|
"loss": 0.6165,
|
||
|
|
"step": 47
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2328331059572533,
|
||
|
|
"grad_norm": 1.1474300438640261,
|
||
|
|
"learning_rate": 3.728155339805826e-05,
|
||
|
|
"loss": 0.6104,
|
||
|
|
"step": 48
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23768379566469608,
|
||
|
|
"grad_norm": 1.1936059623728144,
|
||
|
|
"learning_rate": 3.805825242718447e-05,
|
||
|
|
"loss": 0.6086,
|
||
|
|
"step": 49
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24253448537213884,
|
||
|
|
"grad_norm": 1.4126137333521573,
|
||
|
|
"learning_rate": 3.8834951456310685e-05,
|
||
|
|
"loss": 0.6123,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24738517507958163,
|
||
|
|
"grad_norm": 0.7835607330331523,
|
||
|
|
"learning_rate": 3.9611650485436895e-05,
|
||
|
|
"loss": 0.5979,
|
||
|
|
"step": 51
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2522358647870244,
|
||
|
|
"grad_norm": 1.3575257469759314,
|
||
|
|
"learning_rate": 4.038834951456311e-05,
|
||
|
|
"loss": 0.6134,
|
||
|
|
"step": 52
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25708655449446716,
|
||
|
|
"grad_norm": 0.9580505107131282,
|
||
|
|
"learning_rate": 4.116504854368932e-05,
|
||
|
|
"loss": 0.5987,
|
||
|
|
"step": 53
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26193724420190995,
|
||
|
|
"grad_norm": 1.1647956860260527,
|
||
|
|
"learning_rate": 4.194174757281554e-05,
|
||
|
|
"loss": 0.6134,
|
||
|
|
"step": 54
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26678793390935274,
|
||
|
|
"grad_norm": 0.9133078407230598,
|
||
|
|
"learning_rate": 4.271844660194175e-05,
|
||
|
|
"loss": 0.5978,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2716386236167955,
|
||
|
|
"grad_norm": 1.3505836368180404,
|
||
|
|
"learning_rate": 4.3495145631067965e-05,
|
||
|
|
"loss": 0.5944,
|
||
|
|
"step": 56
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2764893133242383,
|
||
|
|
"grad_norm": 1.1628512424723636,
|
||
|
|
"learning_rate": 4.4271844660194175e-05,
|
||
|
|
"loss": 0.6052,
|
||
|
|
"step": 57
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28134000303168105,
|
||
|
|
"grad_norm": 1.6421048813051027,
|
||
|
|
"learning_rate": 4.504854368932039e-05,
|
||
|
|
"loss": 0.6008,
|
||
|
|
"step": 58
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28619069273912384,
|
||
|
|
"grad_norm": 1.3019802198524983,
|
||
|
|
"learning_rate": 4.58252427184466e-05,
|
||
|
|
"loss": 0.594,
|
||
|
|
"step": 59
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29104138244656663,
|
||
|
|
"grad_norm": 1.2774402772060065,
|
||
|
|
"learning_rate": 4.660194174757282e-05,
|
||
|
|
"loss": 0.5892,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2958920721540094,
|
||
|
|
"grad_norm": 1.4435670386305743,
|
||
|
|
"learning_rate": 4.737864077669903e-05,
|
||
|
|
"loss": 0.5931,
|
||
|
|
"step": 61
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30074276186145216,
|
||
|
|
"grad_norm": 1.0284857540916943,
|
||
|
|
"learning_rate": 4.8155339805825245e-05,
|
||
|
|
"loss": 0.5932,
|
||
|
|
"step": 62
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30559345156889495,
|
||
|
|
"grad_norm": 1.1698103896183938,
|
||
|
|
"learning_rate": 4.8932038834951454e-05,
|
||
|
|
"loss": 0.5948,
|
||
|
|
"step": 63
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31044414127633774,
|
||
|
|
"grad_norm": 1.434142725222452,
|
||
|
|
"learning_rate": 4.970873786407767e-05,
|
||
|
|
"loss": 0.5897,
|
||
|
|
"step": 64
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3152948309837805,
|
||
|
|
"grad_norm": 1.3482179068151203,
|
||
|
|
"learning_rate": 5.0485436893203895e-05,
|
||
|
|
"loss": 0.5824,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32014552069122326,
|
||
|
|
"grad_norm": 1.3069227496554443,
|
||
|
|
"learning_rate": 5.1262135922330105e-05,
|
||
|
|
"loss": 0.5802,
|
||
|
|
"step": 66
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32499621039866605,
|
||
|
|
"grad_norm": 1.1366236056516827,
|
||
|
|
"learning_rate": 5.203883495145632e-05,
|
||
|
|
"loss": 0.5804,
|
||
|
|
"step": 67
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32984690010610884,
|
||
|
|
"grad_norm": 1.6243350603336242,
|
||
|
|
"learning_rate": 5.281553398058253e-05,
|
||
|
|
"loss": 0.5812,
|
||
|
|
"step": 68
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33469758981355163,
|
||
|
|
"grad_norm": 1.0822457196908746,
|
||
|
|
"learning_rate": 5.359223300970875e-05,
|
||
|
|
"loss": 0.5862,
|
||
|
|
"step": 69
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33954827952099437,
|
||
|
|
"grad_norm": 1.6360957248140573,
|
||
|
|
"learning_rate": 5.436893203883496e-05,
|
||
|
|
"loss": 0.5912,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34439896922843716,
|
||
|
|
"grad_norm": 2.3477845490488813,
|
||
|
|
"learning_rate": 5.5145631067961174e-05,
|
||
|
|
"loss": 0.5906,
|
||
|
|
"step": 71
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34924965893587995,
|
||
|
|
"grad_norm": 1.105543560593242,
|
||
|
|
"learning_rate": 5.5922330097087384e-05,
|
||
|
|
"loss": 0.5824,
|
||
|
|
"step": 72
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35410034864332274,
|
||
|
|
"grad_norm": 4.137098681881185,
|
||
|
|
"learning_rate": 5.66990291262136e-05,
|
||
|
|
"loss": 0.6359,
|
||
|
|
"step": 73
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3589510383507655,
|
||
|
|
"grad_norm": 3.8847979837997033,
|
||
|
|
"learning_rate": 5.747572815533981e-05,
|
||
|
|
"loss": 0.6486,
|
||
|
|
"step": 74
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36380172805820826,
|
||
|
|
"grad_norm": 1.1747128429519862,
|
||
|
|
"learning_rate": 5.825242718446603e-05,
|
||
|
|
"loss": 0.595,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36865241776565105,
|
||
|
|
"grad_norm": 3.008245900701061,
|
||
|
|
"learning_rate": 5.902912621359224e-05,
|
||
|
|
"loss": 0.6387,
|
||
|
|
"step": 76
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37350310747309384,
|
||
|
|
"grad_norm": 2.3713075132931554,
|
||
|
|
"learning_rate": 5.9805825242718454e-05,
|
||
|
|
"loss": 0.6344,
|
||
|
|
"step": 77
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37835379718053663,
|
||
|
|
"grad_norm": 1.6213053074921984,
|
||
|
|
"learning_rate": 6.0582524271844664e-05,
|
||
|
|
"loss": 0.6048,
|
||
|
|
"step": 78
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38320448688797937,
|
||
|
|
"grad_norm": 1.6024154837501339,
|
||
|
|
"learning_rate": 6.135922330097087e-05,
|
||
|
|
"loss": 0.6203,
|
||
|
|
"step": 79
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38805517659542216,
|
||
|
|
"grad_norm": 1.1825030677591377,
|
||
|
|
"learning_rate": 6.213592233009709e-05,
|
||
|
|
"loss": 0.6052,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39290586630286495,
|
||
|
|
"grad_norm": 1.3964525731128163,
|
||
|
|
"learning_rate": 6.291262135922331e-05,
|
||
|
|
"loss": 0.6112,
|
||
|
|
"step": 81
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39775655601030774,
|
||
|
|
"grad_norm": 1.1205074621871551,
|
||
|
|
"learning_rate": 6.368932038834952e-05,
|
||
|
|
"loss": 0.5977,
|
||
|
|
"step": 82
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4026072457177505,
|
||
|
|
"grad_norm": 1.045620374565707,
|
||
|
|
"learning_rate": 6.446601941747573e-05,
|
||
|
|
"loss": 0.5914,
|
||
|
|
"step": 83
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40745793542519326,
|
||
|
|
"grad_norm": 1.3974586249408472,
|
||
|
|
"learning_rate": 6.524271844660194e-05,
|
||
|
|
"loss": 0.5918,
|
||
|
|
"step": 84
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41230862513263605,
|
||
|
|
"grad_norm": 1.0818483302602913,
|
||
|
|
"learning_rate": 6.601941747572816e-05,
|
||
|
|
"loss": 0.5948,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41715931484007884,
|
||
|
|
"grad_norm": 0.9808456957793906,
|
||
|
|
"learning_rate": 6.679611650485438e-05,
|
||
|
|
"loss": 0.5839,
|
||
|
|
"step": 86
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4220100045475216,
|
||
|
|
"grad_norm": 1.2035779456517084,
|
||
|
|
"learning_rate": 6.757281553398058e-05,
|
||
|
|
"loss": 0.5833,
|
||
|
|
"step": 87
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42686069425496437,
|
||
|
|
"grad_norm": 1.6887623926979713,
|
||
|
|
"learning_rate": 6.83495145631068e-05,
|
||
|
|
"loss": 0.5818,
|
||
|
|
"step": 88
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43171138396240716,
|
||
|
|
"grad_norm": 0.8023218391013366,
|
||
|
|
"learning_rate": 6.912621359223301e-05,
|
||
|
|
"loss": 0.5863,
|
||
|
|
"step": 89
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43656207366984995,
|
||
|
|
"grad_norm": 1.574900348178855,
|
||
|
|
"learning_rate": 6.990291262135923e-05,
|
||
|
|
"loss": 0.5821,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4414127633772927,
|
||
|
|
"grad_norm": 0.9288518542917786,
|
||
|
|
"learning_rate": 7.067961165048545e-05,
|
||
|
|
"loss": 0.5814,
|
||
|
|
"step": 91
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4462634530847355,
|
||
|
|
"grad_norm": 1.6871845487045471,
|
||
|
|
"learning_rate": 7.145631067961166e-05,
|
||
|
|
"loss": 0.5819,
|
||
|
|
"step": 92
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45111414279217826,
|
||
|
|
"grad_norm": 1.147490028185953,
|
||
|
|
"learning_rate": 7.223300970873787e-05,
|
||
|
|
"loss": 0.5752,
|
||
|
|
"step": 93
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45596483249962105,
|
||
|
|
"grad_norm": 1.0734179177901382,
|
||
|
|
"learning_rate": 7.300970873786408e-05,
|
||
|
|
"loss": 0.5786,
|
||
|
|
"step": 94
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46081552220706384,
|
||
|
|
"grad_norm": 1.2263367009960806,
|
||
|
|
"learning_rate": 7.37864077669903e-05,
|
||
|
|
"loss": 0.5789,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4656662119145066,
|
||
|
|
"grad_norm": 1.4570032389620742,
|
||
|
|
"learning_rate": 7.456310679611652e-05,
|
||
|
|
"loss": 0.5745,
|
||
|
|
"step": 96
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47051690162194937,
|
||
|
|
"grad_norm": 1.3246870726440927,
|
||
|
|
"learning_rate": 7.533980582524272e-05,
|
||
|
|
"loss": 0.5775,
|
||
|
|
"step": 97
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47536759132939216,
|
||
|
|
"grad_norm": 0.9415857506868542,
|
||
|
|
"learning_rate": 7.611650485436894e-05,
|
||
|
|
"loss": 0.5699,
|
||
|
|
"step": 98
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48021828103683495,
|
||
|
|
"grad_norm": 1.2384384474151087,
|
||
|
|
"learning_rate": 7.689320388349515e-05,
|
||
|
|
"loss": 0.5733,
|
||
|
|
"step": 99
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4850689707442777,
|
||
|
|
"grad_norm": 1.5627749991572353,
|
||
|
|
"learning_rate": 7.766990291262137e-05,
|
||
|
|
"loss": 0.5735,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4899196604517205,
|
||
|
|
"grad_norm": 1.0078484211944914,
|
||
|
|
"learning_rate": 7.844660194174757e-05,
|
||
|
|
"loss": 0.5733,
|
||
|
|
"step": 101
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49477035015916326,
|
||
|
|
"grad_norm": 1.6421211712488573,
|
||
|
|
"learning_rate": 7.922330097087379e-05,
|
||
|
|
"loss": 0.576,
|
||
|
|
"step": 102
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49962103986660605,
|
||
|
|
"grad_norm": 0.8416126904816602,
|
||
|
|
"learning_rate": 8e-05,
|
||
|
|
"loss": 0.5697,
|
||
|
|
"step": 103
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5044717295740488,
|
||
|
|
"grad_norm": 1.7540458195736903,
|
||
|
|
"learning_rate": 7.999977029531286e-05,
|
||
|
|
"loss": 0.5799,
|
||
|
|
"step": 104
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5093224192814916,
|
||
|
|
"grad_norm": 1.050447231844734,
|
||
|
|
"learning_rate": 7.999908118388965e-05,
|
||
|
|
"loss": 0.5756,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5141731089889343,
|
||
|
|
"grad_norm": 1.394330877960354,
|
||
|
|
"learning_rate": 7.999793267364497e-05,
|
||
|
|
"loss": 0.5713,
|
||
|
|
"step": 106
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5190237986963772,
|
||
|
|
"grad_norm": 1.5873678777006228,
|
||
|
|
"learning_rate": 7.999632477776974e-05,
|
||
|
|
"loss": 0.5733,
|
||
|
|
"step": 107
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5238744884038199,
|
||
|
|
"grad_norm": 0.8115732123781836,
|
||
|
|
"learning_rate": 7.9994257514731e-05,
|
||
|
|
"loss": 0.5661,
|
||
|
|
"step": 108
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5287251781112627,
|
||
|
|
"grad_norm": 1.1857754936411384,
|
||
|
|
"learning_rate": 7.999173090827177e-05,
|
||
|
|
"loss": 0.5719,
|
||
|
|
"step": 109
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5335758678187055,
|
||
|
|
"grad_norm": 0.8066157800150973,
|
||
|
|
"learning_rate": 7.998874498741072e-05,
|
||
|
|
"loss": 0.5695,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5384265575261482,
|
||
|
|
"grad_norm": 1.4739854945603235,
|
||
|
|
"learning_rate": 7.998529978644183e-05,
|
||
|
|
"loss": 0.5712,
|
||
|
|
"step": 111
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.543277247233591,
|
||
|
|
"grad_norm": 0.778875512416499,
|
||
|
|
"learning_rate": 7.998139534493407e-05,
|
||
|
|
"loss": 0.5609,
|
||
|
|
"step": 112
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5481279369410338,
|
||
|
|
"grad_norm": 0.9919152170469479,
|
||
|
|
"learning_rate": 7.997703170773084e-05,
|
||
|
|
"loss": 0.5648,
|
||
|
|
"step": 113
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5529786266484766,
|
||
|
|
"grad_norm": 1.3093959851041357,
|
||
|
|
"learning_rate": 7.997220892494955e-05,
|
||
|
|
"loss": 0.5757,
|
||
|
|
"step": 114
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5578293163559194,
|
||
|
|
"grad_norm": 0.9066117120369992,
|
||
|
|
"learning_rate": 7.996692705198097e-05,
|
||
|
|
"loss": 0.566,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5626800060633621,
|
||
|
|
"grad_norm": 1.2498381901490132,
|
||
|
|
"learning_rate": 7.996118614948869e-05,
|
||
|
|
"loss": 0.5757,
|
||
|
|
"step": 116
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.567530695770805,
|
||
|
|
"grad_norm": 1.0385189418340641,
|
||
|
|
"learning_rate": 7.995498628340827e-05,
|
||
|
|
"loss": 0.5697,
|
||
|
|
"step": 117
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5723813854782477,
|
||
|
|
"grad_norm": 1.3065564110858372,
|
||
|
|
"learning_rate": 7.994832752494667e-05,
|
||
|
|
"loss": 0.5672,
|
||
|
|
"step": 118
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5772320751856904,
|
||
|
|
"grad_norm": 0.9524061899396458,
|
||
|
|
"learning_rate": 7.994120995058127e-05,
|
||
|
|
"loss": 0.5624,
|
||
|
|
"step": 119
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5820827648931333,
|
||
|
|
"grad_norm": 1.018979775317657,
|
||
|
|
"learning_rate": 7.993363364205907e-05,
|
||
|
|
"loss": 0.554,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.586933454600576,
|
||
|
|
"grad_norm": 0.9276898951496916,
|
||
|
|
"learning_rate": 7.992559868639576e-05,
|
||
|
|
"loss": 0.556,
|
||
|
|
"step": 121
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5917841443080188,
|
||
|
|
"grad_norm": 1.1930439188526805,
|
||
|
|
"learning_rate": 7.99171051758747e-05,
|
||
|
|
"loss": 0.5526,
|
||
|
|
"step": 122
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5966348340154616,
|
||
|
|
"grad_norm": 0.6826863389197481,
|
||
|
|
"learning_rate": 7.990815320804583e-05,
|
||
|
|
"loss": 0.5609,
|
||
|
|
"step": 123
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6014855237229043,
|
||
|
|
"grad_norm": 0.8061980858751947,
|
||
|
|
"learning_rate": 7.98987428857246e-05,
|
||
|
|
"loss": 0.5586,
|
||
|
|
"step": 124
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6063362134303472,
|
||
|
|
"grad_norm": 0.9239948149603757,
|
||
|
|
"learning_rate": 7.988887431699079e-05,
|
||
|
|
"loss": 0.5507,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6111869031377899,
|
||
|
|
"grad_norm": 1.0465957889843347,
|
||
|
|
"learning_rate": 7.987854761518719e-05,
|
||
|
|
"loss": 0.5568,
|
||
|
|
"step": 126
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6160375928452326,
|
||
|
|
"grad_norm": 0.8308130357408615,
|
||
|
|
"learning_rate": 7.986776289891842e-05,
|
||
|
|
"loss": 0.5591,
|
||
|
|
"step": 127
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6208882825526755,
|
||
|
|
"grad_norm": 1.0246744766037437,
|
||
|
|
"learning_rate": 7.985652029204946e-05,
|
||
|
|
"loss": 0.5563,
|
||
|
|
"step": 128
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6257389722601182,
|
||
|
|
"grad_norm": 1.3238612418839921,
|
||
|
|
"learning_rate": 7.984481992370429e-05,
|
||
|
|
"loss": 0.5491,
|
||
|
|
"step": 129
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.630589661967561,
|
||
|
|
"grad_norm": 0.6948258976249133,
|
||
|
|
"learning_rate": 7.983266192826437e-05,
|
||
|
|
"loss": 0.5418,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6354403516750038,
|
||
|
|
"grad_norm": 0.7288790333090353,
|
||
|
|
"learning_rate": 7.982004644536716e-05,
|
||
|
|
"loss": 0.5441,
|
||
|
|
"step": 131
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6402910413824465,
|
||
|
|
"grad_norm": 1.0943204811023435,
|
||
|
|
"learning_rate": 7.98069736199044e-05,
|
||
|
|
"loss": 0.5493,
|
||
|
|
"step": 132
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6451417310898894,
|
||
|
|
"grad_norm": 0.8706827515570799,
|
||
|
|
"learning_rate": 7.979344360202055e-05,
|
||
|
|
"loss": 0.5465,
|
||
|
|
"step": 133
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6499924207973321,
|
||
|
|
"grad_norm": 0.9127380693761118,
|
||
|
|
"learning_rate": 7.977945654711108e-05,
|
||
|
|
"loss": 0.5475,
|
||
|
|
"step": 134
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.654843110504775,
|
||
|
|
"grad_norm": 0.9510188119086359,
|
||
|
|
"learning_rate": 7.976501261582056e-05,
|
||
|
|
"loss": 0.543,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6596938002122177,
|
||
|
|
"grad_norm": 0.9350726421156861,
|
||
|
|
"learning_rate": 7.975011197404092e-05,
|
||
|
|
"loss": 0.5525,
|
||
|
|
"step": 136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6645444899196604,
|
||
|
|
"grad_norm": 1.272258773056705,
|
||
|
|
"learning_rate": 7.973475479290956e-05,
|
||
|
|
"loss": 0.5518,
|
||
|
|
"step": 137
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6693951796271033,
|
||
|
|
"grad_norm": 0.9765265325518906,
|
||
|
|
"learning_rate": 7.971894124880727e-05,
|
||
|
|
"loss": 0.5417,
|
||
|
|
"step": 138
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.674245869334546,
|
||
|
|
"grad_norm": 1.1917501609756302,
|
||
|
|
"learning_rate": 7.970267152335632e-05,
|
||
|
|
"loss": 0.5464,
|
||
|
|
"step": 139
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6790965590419887,
|
||
|
|
"grad_norm": 0.6719257463868904,
|
||
|
|
"learning_rate": 7.968594580341832e-05,
|
||
|
|
"loss": 0.544,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6839472487494316,
|
||
|
|
"grad_norm": 0.7137208028607956,
|
||
|
|
"learning_rate": 7.966876428109209e-05,
|
||
|
|
"loss": 0.5351,
|
||
|
|
"step": 141
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6887979384568743,
|
||
|
|
"grad_norm": 0.7449688014358767,
|
||
|
|
"learning_rate": 7.965112715371144e-05,
|
||
|
|
"loss": 0.5397,
|
||
|
|
"step": 142
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6936486281643172,
|
||
|
|
"grad_norm": 0.5377305872108858,
|
||
|
|
"learning_rate": 7.96330346238429e-05,
|
||
|
|
"loss": 0.5346,
|
||
|
|
"step": 143
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6984993178717599,
|
||
|
|
"grad_norm": 0.5856757070627496,
|
||
|
|
"learning_rate": 7.961448689928341e-05,
|
||
|
|
"loss": 0.5395,
|
||
|
|
"step": 144
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7033500075792026,
|
||
|
|
"grad_norm": 0.6000325566823206,
|
||
|
|
"learning_rate": 7.959548419305796e-05,
|
||
|
|
"loss": 0.5447,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7082006972866455,
|
||
|
|
"grad_norm": 0.7819963257560868,
|
||
|
|
"learning_rate": 7.957602672341707e-05,
|
||
|
|
"loss": 0.5364,
|
||
|
|
"step": 146
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7130513869940882,
|
||
|
|
"grad_norm": 1.1223449508846108,
|
||
|
|
"learning_rate": 7.955611471383433e-05,
|
||
|
|
"loss": 0.5381,
|
||
|
|
"step": 147
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.717902076701531,
|
||
|
|
"grad_norm": 1.1418981667975974,
|
||
|
|
"learning_rate": 7.953574839300385e-05,
|
||
|
|
"loss": 0.5381,
|
||
|
|
"step": 148
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7227527664089738,
|
||
|
|
"grad_norm": 0.758286759296052,
|
||
|
|
"learning_rate": 7.95149279948376e-05,
|
||
|
|
"loss": 0.5398,
|
||
|
|
"step": 149
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7276034561164165,
|
||
|
|
"grad_norm": 0.7637204957772546,
|
||
|
|
"learning_rate": 7.949365375846271e-05,
|
||
|
|
"loss": 0.5386,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7324541458238594,
|
||
|
|
"grad_norm": 0.6982030938329856,
|
||
|
|
"learning_rate": 7.94719259282188e-05,
|
||
|
|
"loss": 0.5328,
|
||
|
|
"step": 151
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7373048355313021,
|
||
|
|
"grad_norm": 0.7115887055025976,
|
||
|
|
"learning_rate": 7.944974475365506e-05,
|
||
|
|
"loss": 0.5406,
|
||
|
|
"step": 152
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7421555252387448,
|
||
|
|
"grad_norm": 0.6914212445412167,
|
||
|
|
"learning_rate": 7.94271104895275e-05,
|
||
|
|
"loss": 0.5375,
|
||
|
|
"step": 153
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7470062149461877,
|
||
|
|
"grad_norm": 0.6376946136665823,
|
||
|
|
"learning_rate": 7.940402339579596e-05,
|
||
|
|
"loss": 0.5322,
|
||
|
|
"step": 154
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7518569046536304,
|
||
|
|
"grad_norm": 0.662468788270689,
|
||
|
|
"learning_rate": 7.93804837376211e-05,
|
||
|
|
"loss": 0.5312,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7567075943610733,
|
||
|
|
"grad_norm": 0.8116591959883654,
|
||
|
|
"learning_rate": 7.935649178536142e-05,
|
||
|
|
"loss": 0.5362,
|
||
|
|
"step": 156
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.761558284068516,
|
||
|
|
"grad_norm": 1.2251954995336705,
|
||
|
|
"learning_rate": 7.93320478145701e-05,
|
||
|
|
"loss": 0.5454,
|
||
|
|
"step": 157
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7664089737759587,
|
||
|
|
"grad_norm": 1.1753293382340935,
|
||
|
|
"learning_rate": 7.93071521059919e-05,
|
||
|
|
"loss": 0.5369,
|
||
|
|
"step": 158
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7712596634834016,
|
||
|
|
"grad_norm": 0.5797209510428332,
|
||
|
|
"learning_rate": 7.928180494555983e-05,
|
||
|
|
"loss": 0.5255,
|
||
|
|
"step": 159
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7761103531908443,
|
||
|
|
"grad_norm": 0.9260629876609666,
|
||
|
|
"learning_rate": 7.925600662439201e-05,
|
||
|
|
"loss": 0.535,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.780961042898287,
|
||
|
|
"grad_norm": 1.2828815170548864,
|
||
|
|
"learning_rate": 7.922975743878817e-05,
|
||
|
|
"loss": 0.5293,
|
||
|
|
"step": 161
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7858117326057299,
|
||
|
|
"grad_norm": 0.5171701751512903,
|
||
|
|
"learning_rate": 7.92030576902264e-05,
|
||
|
|
"loss": 0.5298,
|
||
|
|
"step": 162
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7906624223131726,
|
||
|
|
"grad_norm": 0.9531992162184508,
|
||
|
|
"learning_rate": 7.917590768535952e-05,
|
||
|
|
"loss": 0.5315,
|
||
|
|
"step": 163
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7955131120206155,
|
||
|
|
"grad_norm": 1.1106333536493764,
|
||
|
|
"learning_rate": 7.914830773601173e-05,
|
||
|
|
"loss": 0.5279,
|
||
|
|
"step": 164
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8003638017280582,
|
||
|
|
"grad_norm": 0.45675301699833715,
|
||
|
|
"learning_rate": 7.912025815917489e-05,
|
||
|
|
"loss": 0.5372,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.805214491435501,
|
||
|
|
"grad_norm": 0.7189143844280501,
|
||
|
|
"learning_rate": 7.909175927700499e-05,
|
||
|
|
"loss": 0.535,
|
||
|
|
"step": 166
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8100651811429438,
|
||
|
|
"grad_norm": 0.8686823215812186,
|
||
|
|
"learning_rate": 7.906281141681839e-05,
|
||
|
|
"loss": 0.5333,
|
||
|
|
"step": 167
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8149158708503865,
|
||
|
|
"grad_norm": 0.588440050652225,
|
||
|
|
"learning_rate": 7.903341491108798e-05,
|
||
|
|
"loss": 0.5289,
|
||
|
|
"step": 168
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8197665605578294,
|
||
|
|
"grad_norm": 0.563873615495661,
|
||
|
|
"learning_rate": 7.900357009743958e-05,
|
||
|
|
"loss": 0.5331,
|
||
|
|
"step": 169
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8246172502652721,
|
||
|
|
"grad_norm": 0.545022127050129,
|
||
|
|
"learning_rate": 7.897327731864784e-05,
|
||
|
|
"loss": 0.5266,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8294679399727148,
|
||
|
|
"grad_norm": 0.5592040867673563,
|
||
|
|
"learning_rate": 7.894253692263244e-05,
|
||
|
|
"loss": 0.522,
|
||
|
|
"step": 171
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8343186296801577,
|
||
|
|
"grad_norm": 0.6324827822327501,
|
||
|
|
"learning_rate": 7.891134926245402e-05,
|
||
|
|
"loss": 0.5297,
|
||
|
|
"step": 172
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8391693193876004,
|
||
|
|
"grad_norm": 0.8244822385641454,
|
||
|
|
"learning_rate": 7.887971469631016e-05,
|
||
|
|
"loss": 0.5319,
|
||
|
|
"step": 173
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8440200090950432,
|
||
|
|
"grad_norm": 1.2087031591715138,
|
||
|
|
"learning_rate": 7.884763358753129e-05,
|
||
|
|
"loss": 0.5408,
|
||
|
|
"step": 174
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.848870698802486,
|
||
|
|
"grad_norm": 0.7427926592130248,
|
||
|
|
"learning_rate": 7.881510630457643e-05,
|
||
|
|
"loss": 0.5326,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8537213885099287,
|
||
|
|
"grad_norm": 0.5074413186000275,
|
||
|
|
"learning_rate": 7.878213322102908e-05,
|
||
|
|
"loss": 0.5281,
|
||
|
|
"step": 176
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8585720782173716,
|
||
|
|
"grad_norm": 0.7191422226961306,
|
||
|
|
"learning_rate": 7.874871471559282e-05,
|
||
|
|
"loss": 0.5269,
|
||
|
|
"step": 177
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8634227679248143,
|
||
|
|
"grad_norm": 0.7351841845171684,
|
||
|
|
"learning_rate": 7.8714851172087e-05,
|
||
|
|
"loss": 0.5326,
|
||
|
|
"step": 178
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.868273457632257,
|
||
|
|
"grad_norm": 0.661668481852005,
|
||
|
|
"learning_rate": 7.868054297944237e-05,
|
||
|
|
"loss": 0.5312,
|
||
|
|
"step": 179
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8731241473396999,
|
||
|
|
"grad_norm": 0.6257843185205204,
|
||
|
|
"learning_rate": 7.864579053169657e-05,
|
||
|
|
"loss": 0.5265,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8779748370471426,
|
||
|
|
"grad_norm": 0.6272508186215946,
|
||
|
|
"learning_rate": 7.86105942279896e-05,
|
||
|
|
"loss": 0.5242,
|
||
|
|
"step": 181
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8828255267545854,
|
||
|
|
"grad_norm": 0.5859524441087289,
|
||
|
|
"learning_rate": 7.857495447255925e-05,
|
||
|
|
"loss": 0.5117,
|
||
|
|
"step": 182
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8876762164620282,
|
||
|
|
"grad_norm": 0.5314342493506371,
|
||
|
|
"learning_rate": 7.853887167473646e-05,
|
||
|
|
"loss": 0.5275,
|
||
|
|
"step": 183
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.892526906169471,
|
||
|
|
"grad_norm": 0.5894396606474899,
|
||
|
|
"learning_rate": 7.850234624894064e-05,
|
||
|
|
"loss": 0.5236,
|
||
|
|
"step": 184
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8973775958769138,
|
||
|
|
"grad_norm": 0.7789309413273731,
|
||
|
|
"learning_rate": 7.846537861467485e-05,
|
||
|
|
"loss": 0.5269,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9022282855843565,
|
||
|
|
"grad_norm": 0.9749399305462054,
|
||
|
|
"learning_rate": 7.842796919652104e-05,
|
||
|
|
"loss": 0.5177,
|
||
|
|
"step": 186
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9070789752917993,
|
||
|
|
"grad_norm": 0.8944031183061603,
|
||
|
|
"learning_rate": 7.839011842413514e-05,
|
||
|
|
"loss": 0.5236,
|
||
|
|
"step": 187
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9119296649992421,
|
||
|
|
"grad_norm": 0.6072136307213187,
|
||
|
|
"learning_rate": 7.835182673224212e-05,
|
||
|
|
"loss": 0.5237,
|
||
|
|
"step": 188
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9167803547066848,
|
||
|
|
"grad_norm": 0.44815995922956803,
|
||
|
|
"learning_rate": 7.831309456063107e-05,
|
||
|
|
"loss": 0.5193,
|
||
|
|
"step": 189
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9216310444141277,
|
||
|
|
"grad_norm": 0.5362363058315522,
|
||
|
|
"learning_rate": 7.827392235415005e-05,
|
||
|
|
"loss": 0.5242,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9264817341215704,
|
||
|
|
"grad_norm": 0.5675209533538658,
|
||
|
|
"learning_rate": 7.823431056270103e-05,
|
||
|
|
"loss": 0.5223,
|
||
|
|
"step": 191
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9313324238290132,
|
||
|
|
"grad_norm": 0.5087214285921436,
|
||
|
|
"learning_rate": 7.81942596412347e-05,
|
||
|
|
"loss": 0.522,
|
||
|
|
"step": 192
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.936183113536456,
|
||
|
|
"grad_norm": 0.46687413642954234,
|
||
|
|
"learning_rate": 7.815377004974532e-05,
|
||
|
|
"loss": 0.509,
|
||
|
|
"step": 193
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9410338032438987,
|
||
|
|
"grad_norm": 0.43127006261715695,
|
||
|
|
"learning_rate": 7.811284225326529e-05,
|
||
|
|
"loss": 0.522,
|
||
|
|
"step": 194
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9458844929513415,
|
||
|
|
"grad_norm": 0.43502529088952246,
|
||
|
|
"learning_rate": 7.807147672185996e-05,
|
||
|
|
"loss": 0.5258,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9507351826587843,
|
||
|
|
"grad_norm": 0.5247431038277521,
|
||
|
|
"learning_rate": 7.802967393062219e-05,
|
||
|
|
"loss": 0.524,
|
||
|
|
"step": 196
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.955585872366227,
|
||
|
|
"grad_norm": 0.6873590048364063,
|
||
|
|
"learning_rate": 7.798743435966676e-05,
|
||
|
|
"loss": 0.5227,
|
||
|
|
"step": 197
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9604365620736699,
|
||
|
|
"grad_norm": 0.8978763261543932,
|
||
|
|
"learning_rate": 7.794475849412512e-05,
|
||
|
|
"loss": 0.5143,
|
||
|
|
"step": 198
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9652872517811126,
|
||
|
|
"grad_norm": 0.8491329782406589,
|
||
|
|
"learning_rate": 7.790164682413954e-05,
|
||
|
|
"loss": 0.5186,
|
||
|
|
"step": 199
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9701379414885554,
|
||
|
|
"grad_norm": 0.5004070361840594,
|
||
|
|
"learning_rate": 7.785809984485765e-05,
|
||
|
|
"loss": 0.5185,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9749886311959982,
|
||
|
|
"grad_norm": 0.4620691718653308,
|
||
|
|
"learning_rate": 7.781411805642675e-05,
|
||
|
|
"loss": 0.5179,
|
||
|
|
"step": 201
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.979839320903441,
|
||
|
|
"grad_norm": 0.6078145800613444,
|
||
|
|
"learning_rate": 7.776970196398795e-05,
|
||
|
|
"loss": 0.5185,
|
||
|
|
"step": 202
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9846900106108837,
|
||
|
|
"grad_norm": 0.6744449960922595,
|
||
|
|
"learning_rate": 7.77248520776705e-05,
|
||
|
|
"loss": 0.5223,
|
||
|
|
"step": 203
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9895407003183265,
|
||
|
|
"grad_norm": 0.769858159356013,
|
||
|
|
"learning_rate": 7.767956891258585e-05,
|
||
|
|
"loss": 0.514,
|
||
|
|
"step": 204
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9943913900257693,
|
||
|
|
"grad_norm": 0.9703823352556248,
|
||
|
|
"learning_rate": 7.763385298882177e-05,
|
||
|
|
"loss": 0.5227,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9992420797332121,
|
||
|
|
"grad_norm": 1.0790500827883502,
|
||
|
|
"learning_rate": 7.758770483143634e-05,
|
||
|
|
"loss": 0.5161,
|
||
|
|
"step": 206
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0048506897074427,
|
||
|
|
"grad_norm": 0.979399858326228,
|
||
|
|
"learning_rate": 7.754112497045198e-05,
|
||
|
|
"loss": 0.5126,
|
||
|
|
"step": 207
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0097013794148855,
|
||
|
|
"grad_norm": 0.8061955074549858,
|
||
|
|
"learning_rate": 7.749411394084931e-05,
|
||
|
|
"loss": 0.5093,
|
||
|
|
"step": 208
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0145520691223284,
|
||
|
|
"grad_norm": 0.769561002371883,
|
||
|
|
"learning_rate": 7.744667228256102e-05,
|
||
|
|
"loss": 0.5129,
|
||
|
|
"step": 209
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0194027588297712,
|
||
|
|
"grad_norm": 0.7563794216226459,
|
||
|
|
"learning_rate": 7.739880054046567e-05,
|
||
|
|
"loss": 0.504,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.024253448537214,
|
||
|
|
"grad_norm": 0.4861095042439798,
|
||
|
|
"learning_rate": 7.735049926438143e-05,
|
||
|
|
"loss": 0.5008,
|
||
|
|
"step": 211
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0291041382446566,
|
||
|
|
"grad_norm": 0.521763707621908,
|
||
|
|
"learning_rate": 7.730176900905978e-05,
|
||
|
|
"loss": 0.5039,
|
||
|
|
"step": 212
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0339548279520994,
|
||
|
|
"grad_norm": 0.6847654097095645,
|
||
|
|
"learning_rate": 7.725261033417914e-05,
|
||
|
|
"loss": 0.4987,
|
||
|
|
"step": 213
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0388055176595423,
|
||
|
|
"grad_norm": 0.45617035587869154,
|
||
|
|
"learning_rate": 7.720302380433838e-05,
|
||
|
|
"loss": 0.5082,
|
||
|
|
"step": 214
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.043656207366985,
|
||
|
|
"grad_norm": 0.5136841747695677,
|
||
|
|
"learning_rate": 7.715300998905045e-05,
|
||
|
|
"loss": 0.4903,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0485068970744278,
|
||
|
|
"grad_norm": 0.5201029165395914,
|
||
|
|
"learning_rate": 7.710256946273572e-05,
|
||
|
|
"loss": 0.5061,
|
||
|
|
"step": 216
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0533575867818705,
|
||
|
|
"grad_norm": 0.5369451831716502,
|
||
|
|
"learning_rate": 7.705170280471546e-05,
|
||
|
|
"loss": 0.4923,
|
||
|
|
"step": 217
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0582082764893133,
|
||
|
|
"grad_norm": 0.5292650633659572,
|
||
|
|
"learning_rate": 7.700041059920516e-05,
|
||
|
|
"loss": 0.4958,
|
||
|
|
"step": 218
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0630589661967562,
|
||
|
|
"grad_norm": 0.38347803199441816,
|
||
|
|
"learning_rate": 7.694869343530781e-05,
|
||
|
|
"loss": 0.4949,
|
||
|
|
"step": 219
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.067909655904199,
|
||
|
|
"grad_norm": 0.42205008813365347,
|
||
|
|
"learning_rate": 7.689655190700719e-05,
|
||
|
|
"loss": 0.4958,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0727603456116417,
|
||
|
|
"grad_norm": 0.43326896920779806,
|
||
|
|
"learning_rate": 7.684398661316092e-05,
|
||
|
|
"loss": 0.5034,
|
||
|
|
"step": 221
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0776110353190844,
|
||
|
|
"grad_norm": 0.3610486041319715,
|
||
|
|
"learning_rate": 7.679099815749377e-05,
|
||
|
|
"loss": 0.508,
|
||
|
|
"step": 222
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0824617250265272,
|
||
|
|
"grad_norm": 0.4631910613487445,
|
||
|
|
"learning_rate": 7.673758714859052e-05,
|
||
|
|
"loss": 0.5012,
|
||
|
|
"step": 223
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.08731241473397,
|
||
|
|
"grad_norm": 0.5933537887890846,
|
||
|
|
"learning_rate": 7.668375419988918e-05,
|
||
|
|
"loss": 0.5003,
|
||
|
|
"step": 224
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0921631044414128,
|
||
|
|
"grad_norm": 0.7536616463697116,
|
||
|
|
"learning_rate": 7.662949992967375e-05,
|
||
|
|
"loss": 0.4984,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0970137941488556,
|
||
|
|
"grad_norm": 0.8908783317227219,
|
||
|
|
"learning_rate": 7.657482496106725e-05,
|
||
|
|
"loss": 0.499,
|
||
|
|
"step": 226
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1018644838562983,
|
||
|
|
"grad_norm": 0.8949365262495667,
|
||
|
|
"learning_rate": 7.651972992202449e-05,
|
||
|
|
"loss": 0.4964,
|
||
|
|
"step": 227
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.106715173563741,
|
||
|
|
"grad_norm": 0.7666080530207662,
|
||
|
|
"learning_rate": 7.646421544532492e-05,
|
||
|
|
"loss": 0.501,
|
||
|
|
"step": 228
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1115658632711838,
|
||
|
|
"grad_norm": 0.5974548399149405,
|
||
|
|
"learning_rate": 7.640828216856532e-05,
|
||
|
|
"loss": 0.5019,
|
||
|
|
"step": 229
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1164165529786267,
|
||
|
|
"grad_norm": 0.5208576739553936,
|
||
|
|
"learning_rate": 7.635193073415246e-05,
|
||
|
|
"loss": 0.4954,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1212672426860695,
|
||
|
|
"grad_norm": 0.4449218126901089,
|
||
|
|
"learning_rate": 7.62951617892958e-05,
|
||
|
|
"loss": 0.4966,
|
||
|
|
"step": 231
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1261179323935122,
|
||
|
|
"grad_norm": 0.3413546355163314,
|
||
|
|
"learning_rate": 7.623797598599995e-05,
|
||
|
|
"loss": 0.4869,
|
||
|
|
"step": 232
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.130968622100955,
|
||
|
|
"grad_norm": 0.4021494906350838,
|
||
|
|
"learning_rate": 7.618037398105728e-05,
|
||
|
|
"loss": 0.4876,
|
||
|
|
"step": 233
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1358193118083977,
|
||
|
|
"grad_norm": 0.5401093810240136,
|
||
|
|
"learning_rate": 7.612235643604031e-05,
|
||
|
|
"loss": 0.495,
|
||
|
|
"step": 234
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1406700015158404,
|
||
|
|
"grad_norm": 0.6450843987922678,
|
||
|
|
"learning_rate": 7.606392401729415e-05,
|
||
|
|
"loss": 0.4953,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1455206912232834,
|
||
|
|
"grad_norm": 0.6413460171528268,
|
||
|
|
"learning_rate": 7.600507739592879e-05,
|
||
|
|
"loss": 0.4972,
|
||
|
|
"step": 236
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.150371380930726,
|
||
|
|
"grad_norm": 0.6276158556341717,
|
||
|
|
"learning_rate": 7.594581724781152e-05,
|
||
|
|
"loss": 0.4957,
|
||
|
|
"step": 237
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1552220706381688,
|
||
|
|
"grad_norm": 0.7553018024510589,
|
||
|
|
"learning_rate": 7.588614425355898e-05,
|
||
|
|
"loss": 0.4955,
|
||
|
|
"step": 238
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1600727603456116,
|
||
|
|
"grad_norm": 0.8968837770832118,
|
||
|
|
"learning_rate": 7.582605909852951e-05,
|
||
|
|
"loss": 0.4937,
|
||
|
|
"step": 239
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1649234500530543,
|
||
|
|
"grad_norm": 0.7618870978199445,
|
||
|
|
"learning_rate": 7.576556247281522e-05,
|
||
|
|
"loss": 0.4969,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1697741397604973,
|
||
|
|
"grad_norm": 0.6531717186517569,
|
||
|
|
"learning_rate": 7.570465507123401e-05,
|
||
|
|
"loss": 0.5042,
|
||
|
|
"step": 241
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.17462482946794,
|
||
|
|
"grad_norm": 0.591515239054212,
|
||
|
|
"learning_rate": 7.564333759332167e-05,
|
||
|
|
"loss": 0.4906,
|
||
|
|
"step": 242
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1794755191753827,
|
||
|
|
"grad_norm": 0.49797581923871925,
|
||
|
|
"learning_rate": 7.558161074332379e-05,
|
||
|
|
"loss": 0.4966,
|
||
|
|
"step": 243
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1843262088828255,
|
||
|
|
"grad_norm": 0.5865910810734263,
|
||
|
|
"learning_rate": 7.551947523018774e-05,
|
||
|
|
"loss": 0.4997,
|
||
|
|
"step": 244
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1891768985902682,
|
||
|
|
"grad_norm": 0.5003536808662635,
|
||
|
|
"learning_rate": 7.54569317675544e-05,
|
||
|
|
"loss": 0.4954,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1940275882977112,
|
||
|
|
"grad_norm": 0.4328856052217075,
|
||
|
|
"learning_rate": 7.539398107375015e-05,
|
||
|
|
"loss": 0.4979,
|
||
|
|
"step": 246
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.198878278005154,
|
||
|
|
"grad_norm": 0.5623661241326378,
|
||
|
|
"learning_rate": 7.533062387177843e-05,
|
||
|
|
"loss": 0.4982,
|
||
|
|
"step": 247
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2037289677125966,
|
||
|
|
"grad_norm": 0.542587238982675,
|
||
|
|
"learning_rate": 7.526686088931156e-05,
|
||
|
|
"loss": 0.4991,
|
||
|
|
"step": 248
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2085796574200394,
|
||
|
|
"grad_norm": 0.4949553127282243,
|
||
|
|
"learning_rate": 7.520269285868235e-05,
|
||
|
|
"loss": 0.4908,
|
||
|
|
"step": 249
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.213430347127482,
|
||
|
|
"grad_norm": 0.39917468694971237,
|
||
|
|
"learning_rate": 7.513812051687564e-05,
|
||
|
|
"loss": 0.4917,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.218281036834925,
|
||
|
|
"grad_norm": 0.44831825611716425,
|
||
|
|
"learning_rate": 7.507314460551993e-05,
|
||
|
|
"loss": 0.4898,
|
||
|
|
"step": 251
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2231317265423678,
|
||
|
|
"grad_norm": 0.4694728642442923,
|
||
|
|
"learning_rate": 7.500776587087878e-05,
|
||
|
|
"loss": 0.4929,
|
||
|
|
"step": 252
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2279824162498105,
|
||
|
|
"grad_norm": 0.4548469944052975,
|
||
|
|
"learning_rate": 7.494198506384229e-05,
|
||
|
|
"loss": 0.4826,
|
||
|
|
"step": 253
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2328331059572533,
|
||
|
|
"grad_norm": 0.6666754202357293,
|
||
|
|
"learning_rate": 7.487580293991844e-05,
|
||
|
|
"loss": 0.5021,
|
||
|
|
"step": 254
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.237683795664696,
|
||
|
|
"grad_norm": 0.6791995766586792,
|
||
|
|
"learning_rate": 7.480922025922443e-05,
|
||
|
|
"loss": 0.4974,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.242534485372139,
|
||
|
|
"grad_norm": 0.48795487898431833,
|
||
|
|
"learning_rate": 7.474223778647796e-05,
|
||
|
|
"loss": 0.4934,
|
||
|
|
"step": 256
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2473851750795817,
|
||
|
|
"grad_norm": 0.32007742081193336,
|
||
|
|
"learning_rate": 7.467485629098842e-05,
|
||
|
|
"loss": 0.4907,
|
||
|
|
"step": 257
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2522358647870244,
|
||
|
|
"grad_norm": 0.2948779105297471,
|
||
|
|
"learning_rate": 7.460707654664807e-05,
|
||
|
|
"loss": 0.4974,
|
||
|
|
"step": 258
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2570865544944672,
|
||
|
|
"grad_norm": 0.2954711132741566,
|
||
|
|
"learning_rate": 7.453889933192316e-05,
|
||
|
|
"loss": 0.4893,
|
||
|
|
"step": 259
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.26193724420191,
|
||
|
|
"grad_norm": 0.3234156565448516,
|
||
|
|
"learning_rate": 7.447032542984502e-05,
|
||
|
|
"loss": 0.4882,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2667879339093528,
|
||
|
|
"grad_norm": 0.37736907562809757,
|
||
|
|
"learning_rate": 7.440135562800093e-05,
|
||
|
|
"loss": 0.487,
|
||
|
|
"step": 261
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2716386236167956,
|
||
|
|
"grad_norm": 0.4380964102542513,
|
||
|
|
"learning_rate": 7.433199071852526e-05,
|
||
|
|
"loss": 0.4965,
|
||
|
|
"step": 262
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2764893133242383,
|
||
|
|
"grad_norm": 0.5377640268616285,
|
||
|
|
"learning_rate": 7.426223149809023e-05,
|
||
|
|
"loss": 0.4922,
|
||
|
|
"step": 263
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.281340003031681,
|
||
|
|
"grad_norm": 0.5447192325063156,
|
||
|
|
"learning_rate": 7.419207876789685e-05,
|
||
|
|
"loss": 0.4844,
|
||
|
|
"step": 264
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2861906927391238,
|
||
|
|
"grad_norm": 0.5569387883590096,
|
||
|
|
"learning_rate": 7.412153333366567e-05,
|
||
|
|
"loss": 0.4887,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2910413824465667,
|
||
|
|
"grad_norm": 0.6127559785319273,
|
||
|
|
"learning_rate": 7.405059600562751e-05,
|
||
|
|
"loss": 0.4974,
|
||
|
|
"step": 266
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2958920721540095,
|
||
|
|
"grad_norm": 0.5372861059475705,
|
||
|
|
"learning_rate": 7.397926759851425e-05,
|
||
|
|
"loss": 0.4946,
|
||
|
|
"step": 267
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3007427618614522,
|
||
|
|
"grad_norm": 0.3715441206816088,
|
||
|
|
"learning_rate": 7.390754893154933e-05,
|
||
|
|
"loss": 0.4914,
|
||
|
|
"step": 268
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.305593451568895,
|
||
|
|
"grad_norm": 0.3468457624853246,
|
||
|
|
"learning_rate": 7.383544082843846e-05,
|
||
|
|
"loss": 0.4906,
|
||
|
|
"step": 269
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3104441412763377,
|
||
|
|
"grad_norm": 0.37917435116867076,
|
||
|
|
"learning_rate": 7.376294411736009e-05,
|
||
|
|
"loss": 0.4877,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3152948309837806,
|
||
|
|
"grad_norm": 0.44133896758740837,
|
||
|
|
"learning_rate": 7.369005963095596e-05,
|
||
|
|
"loss": 0.4962,
|
||
|
|
"step": 271
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3201455206912232,
|
||
|
|
"grad_norm": 0.5453886771483799,
|
||
|
|
"learning_rate": 7.361678820632145e-05,
|
||
|
|
"loss": 0.4918,
|
||
|
|
"step": 272
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.324996210398666,
|
||
|
|
"grad_norm": 0.6036321305780495,
|
||
|
|
"learning_rate": 7.354313068499607e-05,
|
||
|
|
"loss": 0.4892,
|
||
|
|
"step": 273
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3298469001061088,
|
||
|
|
"grad_norm": 0.620838391242342,
|
||
|
|
"learning_rate": 7.346908791295369e-05,
|
||
|
|
"loss": 0.495,
|
||
|
|
"step": 274
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3346975898135516,
|
||
|
|
"grad_norm": 0.5848284090531766,
|
||
|
|
"learning_rate": 7.339466074059292e-05,
|
||
|
|
"loss": 0.4862,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3395482795209943,
|
||
|
|
"grad_norm": 0.5915616847361488,
|
||
|
|
"learning_rate": 7.331985002272726e-05,
|
||
|
|
"loss": 0.4986,
|
||
|
|
"step": 276
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.344398969228437,
|
||
|
|
"grad_norm": 0.5871009859314463,
|
||
|
|
"learning_rate": 7.324465661857534e-05,
|
||
|
|
"loss": 0.4868,
|
||
|
|
"step": 277
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.34924965893588,
|
||
|
|
"grad_norm": 0.4325881770640798,
|
||
|
|
"learning_rate": 7.316908139175105e-05,
|
||
|
|
"loss": 0.4886,
|
||
|
|
"step": 278
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3541003486433227,
|
||
|
|
"grad_norm": 0.342133338929163,
|
||
|
|
"learning_rate": 7.309312521025356e-05,
|
||
|
|
"loss": 0.4909,
|
||
|
|
"step": 279
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3589510383507655,
|
||
|
|
"grad_norm": 0.4415991996544984,
|
||
|
|
"learning_rate": 7.301678894645742e-05,
|
||
|
|
"loss": 0.4915,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3638017280582082,
|
||
|
|
"grad_norm": 0.510599069072842,
|
||
|
|
"learning_rate": 7.294007347710251e-05,
|
||
|
|
"loss": 0.4935,
|
||
|
|
"step": 281
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.368652417765651,
|
||
|
|
"grad_norm": 0.5220646005978747,
|
||
|
|
"learning_rate": 7.286297968328397e-05,
|
||
|
|
"loss": 0.4834,
|
||
|
|
"step": 282
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.373503107473094,
|
||
|
|
"grad_norm": 0.4836929320122624,
|
||
|
|
"learning_rate": 7.27855084504421e-05,
|
||
|
|
"loss": 0.4953,
|
||
|
|
"step": 283
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3783537971805366,
|
||
|
|
"grad_norm": 0.4678184328008887,
|
||
|
|
"learning_rate": 7.270766066835217e-05,
|
||
|
|
"loss": 0.4872,
|
||
|
|
"step": 284
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3832044868879794,
|
||
|
|
"grad_norm": 0.4479819209137706,
|
||
|
|
"learning_rate": 7.262943723111419e-05,
|
||
|
|
"loss": 0.4916,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.388055176595422,
|
||
|
|
"grad_norm": 0.44661460126650243,
|
||
|
|
"learning_rate": 7.255083903714266e-05,
|
||
|
|
"loss": 0.4866,
|
||
|
|
"step": 286
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3929058663028648,
|
||
|
|
"grad_norm": 0.46700839086853646,
|
||
|
|
"learning_rate": 7.247186698915625e-05,
|
||
|
|
"loss": 0.4879,
|
||
|
|
"step": 287
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3977565560103078,
|
||
|
|
"grad_norm": 0.5111356334340853,
|
||
|
|
"learning_rate": 7.239252199416749e-05,
|
||
|
|
"loss": 0.4812,
|
||
|
|
"step": 288
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4026072457177505,
|
||
|
|
"grad_norm": 0.5198819509993116,
|
||
|
|
"learning_rate": 7.23128049634722e-05,
|
||
|
|
"loss": 0.4809,
|
||
|
|
"step": 289
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4074579354251933,
|
||
|
|
"grad_norm": 0.5071119674882796,
|
||
|
|
"learning_rate": 7.223271681263916e-05,
|
||
|
|
"loss": 0.4839,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.412308625132636,
|
||
|
|
"grad_norm": 0.4871736388487796,
|
||
|
|
"learning_rate": 7.215225846149957e-05,
|
||
|
|
"loss": 0.4899,
|
||
|
|
"step": 291
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4171593148400787,
|
||
|
|
"grad_norm": 0.4644477177967194,
|
||
|
|
"learning_rate": 7.207143083413643e-05,
|
||
|
|
"loss": 0.4865,
|
||
|
|
"step": 292
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4220100045475217,
|
||
|
|
"grad_norm": 0.5305010693129898,
|
||
|
|
"learning_rate": 7.1990234858874e-05,
|
||
|
|
"loss": 0.4876,
|
||
|
|
"step": 293
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4268606942549644,
|
||
|
|
"grad_norm": 0.5381812012245146,
|
||
|
|
"learning_rate": 7.190867146826707e-05,
|
||
|
|
"loss": 0.4936,
|
||
|
|
"step": 294
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4317113839624072,
|
||
|
|
"grad_norm": 0.42926452605809334,
|
||
|
|
"learning_rate": 7.182674159909031e-05,
|
||
|
|
"loss": 0.4845,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.43656207366985,
|
||
|
|
"grad_norm": 0.42959757228135126,
|
||
|
|
"learning_rate": 7.174444619232745e-05,
|
||
|
|
"loss": 0.4952,
|
||
|
|
"step": 296
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4414127633772926,
|
||
|
|
"grad_norm": 0.4994771109955089,
|
||
|
|
"learning_rate": 7.166178619316056e-05,
|
||
|
|
"loss": 0.4912,
|
||
|
|
"step": 297
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4462634530847356,
|
||
|
|
"grad_norm": 0.4676921077663137,
|
||
|
|
"learning_rate": 7.157876255095906e-05,
|
||
|
|
"loss": 0.4875,
|
||
|
|
"step": 298
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4511141427921783,
|
||
|
|
"grad_norm": 0.430209471088095,
|
||
|
|
"learning_rate": 7.149537621926895e-05,
|
||
|
|
"loss": 0.4862,
|
||
|
|
"step": 299
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.455964832499621,
|
||
|
|
"grad_norm": 0.5267214852499816,
|
||
|
|
"learning_rate": 7.14116281558018e-05,
|
||
|
|
"loss": 0.4879,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4608155222070638,
|
||
|
|
"grad_norm": 0.5169308603441447,
|
||
|
|
"learning_rate": 7.132751932242376e-05,
|
||
|
|
"loss": 0.4984,
|
||
|
|
"step": 301
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4656662119145065,
|
||
|
|
"grad_norm": 0.3435076728807633,
|
||
|
|
"learning_rate": 7.124305068514444e-05,
|
||
|
|
"loss": 0.487,
|
||
|
|
"step": 302
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4705169016219495,
|
||
|
|
"grad_norm": 0.2662958717194974,
|
||
|
|
"learning_rate": 7.1158223214106e-05,
|
||
|
|
"loss": 0.4878,
|
||
|
|
"step": 303
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4753675913293922,
|
||
|
|
"grad_norm": 0.31770648802942325,
|
||
|
|
"learning_rate": 7.107303788357177e-05,
|
||
|
|
"loss": 0.4819,
|
||
|
|
"step": 304
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.480218281036835,
|
||
|
|
"grad_norm": 0.29599579651368213,
|
||
|
|
"learning_rate": 7.098749567191527e-05,
|
||
|
|
"loss": 0.4852,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4850689707442777,
|
||
|
|
"grad_norm": 0.3093179833213525,
|
||
|
|
"learning_rate": 7.090159756160886e-05,
|
||
|
|
"loss": 0.4877,
|
||
|
|
"step": 306
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4899196604517204,
|
||
|
|
"grad_norm": 0.38971280696993216,
|
||
|
|
"learning_rate": 7.081534453921242e-05,
|
||
|
|
"loss": 0.4852,
|
||
|
|
"step": 307
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4947703501591634,
|
||
|
|
"grad_norm": 0.3825128078756503,
|
||
|
|
"learning_rate": 7.072873759536217e-05,
|
||
|
|
"loss": 0.4913,
|
||
|
|
"step": 308
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.499621039866606,
|
||
|
|
"grad_norm": 0.2886624346515238,
|
||
|
|
"learning_rate": 7.064177772475912e-05,
|
||
|
|
"loss": 0.4798,
|
||
|
|
"step": 309
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5044717295740488,
|
||
|
|
"grad_norm": 0.3712678777863381,
|
||
|
|
"learning_rate": 7.05544659261578e-05,
|
||
|
|
"loss": 0.4867,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5093224192814916,
|
||
|
|
"grad_norm": 0.42331161544955054,
|
||
|
|
"learning_rate": 7.046680320235466e-05,
|
||
|
|
"loss": 0.4871,
|
||
|
|
"step": 311
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5141731089889343,
|
||
|
|
"grad_norm": 0.4018226592697959,
|
||
|
|
"learning_rate": 7.037879056017663e-05,
|
||
|
|
"loss": 0.4842,
|
||
|
|
"step": 312
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5190237986963773,
|
||
|
|
"grad_norm": 0.3959692369182176,
|
||
|
|
"learning_rate": 7.029042901046952e-05,
|
||
|
|
"loss": 0.4802,
|
||
|
|
"step": 313
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5238744884038198,
|
||
|
|
"grad_norm": 0.4052319536586186,
|
||
|
|
"learning_rate": 7.020171956808645e-05,
|
||
|
|
"loss": 0.4859,
|
||
|
|
"step": 314
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5287251781112627,
|
||
|
|
"grad_norm": 0.40673400305291324,
|
||
|
|
"learning_rate": 7.011266325187615e-05,
|
||
|
|
"loss": 0.496,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5335758678187055,
|
||
|
|
"grad_norm": 0.44258718402573904,
|
||
|
|
"learning_rate": 7.002326108467129e-05,
|
||
|
|
"loss": 0.4864,
|
||
|
|
"step": 316
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5384265575261482,
|
||
|
|
"grad_norm": 0.4022422201093314,
|
||
|
|
"learning_rate": 6.993351409327672e-05,
|
||
|
|
"loss": 0.4763,
|
||
|
|
"step": 317
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5432772472335912,
|
||
|
|
"grad_norm": 0.39107692009497685,
|
||
|
|
"learning_rate": 6.984342330845764e-05,
|
||
|
|
"loss": 0.4952,
|
||
|
|
"step": 318
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5481279369410337,
|
||
|
|
"grad_norm": 0.36401667233363455,
|
||
|
|
"learning_rate": 6.975298976492785e-05,
|
||
|
|
"loss": 0.4952,
|
||
|
|
"step": 319
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5529786266484766,
|
||
|
|
"grad_norm": 0.3964209144895704,
|
||
|
|
"learning_rate": 6.966221450133779e-05,
|
||
|
|
"loss": 0.4901,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5578293163559194,
|
||
|
|
"grad_norm": 0.41002635948711413,
|
||
|
|
"learning_rate": 6.957109856026261e-05,
|
||
|
|
"loss": 0.4917,
|
||
|
|
"step": 321
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.562680006063362,
|
||
|
|
"grad_norm": 0.33043042887382146,
|
||
|
|
"learning_rate": 6.94796429881903e-05,
|
||
|
|
"loss": 0.4771,
|
||
|
|
"step": 322
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.567530695770805,
|
||
|
|
"grad_norm": 0.3258167342621945,
|
||
|
|
"learning_rate": 6.938784883550948e-05,
|
||
|
|
"loss": 0.4889,
|
||
|
|
"step": 323
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5723813854782476,
|
||
|
|
"grad_norm": 0.33864970777234993,
|
||
|
|
"learning_rate": 6.929571715649755e-05,
|
||
|
|
"loss": 0.4866,
|
||
|
|
"step": 324
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5772320751856905,
|
||
|
|
"grad_norm": 0.3799335436470155,
|
||
|
|
"learning_rate": 6.920324900930842e-05,
|
||
|
|
"loss": 0.4907,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5820827648931333,
|
||
|
|
"grad_norm": 0.38649667346458383,
|
||
|
|
"learning_rate": 6.911044545596042e-05,
|
||
|
|
"loss": 0.4854,
|
||
|
|
"step": 326
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.586933454600576,
|
||
|
|
"grad_norm": 0.3332457174533195,
|
||
|
|
"learning_rate": 6.901730756232411e-05,
|
||
|
|
"loss": 0.4895,
|
||
|
|
"step": 327
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.591784144308019,
|
||
|
|
"grad_norm": 0.3688581464993942,
|
||
|
|
"learning_rate": 6.892383639811005e-05,
|
||
|
|
"loss": 0.4958,
|
||
|
|
"step": 328
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5966348340154615,
|
||
|
|
"grad_norm": 0.43127701691860393,
|
||
|
|
"learning_rate": 6.883003303685644e-05,
|
||
|
|
"loss": 0.4844,
|
||
|
|
"step": 329
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6014855237229044,
|
||
|
|
"grad_norm": 0.5098788849460419,
|
||
|
|
"learning_rate": 6.87358985559169e-05,
|
||
|
|
"loss": 0.489,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6063362134303472,
|
||
|
|
"grad_norm": 0.531466303384909,
|
||
|
|
"learning_rate": 6.864143403644797e-05,
|
||
|
|
"loss": 0.4945,
|
||
|
|
"step": 331
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.61118690313779,
|
||
|
|
"grad_norm": 0.4575057116450561,
|
||
|
|
"learning_rate": 6.85466405633968e-05,
|
||
|
|
"loss": 0.4855,
|
||
|
|
"step": 332
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6160375928452326,
|
||
|
|
"grad_norm": 0.42418027914564915,
|
||
|
|
"learning_rate": 6.845151922548865e-05,
|
||
|
|
"loss": 0.4783,
|
||
|
|
"step": 333
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6208882825526754,
|
||
|
|
"grad_norm": 0.38431703300530295,
|
||
|
|
"learning_rate": 6.835607111521439e-05,
|
||
|
|
"loss": 0.4796,
|
||
|
|
"step": 334
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6257389722601183,
|
||
|
|
"grad_norm": 0.325044588754403,
|
||
|
|
"learning_rate": 6.826029732881793e-05,
|
||
|
|
"loss": 0.4928,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.630589661967561,
|
||
|
|
"grad_norm": 0.2723847494193817,
|
||
|
|
"learning_rate": 6.816419896628363e-05,
|
||
|
|
"loss": 0.4851,
|
||
|
|
"step": 336
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6354403516750038,
|
||
|
|
"grad_norm": 0.25325170865464947,
|
||
|
|
"learning_rate": 6.806777713132374e-05,
|
||
|
|
"loss": 0.4826,
|
||
|
|
"step": 337
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6402910413824465,
|
||
|
|
"grad_norm": 0.2793387163645126,
|
||
|
|
"learning_rate": 6.79710329313656e-05,
|
||
|
|
"loss": 0.4873,
|
||
|
|
"step": 338
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6451417310898893,
|
||
|
|
"grad_norm": 0.38835730960557174,
|
||
|
|
"learning_rate": 6.787396747753903e-05,
|
||
|
|
"loss": 0.4744,
|
||
|
|
"step": 339
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6499924207973322,
|
||
|
|
"grad_norm": 0.5896470389457479,
|
||
|
|
"learning_rate": 6.777658188466354e-05,
|
||
|
|
"loss": 0.4765,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.654843110504775,
|
||
|
|
"grad_norm": 0.7717199781637745,
|
||
|
|
"learning_rate": 6.767887727123544e-05,
|
||
|
|
"loss": 0.4931,
|
||
|
|
"step": 341
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6596938002122177,
|
||
|
|
"grad_norm": 1.0491183054565791,
|
||
|
|
"learning_rate": 6.758085475941516e-05,
|
||
|
|
"loss": 0.4875,
|
||
|
|
"step": 342
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6645444899196604,
|
||
|
|
"grad_norm": 1.0963514331569248,
|
||
|
|
"learning_rate": 6.748251547501418e-05,
|
||
|
|
"loss": 0.4783,
|
||
|
|
"step": 343
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6693951796271032,
|
||
|
|
"grad_norm": 0.6008540810802777,
|
||
|
|
"learning_rate": 6.738386054748226e-05,
|
||
|
|
"loss": 0.4836,
|
||
|
|
"step": 344
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.674245869334546,
|
||
|
|
"grad_norm": 0.4448291346545642,
|
||
|
|
"learning_rate": 6.728489110989434e-05,
|
||
|
|
"loss": 0.4883,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6790965590419886,
|
||
|
|
"grad_norm": 0.4942617693141126,
|
||
|
|
"learning_rate": 6.718560829893762e-05,
|
||
|
|
"loss": 0.4799,
|
||
|
|
"step": 346
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6839472487494316,
|
||
|
|
"grad_norm": 0.4996733158915889,
|
||
|
|
"learning_rate": 6.708601325489844e-05,
|
||
|
|
"loss": 0.4872,
|
||
|
|
"step": 347
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6887979384568743,
|
||
|
|
"grad_norm": 0.3715020675799419,
|
||
|
|
"learning_rate": 6.698610712164924e-05,
|
||
|
|
"loss": 0.4864,
|
||
|
|
"step": 348
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.693648628164317,
|
||
|
|
"grad_norm": 0.27977348496462506,
|
||
|
|
"learning_rate": 6.688589104663536e-05,
|
||
|
|
"loss": 0.4731,
|
||
|
|
"step": 349
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.69849931787176,
|
||
|
|
"grad_norm": 0.3329757061053683,
|
||
|
|
"learning_rate": 6.67853661808619e-05,
|
||
|
|
"loss": 0.4771,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7033500075792025,
|
||
|
|
"grad_norm": 0.35849519050184514,
|
||
|
|
"learning_rate": 6.668453367888052e-05,
|
||
|
|
"loss": 0.4867,
|
||
|
|
"step": 351
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7082006972866455,
|
||
|
|
"grad_norm": 0.3093064728603477,
|
||
|
|
"learning_rate": 6.658339469877613e-05,
|
||
|
|
"loss": 0.478,
|
||
|
|
"step": 352
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7130513869940882,
|
||
|
|
"grad_norm": 0.2983932071893944,
|
||
|
|
"learning_rate": 6.64819504021536e-05,
|
||
|
|
"loss": 0.4814,
|
||
|
|
"step": 353
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.717902076701531,
|
||
|
|
"grad_norm": 0.32895472335219694,
|
||
|
|
"learning_rate": 6.638020195412448e-05,
|
||
|
|
"loss": 0.4771,
|
||
|
|
"step": 354
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.722752766408974,
|
||
|
|
"grad_norm": 0.4277599821762847,
|
||
|
|
"learning_rate": 6.627815052329354e-05,
|
||
|
|
"loss": 0.4925,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7276034561164164,
|
||
|
|
"grad_norm": 0.4690652488456737,
|
||
|
|
"learning_rate": 6.617579728174535e-05,
|
||
|
|
"loss": 0.4854,
|
||
|
|
"step": 356
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7324541458238594,
|
||
|
|
"grad_norm": 0.43487992599318925,
|
||
|
|
"learning_rate": 6.60731434050309e-05,
|
||
|
|
"loss": 0.4777,
|
||
|
|
"step": 357
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.737304835531302,
|
||
|
|
"grad_norm": 0.4398837483674929,
|
||
|
|
"learning_rate": 6.597019007215401e-05,
|
||
|
|
"loss": 0.4783,
|
||
|
|
"step": 358
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7421555252387448,
|
||
|
|
"grad_norm": 0.43050412759021583,
|
||
|
|
"learning_rate": 6.586693846555788e-05,
|
||
|
|
"loss": 0.4743,
|
||
|
|
"step": 359
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7470062149461878,
|
||
|
|
"grad_norm": 0.40202174280112624,
|
||
|
|
"learning_rate": 6.576338977111134e-05,
|
||
|
|
"loss": 0.48,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7518569046536303,
|
||
|
|
"grad_norm": 0.36259944082265505,
|
||
|
|
"learning_rate": 6.565954517809543e-05,
|
||
|
|
"loss": 0.4747,
|
||
|
|
"step": 361
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7567075943610733,
|
||
|
|
"grad_norm": 0.28300782787905476,
|
||
|
|
"learning_rate": 6.555540587918968e-05,
|
||
|
|
"loss": 0.4778,
|
||
|
|
"step": 362
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.761558284068516,
|
||
|
|
"grad_norm": 0.27286805994349533,
|
||
|
|
"learning_rate": 6.545097307045831e-05,
|
||
|
|
"loss": 0.4795,
|
||
|
|
"step": 363
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7664089737759587,
|
||
|
|
"grad_norm": 0.31934491384929364,
|
||
|
|
"learning_rate": 6.534624795133662e-05,
|
||
|
|
"loss": 0.4851,
|
||
|
|
"step": 364
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7712596634834017,
|
||
|
|
"grad_norm": 0.35245692182661065,
|
||
|
|
"learning_rate": 6.524123172461711e-05,
|
||
|
|
"loss": 0.4794,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7761103531908442,
|
||
|
|
"grad_norm": 0.3396944161317505,
|
||
|
|
"learning_rate": 6.51359255964358e-05,
|
||
|
|
"loss": 0.4774,
|
||
|
|
"step": 366
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7809610428982872,
|
||
|
|
"grad_norm": 0.2877754553279699,
|
||
|
|
"learning_rate": 6.503033077625824e-05,
|
||
|
|
"loss": 0.4746,
|
||
|
|
"step": 367
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.78581173260573,
|
||
|
|
"grad_norm": 0.3326775146217252,
|
||
|
|
"learning_rate": 6.492444847686566e-05,
|
||
|
|
"loss": 0.4849,
|
||
|
|
"step": 368
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7906624223131726,
|
||
|
|
"grad_norm": 0.419702080624426,
|
||
|
|
"learning_rate": 6.481827991434111e-05,
|
||
|
|
"loss": 0.4814,
|
||
|
|
"step": 369
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7955131120206156,
|
||
|
|
"grad_norm": 0.43780909545325103,
|
||
|
|
"learning_rate": 6.471182630805538e-05,
|
||
|
|
"loss": 0.4813,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.800363801728058,
|
||
|
|
"grad_norm": 0.5080371772111751,
|
||
|
|
"learning_rate": 6.460508888065314e-05,
|
||
|
|
"loss": 0.4865,
|
||
|
|
"step": 371
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.805214491435501,
|
||
|
|
"grad_norm": 0.5541771901573003,
|
||
|
|
"learning_rate": 6.449806885803873e-05,
|
||
|
|
"loss": 0.4752,
|
||
|
|
"step": 372
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8100651811429438,
|
||
|
|
"grad_norm": 0.5077930985424878,
|
||
|
|
"learning_rate": 6.439076746936219e-05,
|
||
|
|
"loss": 0.4776,
|
||
|
|
"step": 373
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8149158708503865,
|
||
|
|
"grad_norm": 0.3974055597915673,
|
||
|
|
"learning_rate": 6.428318594700509e-05,
|
||
|
|
"loss": 0.4833,
|
||
|
|
"step": 374
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8197665605578295,
|
||
|
|
"grad_norm": 0.27838793632255576,
|
||
|
|
"learning_rate": 6.417532552656647e-05,
|
||
|
|
"loss": 0.4808,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.824617250265272,
|
||
|
|
"grad_norm": 0.25531453372215107,
|
||
|
|
"learning_rate": 6.406718744684851e-05,
|
||
|
|
"loss": 0.475,
|
||
|
|
"step": 376
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.829467939972715,
|
||
|
|
"grad_norm": 0.37619306197372576,
|
||
|
|
"learning_rate": 6.395877294984241e-05,
|
||
|
|
"loss": 0.4718,
|
||
|
|
"step": 377
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8343186296801577,
|
||
|
|
"grad_norm": 0.45314622671289495,
|
||
|
|
"learning_rate": 6.385008328071406e-05,
|
||
|
|
"loss": 0.4858,
|
||
|
|
"step": 378
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8391693193876004,
|
||
|
|
"grad_norm": 0.4091697946782105,
|
||
|
|
"learning_rate": 6.374111968778982e-05,
|
||
|
|
"loss": 0.4797,
|
||
|
|
"step": 379
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8440200090950432,
|
||
|
|
"grad_norm": 0.31229108750141443,
|
||
|
|
"learning_rate": 6.363188342254206e-05,
|
||
|
|
"loss": 0.4819,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.848870698802486,
|
||
|
|
"grad_norm": 0.29389799920114323,
|
||
|
|
"learning_rate": 6.352237573957488e-05,
|
||
|
|
"loss": 0.4777,
|
||
|
|
"step": 381
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8537213885099288,
|
||
|
|
"grad_norm": 0.31830405845533455,
|
||
|
|
"learning_rate": 6.341259789660969e-05,
|
||
|
|
"loss": 0.4812,
|
||
|
|
"step": 382
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8585720782173716,
|
||
|
|
"grad_norm": 0.2783419658252336,
|
||
|
|
"learning_rate": 6.330255115447076e-05,
|
||
|
|
"loss": 0.4722,
|
||
|
|
"step": 383
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8634227679248143,
|
||
|
|
"grad_norm": 0.252195902565345,
|
||
|
|
"learning_rate": 6.319223677707069e-05,
|
||
|
|
"loss": 0.4786,
|
||
|
|
"step": 384
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.868273457632257,
|
||
|
|
"grad_norm": 0.24931429544692238,
|
||
|
|
"learning_rate": 6.308165603139598e-05,
|
||
|
|
"loss": 0.4766,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8731241473396998,
|
||
|
|
"grad_norm": 0.343056469344131,
|
||
|
|
"learning_rate": 6.29708101874924e-05,
|
||
|
|
"loss": 0.4746,
|
||
|
|
"step": 386
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8779748370471427,
|
||
|
|
"grad_norm": 0.39163026769861153,
|
||
|
|
"learning_rate": 6.285970051845045e-05,
|
||
|
|
"loss": 0.4777,
|
||
|
|
"step": 387
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8828255267545853,
|
||
|
|
"grad_norm": 0.3925083138992103,
|
||
|
|
"learning_rate": 6.274832830039071e-05,
|
||
|
|
"loss": 0.4762,
|
||
|
|
"step": 388
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8876762164620282,
|
||
|
|
"grad_norm": 0.3908015146094751,
|
||
|
|
"learning_rate": 6.26366948124492e-05,
|
||
|
|
"loss": 0.4882,
|
||
|
|
"step": 389
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.892526906169471,
|
||
|
|
"grad_norm": 0.38768255703534454,
|
||
|
|
"learning_rate": 6.25248013367627e-05,
|
||
|
|
"loss": 0.4746,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8973775958769137,
|
||
|
|
"grad_norm": 0.34797592572865116,
|
||
|
|
"learning_rate": 6.241264915845401e-05,
|
||
|
|
"loss": 0.4863,
|
||
|
|
"step": 391
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9022282855843566,
|
||
|
|
"grad_norm": 0.2939167601152598,
|
||
|
|
"learning_rate": 6.230023956561716e-05,
|
||
|
|
"loss": 0.4803,
|
||
|
|
"step": 392
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9070789752917991,
|
||
|
|
"grad_norm": 0.2683598696324213,
|
||
|
|
"learning_rate": 6.218757384930268e-05,
|
||
|
|
"loss": 0.4769,
|
||
|
|
"step": 393
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.911929664999242,
|
||
|
|
"grad_norm": 0.2963955328357841,
|
||
|
|
"learning_rate": 6.207465330350273e-05,
|
||
|
|
"loss": 0.4798,
|
||
|
|
"step": 394
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9167803547066848,
|
||
|
|
"grad_norm": 0.34936564317737695,
|
||
|
|
"learning_rate": 6.196147922513623e-05,
|
||
|
|
"loss": 0.479,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9216310444141276,
|
||
|
|
"grad_norm": 0.3023947037005716,
|
||
|
|
"learning_rate": 6.184805291403402e-05,
|
||
|
|
"loss": 0.4776,
|
||
|
|
"step": 396
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9264817341215705,
|
||
|
|
"grad_norm": 0.23472105439779495,
|
||
|
|
"learning_rate": 6.173437567292383e-05,
|
||
|
|
"loss": 0.475,
|
||
|
|
"step": 397
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.931332423829013,
|
||
|
|
"grad_norm": 0.30151702935680424,
|
||
|
|
"learning_rate": 6.162044880741544e-05,
|
||
|
|
"loss": 0.4719,
|
||
|
|
"step": 398
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.936183113536456,
|
||
|
|
"grad_norm": 0.4380813843564537,
|
||
|
|
"learning_rate": 6.150627362598557e-05,
|
||
|
|
"loss": 0.4871,
|
||
|
|
"step": 399
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9410338032438987,
|
||
|
|
"grad_norm": 0.5034940901090467,
|
||
|
|
"learning_rate": 6.139185143996298e-05,
|
||
|
|
"loss": 0.4806,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9458844929513415,
|
||
|
|
"grad_norm": 0.44831770290541656,
|
||
|
|
"learning_rate": 6.127718356351326e-05,
|
||
|
|
"loss": 0.478,
|
||
|
|
"step": 401
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9507351826587844,
|
||
|
|
"grad_norm": 0.3881020850233725,
|
||
|
|
"learning_rate": 6.116227131362385e-05,
|
||
|
|
"loss": 0.4714,
|
||
|
|
"step": 402
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.955585872366227,
|
||
|
|
"grad_norm": 0.3382903208345561,
|
||
|
|
"learning_rate": 6.104711601008888e-05,
|
||
|
|
"loss": 0.4779,
|
||
|
|
"step": 403
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.96043656207367,
|
||
|
|
"grad_norm": 0.2830781275191087,
|
||
|
|
"learning_rate": 6.0931718975493985e-05,
|
||
|
|
"loss": 0.4846,
|
||
|
|
"step": 404
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9652872517811126,
|
||
|
|
"grad_norm": 0.27767803820547865,
|
||
|
|
"learning_rate": 6.081608153520117e-05,
|
||
|
|
"loss": 0.4691,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9701379414885554,
|
||
|
|
"grad_norm": 0.35527234014372044,
|
||
|
|
"learning_rate": 6.0700205017333525e-05,
|
||
|
|
"loss": 0.4787,
|
||
|
|
"step": 406
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9749886311959983,
|
||
|
|
"grad_norm": 0.34300192919407774,
|
||
|
|
"learning_rate": 6.058409075276002e-05,
|
||
|
|
"loss": 0.4689,
|
||
|
|
"step": 407
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9798393209034408,
|
||
|
|
"grad_norm": 0.2964726842271146,
|
||
|
|
"learning_rate": 6.046774007508019e-05,
|
||
|
|
"loss": 0.475,
|
||
|
|
"step": 408
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9846900106108838,
|
||
|
|
"grad_norm": 0.2502113324820329,
|
||
|
|
"learning_rate": 6.035115432060883e-05,
|
||
|
|
"loss": 0.4747,
|
||
|
|
"step": 409
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9895407003183265,
|
||
|
|
"grad_norm": 0.23471156048166733,
|
||
|
|
"learning_rate": 6.0234334828360655e-05,
|
||
|
|
"loss": 0.4786,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9943913900257693,
|
||
|
|
"grad_norm": 0.27137410563019304,
|
||
|
|
"learning_rate": 6.011728294003494e-05,
|
||
|
|
"loss": 0.4802,
|
||
|
|
"step": 411
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9992420797332122,
|
||
|
|
"grad_norm": 0.2960970008273601,
|
||
|
|
"learning_rate": 6.000000000000001e-05,
|
||
|
|
"loss": 0.4768,
|
||
|
|
"step": 412
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.004850689707443,
|
||
|
|
"grad_norm": 0.33644070901215145,
|
||
|
|
"learning_rate": 5.988248735527793e-05,
|
||
|
|
"loss": 0.4473,
|
||
|
|
"step": 413
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0097013794148855,
|
||
|
|
"grad_norm": 0.39546749871672404,
|
||
|
|
"learning_rate": 5.9764746355528994e-05,
|
||
|
|
"loss": 0.4501,
|
||
|
|
"step": 414
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0145520691223284,
|
||
|
|
"grad_norm": 0.47967552460987467,
|
||
|
|
"learning_rate": 5.964677835303615e-05,
|
||
|
|
"loss": 0.4483,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.019402758829771,
|
||
|
|
"grad_norm": 0.43081443925289625,
|
||
|
|
"learning_rate": 5.952858470268955e-05,
|
||
|
|
"loss": 0.4468,
|
||
|
|
"step": 416
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.024253448537214,
|
||
|
|
"grad_norm": 0.42692273173821377,
|
||
|
|
"learning_rate": 5.941016676197098e-05,
|
||
|
|
"loss": 0.4499,
|
||
|
|
"step": 417
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.029104138244657,
|
||
|
|
"grad_norm": 0.5203095933335755,
|
||
|
|
"learning_rate": 5.929152589093825e-05,
|
||
|
|
"loss": 0.4498,
|
||
|
|
"step": 418
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0339548279520994,
|
||
|
|
"grad_norm": 0.5390774215298352,
|
||
|
|
"learning_rate": 5.9172663452209554e-05,
|
||
|
|
"loss": 0.449,
|
||
|
|
"step": 419
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0388055176595423,
|
||
|
|
"grad_norm": 0.5618164866931807,
|
||
|
|
"learning_rate": 5.9053580810947845e-05,
|
||
|
|
"loss": 0.4555,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.043656207366985,
|
||
|
|
"grad_norm": 0.6473211411897097,
|
||
|
|
"learning_rate": 5.89342793348452e-05,
|
||
|
|
"loss": 0.4537,
|
||
|
|
"step": 421
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.048506897074428,
|
||
|
|
"grad_norm": 0.7641174380933736,
|
||
|
|
"learning_rate": 5.881476039410699e-05,
|
||
|
|
"loss": 0.4579,
|
||
|
|
"step": 422
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0533575867818707,
|
||
|
|
"grad_norm": 0.6589276143277975,
|
||
|
|
"learning_rate": 5.869502536143629e-05,
|
||
|
|
"loss": 0.4478,
|
||
|
|
"step": 423
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0582082764893133,
|
||
|
|
"grad_norm": 0.4055360078631115,
|
||
|
|
"learning_rate": 5.857507561201802e-05,
|
||
|
|
"loss": 0.4501,
|
||
|
|
"step": 424
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.063058966196756,
|
||
|
|
"grad_norm": 0.3576976481492298,
|
||
|
|
"learning_rate": 5.845491252350312e-05,
|
||
|
|
"loss": 0.4479,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0679096559041987,
|
||
|
|
"grad_norm": 0.444390387775824,
|
||
|
|
"learning_rate": 5.833453747599286e-05,
|
||
|
|
"loss": 0.4466,
|
||
|
|
"step": 426
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0727603456116417,
|
||
|
|
"grad_norm": 0.38706062123939283,
|
||
|
|
"learning_rate": 5.821395185202285e-05,
|
||
|
|
"loss": 0.449,
|
||
|
|
"step": 427
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0776110353190846,
|
||
|
|
"grad_norm": 0.3243650769978332,
|
||
|
|
"learning_rate": 5.809315703654726e-05,
|
||
|
|
"loss": 0.4581,
|
||
|
|
"step": 428
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.082461725026527,
|
||
|
|
"grad_norm": 0.35862489527477903,
|
||
|
|
"learning_rate": 5.797215441692284e-05,
|
||
|
|
"loss": 0.4534,
|
||
|
|
"step": 429
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.08731241473397,
|
||
|
|
"grad_norm": 0.3353344776943914,
|
||
|
|
"learning_rate": 5.785094538289304e-05,
|
||
|
|
"loss": 0.4537,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0921631044414126,
|
||
|
|
"grad_norm": 0.3620686259692887,
|
||
|
|
"learning_rate": 5.772953132657202e-05,
|
||
|
|
"loss": 0.4553,
|
||
|
|
"step": 431
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0970137941488556,
|
||
|
|
"grad_norm": 0.307403523851727,
|
||
|
|
"learning_rate": 5.7607913642428666e-05,
|
||
|
|
"loss": 0.4424,
|
||
|
|
"step": 432
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.101864483856298,
|
||
|
|
"grad_norm": 0.28710201149568576,
|
||
|
|
"learning_rate": 5.7486093727270606e-05,
|
||
|
|
"loss": 0.4462,
|
||
|
|
"step": 433
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.106715173563741,
|
||
|
|
"grad_norm": 0.31451691469735704,
|
||
|
|
"learning_rate": 5.736407298022809e-05,
|
||
|
|
"loss": 0.4434,
|
||
|
|
"step": 434
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.111565863271184,
|
||
|
|
"grad_norm": 0.3105010146819863,
|
||
|
|
"learning_rate": 5.7241852802738e-05,
|
||
|
|
"loss": 0.4533,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1164165529786265,
|
||
|
|
"grad_norm": 0.2878408032383936,
|
||
|
|
"learning_rate": 5.711943459852772e-05,
|
||
|
|
"loss": 0.4427,
|
||
|
|
"step": 436
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1212672426860695,
|
||
|
|
"grad_norm": 0.3026608573456508,
|
||
|
|
"learning_rate": 5.699681977359902e-05,
|
||
|
|
"loss": 0.4385,
|
||
|
|
"step": 437
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1261179323935124,
|
||
|
|
"grad_norm": 0.2857208984253648,
|
||
|
|
"learning_rate": 5.6874009736211896e-05,
|
||
|
|
"loss": 0.4465,
|
||
|
|
"step": 438
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.130968622100955,
|
||
|
|
"grad_norm": 0.28786201316205207,
|
||
|
|
"learning_rate": 5.675100589686839e-05,
|
||
|
|
"loss": 0.4472,
|
||
|
|
"step": 439
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.135819311808398,
|
||
|
|
"grad_norm": 0.3397455432854385,
|
||
|
|
"learning_rate": 5.662780966829646e-05,
|
||
|
|
"loss": 0.4486,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1406700015158404,
|
||
|
|
"grad_norm": 0.3060713829784068,
|
||
|
|
"learning_rate": 5.650442246543364e-05,
|
||
|
|
"loss": 0.4525,
|
||
|
|
"step": 441
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1455206912232834,
|
||
|
|
"grad_norm": 0.2554715156912105,
|
||
|
|
"learning_rate": 5.638084570541088e-05,
|
||
|
|
"loss": 0.4451,
|
||
|
|
"step": 442
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.150371380930726,
|
||
|
|
"grad_norm": 0.31763468046624377,
|
||
|
|
"learning_rate": 5.625708080753621e-05,
|
||
|
|
"loss": 0.455,
|
||
|
|
"step": 443
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.155222070638169,
|
||
|
|
"grad_norm": 0.3288505103534938,
|
||
|
|
"learning_rate": 5.6133129193278525e-05,
|
||
|
|
"loss": 0.4453,
|
||
|
|
"step": 444
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.160072760345612,
|
||
|
|
"grad_norm": 0.27302062189682574,
|
||
|
|
"learning_rate": 5.600899228625112e-05,
|
||
|
|
"loss": 0.4523,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1649234500530543,
|
||
|
|
"grad_norm": 0.2367917713116079,
|
||
|
|
"learning_rate": 5.588467151219549e-05,
|
||
|
|
"loss": 0.4481,
|
||
|
|
"step": 446
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1697741397604973,
|
||
|
|
"grad_norm": 0.27892881990044693,
|
||
|
|
"learning_rate": 5.5760168298964874e-05,
|
||
|
|
"loss": 0.4397,
|
||
|
|
"step": 447
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.17462482946794,
|
||
|
|
"grad_norm": 0.3659363982261399,
|
||
|
|
"learning_rate": 5.563548407650782e-05,
|
||
|
|
"loss": 0.4464,
|
||
|
|
"step": 448
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1794755191753827,
|
||
|
|
"grad_norm": 0.3139974928184246,
|
||
|
|
"learning_rate": 5.551062027685187e-05,
|
||
|
|
"loss": 0.4487,
|
||
|
|
"step": 449
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1843262088828257,
|
||
|
|
"grad_norm": 0.25894377780084493,
|
||
|
|
"learning_rate": 5.5385578334087006e-05,
|
||
|
|
"loss": 0.4481,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.189176898590268,
|
||
|
|
"grad_norm": 0.17951964246489394,
|
||
|
|
"learning_rate": 5.526035968434927e-05,
|
||
|
|
"loss": 0.4469,
|
||
|
|
"step": 451
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.194027588297711,
|
||
|
|
"grad_norm": 0.21554515294483917,
|
||
|
|
"learning_rate": 5.513496576580418e-05,
|
||
|
|
"loss": 0.4573,
|
||
|
|
"step": 452
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1988782780051537,
|
||
|
|
"grad_norm": 0.24799983544152385,
|
||
|
|
"learning_rate": 5.5009398018630276e-05,
|
||
|
|
"loss": 0.4498,
|
||
|
|
"step": 453
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2037289677125966,
|
||
|
|
"grad_norm": 0.186973375547793,
|
||
|
|
"learning_rate": 5.4883657885002575e-05,
|
||
|
|
"loss": 0.4449,
|
||
|
|
"step": 454
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2085796574200396,
|
||
|
|
"grad_norm": 0.1888483688795653,
|
||
|
|
"learning_rate": 5.475774680907597e-05,
|
||
|
|
"loss": 0.443,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.213430347127482,
|
||
|
|
"grad_norm": 0.18914695692387654,
|
||
|
|
"learning_rate": 5.463166623696868e-05,
|
||
|
|
"loss": 0.4434,
|
||
|
|
"step": 456
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.218281036834925,
|
||
|
|
"grad_norm": 0.19070268199404652,
|
||
|
|
"learning_rate": 5.450541761674562e-05,
|
||
|
|
"loss": 0.4445,
|
||
|
|
"step": 457
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2231317265423676,
|
||
|
|
"grad_norm": 0.2259939851217927,
|
||
|
|
"learning_rate": 5.437900239840179e-05,
|
||
|
|
"loss": 0.4465,
|
||
|
|
"step": 458
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2279824162498105,
|
||
|
|
"grad_norm": 0.18023421500987896,
|
||
|
|
"learning_rate": 5.42524220338456e-05,
|
||
|
|
"loss": 0.4453,
|
||
|
|
"step": 459
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2328331059572535,
|
||
|
|
"grad_norm": 0.16598518940751159,
|
||
|
|
"learning_rate": 5.412567797688219e-05,
|
||
|
|
"loss": 0.4498,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.237683795664696,
|
||
|
|
"grad_norm": 0.16004882092407235,
|
||
|
|
"learning_rate": 5.3998771683196754e-05,
|
||
|
|
"loss": 0.4527,
|
||
|
|
"step": 461
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.242534485372139,
|
||
|
|
"grad_norm": 0.17890202722776521,
|
||
|
|
"learning_rate": 5.3871704610337836e-05,
|
||
|
|
"loss": 0.4444,
|
||
|
|
"step": 462
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2473851750795815,
|
||
|
|
"grad_norm": 0.182744670257566,
|
||
|
|
"learning_rate": 5.374447821770053e-05,
|
||
|
|
"loss": 0.4431,
|
||
|
|
"step": 463
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2522358647870244,
|
||
|
|
"grad_norm": 0.19342699163866056,
|
||
|
|
"learning_rate": 5.361709396650977e-05,
|
||
|
|
"loss": 0.4404,
|
||
|
|
"step": 464
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2570865544944674,
|
||
|
|
"grad_norm": 0.20442779398031627,
|
||
|
|
"learning_rate": 5.3489553319803566e-05,
|
||
|
|
"loss": 0.4496,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.26193724420191,
|
||
|
|
"grad_norm": 0.23359410284964036,
|
||
|
|
"learning_rate": 5.336185774241609e-05,
|
||
|
|
"loss": 0.4469,
|
||
|
|
"step": 466
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.266787933909353,
|
||
|
|
"grad_norm": 0.23295417695606166,
|
||
|
|
"learning_rate": 5.3234008700961e-05,
|
||
|
|
"loss": 0.4505,
|
||
|
|
"step": 467
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2716386236167954,
|
||
|
|
"grad_norm": 0.20207094824496044,
|
||
|
|
"learning_rate": 5.3106007663814505e-05,
|
||
|
|
"loss": 0.4406,
|
||
|
|
"step": 468
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2764893133242383,
|
||
|
|
"grad_norm": 0.1850252157104855,
|
||
|
|
"learning_rate": 5.2977856101098484e-05,
|
||
|
|
"loss": 0.4525,
|
||
|
|
"step": 469
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.281340003031681,
|
||
|
|
"grad_norm": 0.1821206965545461,
|
||
|
|
"learning_rate": 5.284955548466371e-05,
|
||
|
|
"loss": 0.4592,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.286190692739124,
|
||
|
|
"grad_norm": 0.1913435003815255,
|
||
|
|
"learning_rate": 5.272110728807279e-05,
|
||
|
|
"loss": 0.4459,
|
||
|
|
"step": 471
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2910413824465667,
|
||
|
|
"grad_norm": 0.17908151714339782,
|
||
|
|
"learning_rate": 5.25925129865834e-05,
|
||
|
|
"loss": 0.4523,
|
||
|
|
"step": 472
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2958920721540093,
|
||
|
|
"grad_norm": 0.17796456682985312,
|
||
|
|
"learning_rate": 5.246377405713121e-05,
|
||
|
|
"loss": 0.4426,
|
||
|
|
"step": 473
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.300742761861452,
|
||
|
|
"grad_norm": 0.16950491734508644,
|
||
|
|
"learning_rate": 5.2334891978313006e-05,
|
||
|
|
"loss": 0.4426,
|
||
|
|
"step": 474
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.305593451568895,
|
||
|
|
"grad_norm": 0.18036359667208995,
|
||
|
|
"learning_rate": 5.220586823036966e-05,
|
||
|
|
"loss": 0.4458,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3104441412763377,
|
||
|
|
"grad_norm": 0.1873060682555774,
|
||
|
|
"learning_rate": 5.207670429516915e-05,
|
||
|
|
"loss": 0.4433,
|
||
|
|
"step": 476
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3152948309837806,
|
||
|
|
"grad_norm": 0.1986452939709168,
|
||
|
|
"learning_rate": 5.1947401656189546e-05,
|
||
|
|
"loss": 0.4593,
|
||
|
|
"step": 477
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.320145520691223,
|
||
|
|
"grad_norm": 0.20494449288937291,
|
||
|
|
"learning_rate": 5.181796179850197e-05,
|
||
|
|
"loss": 0.4424,
|
||
|
|
"step": 478
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.324996210398666,
|
||
|
|
"grad_norm": 0.17952910191793728,
|
||
|
|
"learning_rate": 5.168838620875352e-05,
|
||
|
|
"loss": 0.4503,
|
||
|
|
"step": 479
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3298469001061086,
|
||
|
|
"grad_norm": 0.1909701609269039,
|
||
|
|
"learning_rate": 5.155867637515019e-05,
|
||
|
|
"loss": 0.4506,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3346975898135516,
|
||
|
|
"grad_norm": 0.1838298898954926,
|
||
|
|
"learning_rate": 5.142883378743984e-05,
|
||
|
|
"loss": 0.4513,
|
||
|
|
"step": 481
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3395482795209945,
|
||
|
|
"grad_norm": 0.1818267641683358,
|
||
|
|
"learning_rate": 5.129885993689502e-05,
|
||
|
|
"loss": 0.4488,
|
||
|
|
"step": 482
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.344398969228437,
|
||
|
|
"grad_norm": 0.21197375880432345,
|
||
|
|
"learning_rate": 5.116875631629585e-05,
|
||
|
|
"loss": 0.4456,
|
||
|
|
"step": 483
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.34924965893588,
|
||
|
|
"grad_norm": 0.21240893965447508,
|
||
|
|
"learning_rate": 5.10385244199129e-05,
|
||
|
|
"loss": 0.4386,
|
||
|
|
"step": 484
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.354100348643323,
|
||
|
|
"grad_norm": 0.19244612255162405,
|
||
|
|
"learning_rate": 5.0908165743490047e-05,
|
||
|
|
"loss": 0.4482,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3589510383507655,
|
||
|
|
"grad_norm": 0.22440529731925618,
|
||
|
|
"learning_rate": 5.0777681784227224e-05,
|
||
|
|
"loss": 0.4496,
|
||
|
|
"step": 486
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3638017280582084,
|
||
|
|
"grad_norm": 0.249440062974833,
|
||
|
|
"learning_rate": 5.064707404076327e-05,
|
||
|
|
"loss": 0.4502,
|
||
|
|
"step": 487
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.368652417765651,
|
||
|
|
"grad_norm": 0.2374206142112278,
|
||
|
|
"learning_rate": 5.051634401315875e-05,
|
||
|
|
"loss": 0.448,
|
||
|
|
"step": 488
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.373503107473094,
|
||
|
|
"grad_norm": 0.21044332969367502,
|
||
|
|
"learning_rate": 5.0385493202878656e-05,
|
||
|
|
"loss": 0.4416,
|
||
|
|
"step": 489
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3783537971805364,
|
||
|
|
"grad_norm": 0.15343545111269605,
|
||
|
|
"learning_rate": 5.025452311277522e-05,
|
||
|
|
"loss": 0.4413,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3832044868879794,
|
||
|
|
"grad_norm": 0.17472771019103053,
|
||
|
|
"learning_rate": 5.01234352470706e-05,
|
||
|
|
"loss": 0.4472,
|
||
|
|
"step": 491
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3880551765954223,
|
||
|
|
"grad_norm": 0.2225509747823868,
|
||
|
|
"learning_rate": 4.999223111133968e-05,
|
||
|
|
"loss": 0.4405,
|
||
|
|
"step": 492
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.392905866302865,
|
||
|
|
"grad_norm": 0.27110633436791925,
|
||
|
|
"learning_rate": 4.986091221249269e-05,
|
||
|
|
"loss": 0.44,
|
||
|
|
"step": 493
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.397756556010308,
|
||
|
|
"grad_norm": 0.24255464597168586,
|
||
|
|
"learning_rate": 4.972948005875796e-05,
|
||
|
|
"loss": 0.4432,
|
||
|
|
"step": 494
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4026072457177503,
|
||
|
|
"grad_norm": 0.2496648842091371,
|
||
|
|
"learning_rate": 4.959793615966459e-05,
|
||
|
|
"loss": 0.4401,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4074579354251933,
|
||
|
|
"grad_norm": 0.24806426439634907,
|
||
|
|
"learning_rate": 4.946628202602508e-05,
|
||
|
|
"loss": 0.4526,
|
||
|
|
"step": 496
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.412308625132636,
|
||
|
|
"grad_norm": 0.21808090914084832,
|
||
|
|
"learning_rate": 4.933451916991802e-05,
|
||
|
|
"loss": 0.4474,
|
||
|
|
"step": 497
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4171593148400787,
|
||
|
|
"grad_norm": 0.19833835766366836,
|
||
|
|
"learning_rate": 4.920264910467066e-05,
|
||
|
|
"loss": 0.4485,
|
||
|
|
"step": 498
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4220100045475217,
|
||
|
|
"grad_norm": 0.1904056029579938,
|
||
|
|
"learning_rate": 4.9070673344841645e-05,
|
||
|
|
"loss": 0.4471,
|
||
|
|
"step": 499
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.426860694254964,
|
||
|
|
"grad_norm": 0.17821880940044135,
|
||
|
|
"learning_rate": 4.893859340620348e-05,
|
||
|
|
"loss": 0.4518,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.431711383962407,
|
||
|
|
"grad_norm": 0.16242846601925154,
|
||
|
|
"learning_rate": 4.880641080572522e-05,
|
||
|
|
"loss": 0.4426,
|
||
|
|
"step": 501
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.43656207366985,
|
||
|
|
"grad_norm": 0.16230843192633562,
|
||
|
|
"learning_rate": 4.8674127061555025e-05,
|
||
|
|
"loss": 0.4492,
|
||
|
|
"step": 502
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4414127633772926,
|
||
|
|
"grad_norm": 0.18692985874064466,
|
||
|
|
"learning_rate": 4.8541743693002676e-05,
|
||
|
|
"loss": 0.4576,
|
||
|
|
"step": 503
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4462634530847356,
|
||
|
|
"grad_norm": 0.17489245993778632,
|
||
|
|
"learning_rate": 4.8409262220522196e-05,
|
||
|
|
"loss": 0.4476,
|
||
|
|
"step": 504
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.451114142792178,
|
||
|
|
"grad_norm": 0.14061959670906948,
|
||
|
|
"learning_rate": 4.8276684165694336e-05,
|
||
|
|
"loss": 0.4479,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.455964832499621,
|
||
|
|
"grad_norm": 0.17289206898304424,
|
||
|
|
"learning_rate": 4.814401105120914e-05,
|
||
|
|
"loss": 0.4479,
|
||
|
|
"step": 506
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.460815522207064,
|
||
|
|
"grad_norm": 0.21063084112901795,
|
||
|
|
"learning_rate": 4.8011244400848414e-05,
|
||
|
|
"loss": 0.4466,
|
||
|
|
"step": 507
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4656662119145065,
|
||
|
|
"grad_norm": 0.2134287283629687,
|
||
|
|
"learning_rate": 4.787838573946825e-05,
|
||
|
|
"loss": 0.4503,
|
||
|
|
"step": 508
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4705169016219495,
|
||
|
|
"grad_norm": 0.19387557882251144,
|
||
|
|
"learning_rate": 4.774543659298152e-05,
|
||
|
|
"loss": 0.4419,
|
||
|
|
"step": 509
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.475367591329392,
|
||
|
|
"grad_norm": 0.1690053079886072,
|
||
|
|
"learning_rate": 4.761239848834031e-05,
|
||
|
|
"loss": 0.4443,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.480218281036835,
|
||
|
|
"grad_norm": 0.17406180900609755,
|
||
|
|
"learning_rate": 4.747927295351845e-05,
|
||
|
|
"loss": 0.4474,
|
||
|
|
"step": 511
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.485068970744278,
|
||
|
|
"grad_norm": 0.2024050850623432,
|
||
|
|
"learning_rate": 4.734606151749389e-05,
|
||
|
|
"loss": 0.4473,
|
||
|
|
"step": 512
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4899196604517204,
|
||
|
|
"grad_norm": 0.22659357369802574,
|
||
|
|
"learning_rate": 4.7212765710231204e-05,
|
||
|
|
"loss": 0.4481,
|
||
|
|
"step": 513
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4947703501591634,
|
||
|
|
"grad_norm": 0.23595309939097722,
|
||
|
|
"learning_rate": 4.707938706266397e-05,
|
||
|
|
"loss": 0.4484,
|
||
|
|
"step": 514
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.499621039866606,
|
||
|
|
"grad_norm": 0.19918584163751257,
|
||
|
|
"learning_rate": 4.694592710667723e-05,
|
||
|
|
"loss": 0.444,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.504471729574049,
|
||
|
|
"grad_norm": 0.18418670752131802,
|
||
|
|
"learning_rate": 4.681238737508983e-05,
|
||
|
|
"loss": 0.4424,
|
||
|
|
"step": 516
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5093224192814914,
|
||
|
|
"grad_norm": 0.1926237495649244,
|
||
|
|
"learning_rate": 4.6678769401636894e-05,
|
||
|
|
"loss": 0.4444,
|
||
|
|
"step": 517
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5141731089889343,
|
||
|
|
"grad_norm": 0.20706125086296728,
|
||
|
|
"learning_rate": 4.6545074720952166e-05,
|
||
|
|
"loss": 0.456,
|
||
|
|
"step": 518
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5190237986963773,
|
||
|
|
"grad_norm": 0.17699664563372686,
|
||
|
|
"learning_rate": 4.641130486855038e-05,
|
||
|
|
"loss": 0.4396,
|
||
|
|
"step": 519
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.52387448840382,
|
||
|
|
"grad_norm": 0.18317752602670304,
|
||
|
|
"learning_rate": 4.627746138080966e-05,
|
||
|
|
"loss": 0.4432,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5287251781112627,
|
||
|
|
"grad_norm": 0.2190424482227647,
|
||
|
|
"learning_rate": 4.614354579495379e-05,
|
||
|
|
"loss": 0.4448,
|
||
|
|
"step": 521
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5335758678187057,
|
||
|
|
"grad_norm": 0.20135719119048615,
|
||
|
|
"learning_rate": 4.6009559649034695e-05,
|
||
|
|
"loss": 0.4432,
|
||
|
|
"step": 522
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.538426557526148,
|
||
|
|
"grad_norm": 0.20782420010728125,
|
||
|
|
"learning_rate": 4.587550448191465e-05,
|
||
|
|
"loss": 0.4474,
|
||
|
|
"step": 523
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.543277247233591,
|
||
|
|
"grad_norm": 0.18668296726800496,
|
||
|
|
"learning_rate": 4.5741381833248655e-05,
|
||
|
|
"loss": 0.455,
|
||
|
|
"step": 524
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5481279369410337,
|
||
|
|
"grad_norm": 0.17935132627421838,
|
||
|
|
"learning_rate": 4.560719324346677e-05,
|
||
|
|
"loss": 0.4457,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5529786266484766,
|
||
|
|
"grad_norm": 0.16835981784522308,
|
||
|
|
"learning_rate": 4.547294025375641e-05,
|
||
|
|
"loss": 0.4478,
|
||
|
|
"step": 526
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.557829316355919,
|
||
|
|
"grad_norm": 0.18084487512355504,
|
||
|
|
"learning_rate": 4.533862440604461e-05,
|
||
|
|
"loss": 0.447,
|
||
|
|
"step": 527
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.562680006063362,
|
||
|
|
"grad_norm": 0.17384784743298828,
|
||
|
|
"learning_rate": 4.520424724298036e-05,
|
||
|
|
"loss": 0.4408,
|
||
|
|
"step": 528
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.567530695770805,
|
||
|
|
"grad_norm": 0.20150460275113774,
|
||
|
|
"learning_rate": 4.5069810307916874e-05,
|
||
|
|
"loss": 0.4441,
|
||
|
|
"step": 529
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5723813854782476,
|
||
|
|
"grad_norm": 0.2240004858996321,
|
||
|
|
"learning_rate": 4.493531514489385e-05,
|
||
|
|
"loss": 0.4425,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5772320751856905,
|
||
|
|
"grad_norm": 0.2286831099325836,
|
||
|
|
"learning_rate": 4.480076329861977e-05,
|
||
|
|
"loss": 0.4433,
|
||
|
|
"step": 531
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5820827648931335,
|
||
|
|
"grad_norm": 0.2048648110357608,
|
||
|
|
"learning_rate": 4.46661563144541e-05,
|
||
|
|
"loss": 0.4487,
|
||
|
|
"step": 532
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.586933454600576,
|
||
|
|
"grad_norm": 0.2121488428415987,
|
||
|
|
"learning_rate": 4.453149573838962e-05,
|
||
|
|
"loss": 0.4445,
|
||
|
|
"step": 533
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.591784144308019,
|
||
|
|
"grad_norm": 0.18541474138380978,
|
||
|
|
"learning_rate": 4.43967831170346e-05,
|
||
|
|
"loss": 0.4494,
|
||
|
|
"step": 534
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5966348340154615,
|
||
|
|
"grad_norm": 0.17452762694525445,
|
||
|
|
"learning_rate": 4.426201999759505e-05,
|
||
|
|
"loss": 0.4484,
|
||
|
|
"step": 535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6014855237229044,
|
||
|
|
"grad_norm": 0.21944224068377363,
|
||
|
|
"learning_rate": 4.4127207927857e-05,
|
||
|
|
"loss": 0.4419,
|
||
|
|
"step": 536
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.606336213430347,
|
||
|
|
"grad_norm": 0.17656272332454842,
|
||
|
|
"learning_rate": 4.3992348456168666e-05,
|
||
|
|
"loss": 0.4568,
|
||
|
|
"step": 537
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.61118690313779,
|
||
|
|
"grad_norm": 0.1892562653364182,
|
||
|
|
"learning_rate": 4.385744313142267e-05,
|
||
|
|
"loss": 0.4427,
|
||
|
|
"step": 538
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.616037592845233,
|
||
|
|
"grad_norm": 0.21611454670373548,
|
||
|
|
"learning_rate": 4.372249350303828e-05,
|
||
|
|
"loss": 0.4418,
|
||
|
|
"step": 539
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6208882825526754,
|
||
|
|
"grad_norm": 0.17168747953224547,
|
||
|
|
"learning_rate": 4.358750112094363e-05,
|
||
|
|
"loss": 0.4544,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6257389722601183,
|
||
|
|
"grad_norm": 0.17941819138400728,
|
||
|
|
"learning_rate": 4.3452467535557846e-05,
|
||
|
|
"loss": 0.4372,
|
||
|
|
"step": 541
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6305896619675613,
|
||
|
|
"grad_norm": 0.2025265834742146,
|
||
|
|
"learning_rate": 4.3317394297773304e-05,
|
||
|
|
"loss": 0.4517,
|
||
|
|
"step": 542
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.635440351675004,
|
||
|
|
"grad_norm": 0.20441246530938206,
|
||
|
|
"learning_rate": 4.3182282958937816e-05,
|
||
|
|
"loss": 0.4333,
|
||
|
|
"step": 543
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6402910413824463,
|
||
|
|
"grad_norm": 0.2334105452950634,
|
||
|
|
"learning_rate": 4.304713507083673e-05,
|
||
|
|
"loss": 0.4481,
|
||
|
|
"step": 544
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6451417310898893,
|
||
|
|
"grad_norm": 0.26291969340773214,
|
||
|
|
"learning_rate": 4.291195218567523e-05,
|
||
|
|
"loss": 0.4466,
|
||
|
|
"step": 545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.649992420797332,
|
||
|
|
"grad_norm": 0.1863631298156993,
|
||
|
|
"learning_rate": 4.277673585606046e-05,
|
||
|
|
"loss": 0.4405,
|
||
|
|
"step": 546
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6548431105047747,
|
||
|
|
"grad_norm": 0.23226855973797117,
|
||
|
|
"learning_rate": 4.264148763498364e-05,
|
||
|
|
"loss": 0.4566,
|
||
|
|
"step": 547
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6596938002122177,
|
||
|
|
"grad_norm": 0.30482274820740174,
|
||
|
|
"learning_rate": 4.250620907580226e-05,
|
||
|
|
"loss": 0.4407,
|
||
|
|
"step": 548
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6645444899196606,
|
||
|
|
"grad_norm": 0.23781311620065457,
|
||
|
|
"learning_rate": 4.237090173222231e-05,
|
||
|
|
"loss": 0.4493,
|
||
|
|
"step": 549
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.669395179627103,
|
||
|
|
"grad_norm": 0.1808214801234254,
|
||
|
|
"learning_rate": 4.223556715828033e-05,
|
||
|
|
"loss": 0.4511,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.674245869334546,
|
||
|
|
"grad_norm": 0.26315804734468673,
|
||
|
|
"learning_rate": 4.2100206908325603e-05,
|
||
|
|
"loss": 0.447,
|
||
|
|
"step": 551
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6790965590419886,
|
||
|
|
"grad_norm": 0.25781234163394623,
|
||
|
|
"learning_rate": 4.196482253700235e-05,
|
||
|
|
"loss": 0.4415,
|
||
|
|
"step": 552
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6839472487494316,
|
||
|
|
"grad_norm": 0.17133762584152984,
|
||
|
|
"learning_rate": 4.182941559923179e-05,
|
||
|
|
"loss": 0.4457,
|
||
|
|
"step": 553
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.688797938456874,
|
||
|
|
"grad_norm": 0.2266803612041648,
|
||
|
|
"learning_rate": 4.169398765019433e-05,
|
||
|
|
"loss": 0.4422,
|
||
|
|
"step": 554
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.693648628164317,
|
||
|
|
"grad_norm": 0.23286738752123257,
|
||
|
|
"learning_rate": 4.15585402453117e-05,
|
||
|
|
"loss": 0.4429,
|
||
|
|
"step": 555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.69849931787176,
|
||
|
|
"grad_norm": 0.20226496811604636,
|
||
|
|
"learning_rate": 4.14230749402291e-05,
|
||
|
|
"loss": 0.4421,
|
||
|
|
"step": 556
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7033500075792025,
|
||
|
|
"grad_norm": 0.21746634743317236,
|
||
|
|
"learning_rate": 4.128759329079732e-05,
|
||
|
|
"loss": 0.4318,
|
||
|
|
"step": 557
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7082006972866455,
|
||
|
|
"grad_norm": 0.24285493960537577,
|
||
|
|
"learning_rate": 4.115209685305482e-05,
|
||
|
|
"loss": 0.4374,
|
||
|
|
"step": 558
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7130513869940884,
|
||
|
|
"grad_norm": 0.20035101285126697,
|
||
|
|
"learning_rate": 4.101658718320998e-05,
|
||
|
|
"loss": 0.4429,
|
||
|
|
"step": 559
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.717902076701531,
|
||
|
|
"grad_norm": 0.1733102653989901,
|
||
|
|
"learning_rate": 4.088106583762309e-05,
|
||
|
|
"loss": 0.4456,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.722752766408974,
|
||
|
|
"grad_norm": 0.25116764609287723,
|
||
|
|
"learning_rate": 4.074553437278857e-05,
|
||
|
|
"loss": 0.4494,
|
||
|
|
"step": 561
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7276034561164164,
|
||
|
|
"grad_norm": 0.19896329775589092,
|
||
|
|
"learning_rate": 4.060999434531704e-05,
|
||
|
|
"loss": 0.4449,
|
||
|
|
"step": 562
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7324541458238594,
|
||
|
|
"grad_norm": 0.16517536428811208,
|
||
|
|
"learning_rate": 4.047444731191751e-05,
|
||
|
|
"loss": 0.4426,
|
||
|
|
"step": 563
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.737304835531302,
|
||
|
|
"grad_norm": 0.1656807626865065,
|
||
|
|
"learning_rate": 4.033889482937943e-05,
|
||
|
|
"loss": 0.4445,
|
||
|
|
"step": 564
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.742155525238745,
|
||
|
|
"grad_norm": 0.15103159619749504,
|
||
|
|
"learning_rate": 4.020333845455478e-05,
|
||
|
|
"loss": 0.4565,
|
||
|
|
"step": 565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.747006214946188,
|
||
|
|
"grad_norm": 0.16996820086522443,
|
||
|
|
"learning_rate": 4.0067779744340345e-05,
|
||
|
|
"loss": 0.4459,
|
||
|
|
"step": 566
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7518569046536303,
|
||
|
|
"grad_norm": 0.1495970266083701,
|
||
|
|
"learning_rate": 3.993222025565966e-05,
|
||
|
|
"loss": 0.4447,
|
||
|
|
"step": 567
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7567075943610733,
|
||
|
|
"grad_norm": 0.15458974892236554,
|
||
|
|
"learning_rate": 3.979666154544522e-05,
|
||
|
|
"loss": 0.4452,
|
||
|
|
"step": 568
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7615582840685162,
|
||
|
|
"grad_norm": 0.1741093401099396,
|
||
|
|
"learning_rate": 3.96611051706206e-05,
|
||
|
|
"loss": 0.4421,
|
||
|
|
"step": 569
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7664089737759587,
|
||
|
|
"grad_norm": 0.1819530197226333,
|
||
|
|
"learning_rate": 3.9525552688082494e-05,
|
||
|
|
"loss": 0.4509,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7712596634834017,
|
||
|
|
"grad_norm": 0.14996389947080183,
|
||
|
|
"learning_rate": 3.939000565468297e-05,
|
||
|
|
"loss": 0.4442,
|
||
|
|
"step": 571
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.776110353190844,
|
||
|
|
"grad_norm": 0.19015205226216172,
|
||
|
|
"learning_rate": 3.9254465627211444e-05,
|
||
|
|
"loss": 0.4458,
|
||
|
|
"step": 572
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.780961042898287,
|
||
|
|
"grad_norm": 0.2147271939320094,
|
||
|
|
"learning_rate": 3.911893416237693e-05,
|
||
|
|
"loss": 0.4423,
|
||
|
|
"step": 573
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7858117326057297,
|
||
|
|
"grad_norm": 0.17368493707493848,
|
||
|
|
"learning_rate": 3.8983412816790045e-05,
|
||
|
|
"loss": 0.4415,
|
||
|
|
"step": 574
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7906624223131726,
|
||
|
|
"grad_norm": 0.18366782229193682,
|
||
|
|
"learning_rate": 3.8847903146945186e-05,
|
||
|
|
"loss": 0.4419,
|
||
|
|
"step": 575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7955131120206156,
|
||
|
|
"grad_norm": 0.1770373720928735,
|
||
|
|
"learning_rate": 3.871240670920269e-05,
|
||
|
|
"loss": 0.4477,
|
||
|
|
"step": 576
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.800363801728058,
|
||
|
|
"grad_norm": 0.15508137985676013,
|
||
|
|
"learning_rate": 3.85769250597709e-05,
|
||
|
|
"loss": 0.4458,
|
||
|
|
"step": 577
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.805214491435501,
|
||
|
|
"grad_norm": 0.15518418619016236,
|
||
|
|
"learning_rate": 3.844145975468832e-05,
|
||
|
|
"loss": 0.4403,
|
||
|
|
"step": 578
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.810065181142944,
|
||
|
|
"grad_norm": 0.13016021632650948,
|
||
|
|
"learning_rate": 3.830601234980569e-05,
|
||
|
|
"loss": 0.4509,
|
||
|
|
"step": 579
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8149158708503865,
|
||
|
|
"grad_norm": 0.17125198843453068,
|
||
|
|
"learning_rate": 3.8170584400768224e-05,
|
||
|
|
"loss": 0.4492,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8197665605578295,
|
||
|
|
"grad_norm": 0.18182599605587274,
|
||
|
|
"learning_rate": 3.8035177462997664e-05,
|
||
|
|
"loss": 0.4475,
|
||
|
|
"step": 581
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.824617250265272,
|
||
|
|
"grad_norm": 0.16612208849559923,
|
||
|
|
"learning_rate": 3.7899793091674396e-05,
|
||
|
|
"loss": 0.4419,
|
||
|
|
"step": 582
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.829467939972715,
|
||
|
|
"grad_norm": 0.15504382692612345,
|
||
|
|
"learning_rate": 3.776443284171969e-05,
|
||
|
|
"loss": 0.4421,
|
||
|
|
"step": 583
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8343186296801575,
|
||
|
|
"grad_norm": 0.14158493240403466,
|
||
|
|
"learning_rate": 3.7629098267777706e-05,
|
||
|
|
"loss": 0.4399,
|
||
|
|
"step": 584
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8391693193876004,
|
||
|
|
"grad_norm": 0.14521790840725082,
|
||
|
|
"learning_rate": 3.7493790924197746e-05,
|
||
|
|
"loss": 0.4328,
|
||
|
|
"step": 585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8440200090950434,
|
||
|
|
"grad_norm": 0.14223117063886642,
|
||
|
|
"learning_rate": 3.735851236501637e-05,
|
||
|
|
"loss": 0.4403,
|
||
|
|
"step": 586
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.848870698802486,
|
||
|
|
"grad_norm": 0.13716153813556975,
|
||
|
|
"learning_rate": 3.722326414393954e-05,
|
||
|
|
"loss": 0.4375,
|
||
|
|
"step": 587
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.853721388509929,
|
||
|
|
"grad_norm": 0.13481835773066367,
|
||
|
|
"learning_rate": 3.708804781432478e-05,
|
||
|
|
"loss": 0.4465,
|
||
|
|
"step": 588
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.858572078217372,
|
||
|
|
"grad_norm": 0.15098861155914894,
|
||
|
|
"learning_rate": 3.6952864929163286e-05,
|
||
|
|
"loss": 0.4478,
|
||
|
|
"step": 589
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8634227679248143,
|
||
|
|
"grad_norm": 0.14428029096945075,
|
||
|
|
"learning_rate": 3.6817717041062204e-05,
|
||
|
|
"loss": 0.4433,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.868273457632257,
|
||
|
|
"grad_norm": 0.14941685817838865,
|
||
|
|
"learning_rate": 3.66826057022267e-05,
|
||
|
|
"loss": 0.4426,
|
||
|
|
"step": 591
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8731241473397,
|
||
|
|
"grad_norm": 0.15814915640950794,
|
||
|
|
"learning_rate": 3.654753246444217e-05,
|
||
|
|
"loss": 0.437,
|
||
|
|
"step": 592
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8779748370471427,
|
||
|
|
"grad_norm": 0.1666350361712344,
|
||
|
|
"learning_rate": 3.641249887905638e-05,
|
||
|
|
"loss": 0.4404,
|
||
|
|
"step": 593
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8828255267545853,
|
||
|
|
"grad_norm": 0.14761581189993908,
|
||
|
|
"learning_rate": 3.627750649696173e-05,
|
||
|
|
"loss": 0.4418,
|
||
|
|
"step": 594
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.887676216462028,
|
||
|
|
"grad_norm": 0.16433140815176697,
|
||
|
|
"learning_rate": 3.614255686857734e-05,
|
||
|
|
"loss": 0.4482,
|
||
|
|
"step": 595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.892526906169471,
|
||
|
|
"grad_norm": 0.14898117787478918,
|
||
|
|
"learning_rate": 3.600765154383134e-05,
|
||
|
|
"loss": 0.4407,
|
||
|
|
"step": 596
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8973775958769137,
|
||
|
|
"grad_norm": 0.1576311183401583,
|
||
|
|
"learning_rate": 3.587279207214301e-05,
|
||
|
|
"loss": 0.4502,
|
||
|
|
"step": 597
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9022282855843566,
|
||
|
|
"grad_norm": 0.12713739329535242,
|
||
|
|
"learning_rate": 3.5737980002404965e-05,
|
||
|
|
"loss": 0.4504,
|
||
|
|
"step": 598
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.907078975291799,
|
||
|
|
"grad_norm": 0.1407162797770253,
|
||
|
|
"learning_rate": 3.5603216882965415e-05,
|
||
|
|
"loss": 0.4444,
|
||
|
|
"step": 599
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.911929664999242,
|
||
|
|
"grad_norm": 0.15096239269549241,
|
||
|
|
"learning_rate": 3.5468504261610387e-05,
|
||
|
|
"loss": 0.4497,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9167803547066846,
|
||
|
|
"grad_norm": 0.1341187523707445,
|
||
|
|
"learning_rate": 3.5333843685545914e-05,
|
||
|
|
"loss": 0.4449,
|
||
|
|
"step": 601
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9216310444141276,
|
||
|
|
"grad_norm": 0.1619335943958044,
|
||
|
|
"learning_rate": 3.519923670138025e-05,
|
||
|
|
"loss": 0.4434,
|
||
|
|
"step": 602
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9264817341215705,
|
||
|
|
"grad_norm": 0.16211928956809254,
|
||
|
|
"learning_rate": 3.506468485510616e-05,
|
||
|
|
"loss": 0.4394,
|
||
|
|
"step": 603
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.931332423829013,
|
||
|
|
"grad_norm": 0.13565985896740923,
|
||
|
|
"learning_rate": 3.493018969208314e-05,
|
||
|
|
"loss": 0.4513,
|
||
|
|
"step": 604
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.936183113536456,
|
||
|
|
"grad_norm": 0.14551792578562125,
|
||
|
|
"learning_rate": 3.479575275701965e-05,
|
||
|
|
"loss": 0.4425,
|
||
|
|
"step": 605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.941033803243899,
|
||
|
|
"grad_norm": 0.17680002225930672,
|
||
|
|
"learning_rate": 3.4661375593955405e-05,
|
||
|
|
"loss": 0.4384,
|
||
|
|
"step": 606
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9458844929513415,
|
||
|
|
"grad_norm": 0.16447736465809054,
|
||
|
|
"learning_rate": 3.45270597462436e-05,
|
||
|
|
"loss": 0.4451,
|
||
|
|
"step": 607
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9507351826587844,
|
||
|
|
"grad_norm": 0.15299942108774958,
|
||
|
|
"learning_rate": 3.4392806756533233e-05,
|
||
|
|
"loss": 0.4383,
|
||
|
|
"step": 608
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.955585872366227,
|
||
|
|
"grad_norm": 0.1536623395643003,
|
||
|
|
"learning_rate": 3.425861816675135e-05,
|
||
|
|
"loss": 0.4453,
|
||
|
|
"step": 609
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.96043656207367,
|
||
|
|
"grad_norm": 0.15727509499916784,
|
||
|
|
"learning_rate": 3.4124495518085366e-05,
|
||
|
|
"loss": 0.436,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9652872517811124,
|
||
|
|
"grad_norm": 0.17906600408772821,
|
||
|
|
"learning_rate": 3.399044035096532e-05,
|
||
|
|
"loss": 0.4467,
|
||
|
|
"step": 611
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9701379414885554,
|
||
|
|
"grad_norm": 0.16243556143131102,
|
||
|
|
"learning_rate": 3.3856454205046223e-05,
|
||
|
|
"loss": 0.4364,
|
||
|
|
"step": 612
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9749886311959983,
|
||
|
|
"grad_norm": 0.1608114743453715,
|
||
|
|
"learning_rate": 3.372253861919036e-05,
|
||
|
|
"loss": 0.4517,
|
||
|
|
"step": 613
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.979839320903441,
|
||
|
|
"grad_norm": 0.16249676695986184,
|
||
|
|
"learning_rate": 3.3588695131449626e-05,
|
||
|
|
"loss": 0.4464,
|
||
|
|
"step": 614
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.984690010610884,
|
||
|
|
"grad_norm": 0.14682205641070967,
|
||
|
|
"learning_rate": 3.3454925279047854e-05,
|
||
|
|
"loss": 0.4446,
|
||
|
|
"step": 615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9895407003183267,
|
||
|
|
"grad_norm": 0.158750060737996,
|
||
|
|
"learning_rate": 3.3321230598363126e-05,
|
||
|
|
"loss": 0.4449,
|
||
|
|
"step": 616
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9943913900257693,
|
||
|
|
"grad_norm": 0.15103569523913019,
|
||
|
|
"learning_rate": 3.3187612624910185e-05,
|
||
|
|
"loss": 0.4457,
|
||
|
|
"step": 617
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.999242079733212,
|
||
|
|
"grad_norm": 0.13279281722008607,
|
||
|
|
"learning_rate": 3.305407289332279e-05,
|
||
|
|
"loss": 0.4524,
|
||
|
|
"step": 618
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.004850689707443,
|
||
|
|
"grad_norm": 0.21786343264375474,
|
||
|
|
"learning_rate": 3.2920612937336035e-05,
|
||
|
|
"loss": 0.4186,
|
||
|
|
"step": 619
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0097013794148855,
|
||
|
|
"grad_norm": 0.1940657522375173,
|
||
|
|
"learning_rate": 3.2787234289768816e-05,
|
||
|
|
"loss": 0.4198,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0145520691223284,
|
||
|
|
"grad_norm": 0.21773031084989108,
|
||
|
|
"learning_rate": 3.2653938482506125e-05,
|
||
|
|
"loss": 0.4257,
|
||
|
|
"step": 621
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.019402758829771,
|
||
|
|
"grad_norm": 0.20326547030411232,
|
||
|
|
"learning_rate": 3.252072704648157e-05,
|
||
|
|
"loss": 0.4165,
|
||
|
|
"step": 622
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.024253448537214,
|
||
|
|
"grad_norm": 0.1952882443077655,
|
||
|
|
"learning_rate": 3.2387601511659695e-05,
|
||
|
|
"loss": 0.4099,
|
||
|
|
"step": 623
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.029104138244657,
|
||
|
|
"grad_norm": 0.1918042514566916,
|
||
|
|
"learning_rate": 3.22545634070185e-05,
|
||
|
|
"loss": 0.4152,
|
||
|
|
"step": 624
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0339548279520994,
|
||
|
|
"grad_norm": 0.19705116304049744,
|
||
|
|
"learning_rate": 3.212161426053177e-05,
|
||
|
|
"loss": 0.4128,
|
||
|
|
"step": 625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0388055176595423,
|
||
|
|
"grad_norm": 0.19649678060258355,
|
||
|
|
"learning_rate": 3.19887555991516e-05,
|
||
|
|
"loss": 0.4129,
|
||
|
|
"step": 626
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.043656207366985,
|
||
|
|
"grad_norm": 0.21708556979200094,
|
||
|
|
"learning_rate": 3.1855988948790866e-05,
|
||
|
|
"loss": 0.419,
|
||
|
|
"step": 627
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.048506897074428,
|
||
|
|
"grad_norm": 0.1975715363358727,
|
||
|
|
"learning_rate": 3.172331583430567e-05,
|
||
|
|
"loss": 0.4179,
|
||
|
|
"step": 628
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0533575867818707,
|
||
|
|
"grad_norm": 0.23335868145268443,
|
||
|
|
"learning_rate": 3.1590737779477825e-05,
|
||
|
|
"loss": 0.4187,
|
||
|
|
"step": 629
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0582082764893133,
|
||
|
|
"grad_norm": 0.16312780113553543,
|
||
|
|
"learning_rate": 3.145825630699734e-05,
|
||
|
|
"loss": 0.4145,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.063058966196756,
|
||
|
|
"grad_norm": 0.1995031684584169,
|
||
|
|
"learning_rate": 3.1325872938444995e-05,
|
||
|
|
"loss": 0.4269,
|
||
|
|
"step": 631
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0679096559041987,
|
||
|
|
"grad_norm": 0.16321617015370155,
|
||
|
|
"learning_rate": 3.119358919427478e-05,
|
||
|
|
"loss": 0.4189,
|
||
|
|
"step": 632
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0727603456116417,
|
||
|
|
"grad_norm": 0.18410001775256316,
|
||
|
|
"learning_rate": 3.106140659379652e-05,
|
||
|
|
"loss": 0.4218,
|
||
|
|
"step": 633
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0776110353190846,
|
||
|
|
"grad_norm": 0.15016856412332835,
|
||
|
|
"learning_rate": 3.092932665515837e-05,
|
||
|
|
"loss": 0.4142,
|
||
|
|
"step": 634
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.082461725026527,
|
||
|
|
"grad_norm": 0.1502499895895468,
|
||
|
|
"learning_rate": 3.079735089532935e-05,
|
||
|
|
"loss": 0.4092,
|
||
|
|
"step": 635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.08731241473397,
|
||
|
|
"grad_norm": 0.1564011362374121,
|
||
|
|
"learning_rate": 3.0665480830082e-05,
|
||
|
|
"loss": 0.4176,
|
||
|
|
"step": 636
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0921631044414126,
|
||
|
|
"grad_norm": 0.15490177035198396,
|
||
|
|
"learning_rate": 3.0533717973974924e-05,
|
||
|
|
"loss": 0.4171,
|
||
|
|
"step": 637
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0970137941488556,
|
||
|
|
"grad_norm": 0.16842243538179394,
|
||
|
|
"learning_rate": 3.040206384033542e-05,
|
||
|
|
"loss": 0.4142,
|
||
|
|
"step": 638
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.101864483856298,
|
||
|
|
"grad_norm": 0.15647723657119908,
|
||
|
|
"learning_rate": 3.0270519941242052e-05,
|
||
|
|
"loss": 0.4045,
|
||
|
|
"step": 639
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.106715173563741,
|
||
|
|
"grad_norm": 0.14187950473293476,
|
||
|
|
"learning_rate": 3.0139087787507323e-05,
|
||
|
|
"loss": 0.4162,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.111565863271184,
|
||
|
|
"grad_norm": 0.1486994798612613,
|
||
|
|
"learning_rate": 3.0007768888660337e-05,
|
||
|
|
"loss": 0.4162,
|
||
|
|
"step": 641
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1164165529786265,
|
||
|
|
"grad_norm": 0.14045982542928215,
|
||
|
|
"learning_rate": 2.9876564752929406e-05,
|
||
|
|
"loss": 0.423,
|
||
|
|
"step": 642
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1212672426860695,
|
||
|
|
"grad_norm": 0.14911944405394412,
|
||
|
|
"learning_rate": 2.9745476887224806e-05,
|
||
|
|
"loss": 0.4186,
|
||
|
|
"step": 643
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1261179323935124,
|
||
|
|
"grad_norm": 0.130768356118423,
|
||
|
|
"learning_rate": 2.961450679712135e-05,
|
||
|
|
"loss": 0.4149,
|
||
|
|
"step": 644
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.130968622100955,
|
||
|
|
"grad_norm": 0.15344876789210227,
|
||
|
|
"learning_rate": 2.9483655986841265e-05,
|
||
|
|
"loss": 0.4185,
|
||
|
|
"step": 645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.135819311808398,
|
||
|
|
"grad_norm": 0.16373681347456412,
|
||
|
|
"learning_rate": 2.9352925959236732e-05,
|
||
|
|
"loss": 0.4199,
|
||
|
|
"step": 646
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1406700015158404,
|
||
|
|
"grad_norm": 0.13681917154319687,
|
||
|
|
"learning_rate": 2.92223182157728e-05,
|
||
|
|
"loss": 0.4187,
|
||
|
|
"step": 647
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1455206912232834,
|
||
|
|
"grad_norm": 0.16376916195733163,
|
||
|
|
"learning_rate": 2.909183425650996e-05,
|
||
|
|
"loss": 0.4144,
|
||
|
|
"step": 648
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.150371380930726,
|
||
|
|
"grad_norm": 0.15231760580025508,
|
||
|
|
"learning_rate": 2.8961475580087108e-05,
|
||
|
|
"loss": 0.4065,
|
||
|
|
"step": 649
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.155222070638169,
|
||
|
|
"grad_norm": 0.1482250629623317,
|
||
|
|
"learning_rate": 2.8831243683704162e-05,
|
||
|
|
"loss": 0.4167,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.160072760345612,
|
||
|
|
"grad_norm": 0.1279306947970543,
|
||
|
|
"learning_rate": 2.8701140063104996e-05,
|
||
|
|
"loss": 0.4163,
|
||
|
|
"step": 651
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1649234500530543,
|
||
|
|
"grad_norm": 0.14839352063841615,
|
||
|
|
"learning_rate": 2.857116621256018e-05,
|
||
|
|
"loss": 0.4066,
|
||
|
|
"step": 652
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1697741397604973,
|
||
|
|
"grad_norm": 0.14182099741632095,
|
||
|
|
"learning_rate": 2.8441323624849827e-05,
|
||
|
|
"loss": 0.4073,
|
||
|
|
"step": 653
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.17462482946794,
|
||
|
|
"grad_norm": 0.14062989686711488,
|
||
|
|
"learning_rate": 2.83116137912465e-05,
|
||
|
|
"loss": 0.4148,
|
||
|
|
"step": 654
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1794755191753827,
|
||
|
|
"grad_norm": 0.13350262763245205,
|
||
|
|
"learning_rate": 2.8182038201498038e-05,
|
||
|
|
"loss": 0.4185,
|
||
|
|
"step": 655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1843262088828257,
|
||
|
|
"grad_norm": 0.14000623741594295,
|
||
|
|
"learning_rate": 2.8052598343810474e-05,
|
||
|
|
"loss": 0.4084,
|
||
|
|
"step": 656
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.189176898590268,
|
||
|
|
"grad_norm": 0.13922618624994615,
|
||
|
|
"learning_rate": 2.7923295704830868e-05,
|
||
|
|
"loss": 0.4209,
|
||
|
|
"step": 657
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.194027588297711,
|
||
|
|
"grad_norm": 0.13424360868636123,
|
||
|
|
"learning_rate": 2.7794131769630355e-05,
|
||
|
|
"loss": 0.4203,
|
||
|
|
"step": 658
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1988782780051537,
|
||
|
|
"grad_norm": 0.1410971557670894,
|
||
|
|
"learning_rate": 2.7665108021687007e-05,
|
||
|
|
"loss": 0.4229,
|
||
|
|
"step": 659
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2037289677125966,
|
||
|
|
"grad_norm": 0.15216131334925778,
|
||
|
|
"learning_rate": 2.753622594286879e-05,
|
||
|
|
"loss": 0.4145,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2085796574200396,
|
||
|
|
"grad_norm": 0.1261567069468345,
|
||
|
|
"learning_rate": 2.7407487013416615e-05,
|
||
|
|
"loss": 0.4083,
|
||
|
|
"step": 661
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.213430347127482,
|
||
|
|
"grad_norm": 0.15853568579534694,
|
||
|
|
"learning_rate": 2.727889271192722e-05,
|
||
|
|
"loss": 0.4187,
|
||
|
|
"step": 662
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.218281036834925,
|
||
|
|
"grad_norm": 0.1151093763460037,
|
||
|
|
"learning_rate": 2.715044451533631e-05,
|
||
|
|
"loss": 0.4164,
|
||
|
|
"step": 663
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2231317265423676,
|
||
|
|
"grad_norm": 0.14296777545925654,
|
||
|
|
"learning_rate": 2.702214389890152e-05,
|
||
|
|
"loss": 0.413,
|
||
|
|
"step": 664
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2279824162498105,
|
||
|
|
"grad_norm": 0.12844538251966497,
|
||
|
|
"learning_rate": 2.6893992336185512e-05,
|
||
|
|
"loss": 0.4035,
|
||
|
|
"step": 665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2328331059572535,
|
||
|
|
"grad_norm": 0.13036276585432338,
|
||
|
|
"learning_rate": 2.6765991299039025e-05,
|
||
|
|
"loss": 0.4145,
|
||
|
|
"step": 666
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.237683795664696,
|
||
|
|
"grad_norm": 0.12243507743746514,
|
||
|
|
"learning_rate": 2.663814225758393e-05,
|
||
|
|
"loss": 0.4117,
|
||
|
|
"step": 667
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.242534485372139,
|
||
|
|
"grad_norm": 0.14245972122975337,
|
||
|
|
"learning_rate": 2.6510446680196448e-05,
|
||
|
|
"loss": 0.4195,
|
||
|
|
"step": 668
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2473851750795815,
|
||
|
|
"grad_norm": 0.12799039988824565,
|
||
|
|
"learning_rate": 2.638290603349023e-05,
|
||
|
|
"loss": 0.4203,
|
||
|
|
"step": 669
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2522358647870244,
|
||
|
|
"grad_norm": 0.12961756855915293,
|
||
|
|
"learning_rate": 2.625552178229949e-05,
|
||
|
|
"loss": 0.4159,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2570865544944674,
|
||
|
|
"grad_norm": 0.1296944618260485,
|
||
|
|
"learning_rate": 2.612829538966218e-05,
|
||
|
|
"loss": 0.4111,
|
||
|
|
"step": 671
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.26193724420191,
|
||
|
|
"grad_norm": 0.14508116679367689,
|
||
|
|
"learning_rate": 2.6001228316803256e-05,
|
||
|
|
"loss": 0.4196,
|
||
|
|
"step": 672
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.266787933909353,
|
||
|
|
"grad_norm": 0.1216509600449271,
|
||
|
|
"learning_rate": 2.5874322023117824e-05,
|
||
|
|
"loss": 0.4162,
|
||
|
|
"step": 673
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2716386236167954,
|
||
|
|
"grad_norm": 0.1407698696044201,
|
||
|
|
"learning_rate": 2.5747577966154404e-05,
|
||
|
|
"loss": 0.4165,
|
||
|
|
"step": 674
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2764893133242383,
|
||
|
|
"grad_norm": 0.1142728140236527,
|
||
|
|
"learning_rate": 2.5620997601598215e-05,
|
||
|
|
"loss": 0.4076,
|
||
|
|
"step": 675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.281340003031681,
|
||
|
|
"grad_norm": 0.14212475123175447,
|
||
|
|
"learning_rate": 2.5494582383254388e-05,
|
||
|
|
"loss": 0.4174,
|
||
|
|
"step": 676
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.286190692739124,
|
||
|
|
"grad_norm": 0.11798282133422631,
|
||
|
|
"learning_rate": 2.5368333763031324e-05,
|
||
|
|
"loss": 0.4131,
|
||
|
|
"step": 677
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2910413824465667,
|
||
|
|
"grad_norm": 0.14618443390836333,
|
||
|
|
"learning_rate": 2.5242253190924034e-05,
|
||
|
|
"loss": 0.4092,
|
||
|
|
"step": 678
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2958920721540093,
|
||
|
|
"grad_norm": 0.12815235524330332,
|
||
|
|
"learning_rate": 2.5116342114997442e-05,
|
||
|
|
"loss": 0.409,
|
||
|
|
"step": 679
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.300742761861452,
|
||
|
|
"grad_norm": 0.1346079173937725,
|
||
|
|
"learning_rate": 2.4990601981369737e-05,
|
||
|
|
"loss": 0.4201,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.305593451568895,
|
||
|
|
"grad_norm": 0.12020401678661803,
|
||
|
|
"learning_rate": 2.4865034234195834e-05,
|
||
|
|
"loss": 0.4107,
|
||
|
|
"step": 681
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3104441412763377,
|
||
|
|
"grad_norm": 0.1342860022847603,
|
||
|
|
"learning_rate": 2.4739640315650747e-05,
|
||
|
|
"loss": 0.4145,
|
||
|
|
"step": 682
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3152948309837806,
|
||
|
|
"grad_norm": 0.11394559661374248,
|
||
|
|
"learning_rate": 2.4614421665912997e-05,
|
||
|
|
"loss": 0.4213,
|
||
|
|
"step": 683
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.320145520691223,
|
||
|
|
"grad_norm": 0.126805017438777,
|
||
|
|
"learning_rate": 2.4489379723148147e-05,
|
||
|
|
"loss": 0.4129,
|
||
|
|
"step": 684
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.324996210398666,
|
||
|
|
"grad_norm": 0.11817811298525939,
|
||
|
|
"learning_rate": 2.4364515923492187e-05,
|
||
|
|
"loss": 0.4193,
|
||
|
|
"step": 685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3298469001061086,
|
||
|
|
"grad_norm": 0.1342332761111468,
|
||
|
|
"learning_rate": 2.4239831701035143e-05,
|
||
|
|
"loss": 0.418,
|
||
|
|
"step": 686
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3346975898135516,
|
||
|
|
"grad_norm": 0.12474527460295737,
|
||
|
|
"learning_rate": 2.411532848780451e-05,
|
||
|
|
"loss": 0.4166,
|
||
|
|
"step": 687
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3395482795209945,
|
||
|
|
"grad_norm": 0.1329059048345405,
|
||
|
|
"learning_rate": 2.399100771374888e-05,
|
||
|
|
"loss": 0.4138,
|
||
|
|
"step": 688
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.344398969228437,
|
||
|
|
"grad_norm": 0.11628441384991241,
|
||
|
|
"learning_rate": 2.3866870806721495e-05,
|
||
|
|
"loss": 0.4111,
|
||
|
|
"step": 689
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.34924965893588,
|
||
|
|
"grad_norm": 0.1453864432398833,
|
||
|
|
"learning_rate": 2.37429191924638e-05,
|
||
|
|
"loss": 0.42,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.354100348643323,
|
||
|
|
"grad_norm": 0.11264715608612798,
|
||
|
|
"learning_rate": 2.361915429458913e-05,
|
||
|
|
"loss": 0.417,
|
||
|
|
"step": 691
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3589510383507655,
|
||
|
|
"grad_norm": 0.14248256005839127,
|
||
|
|
"learning_rate": 2.349557753456637e-05,
|
||
|
|
"loss": 0.4168,
|
||
|
|
"step": 692
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3638017280582084,
|
||
|
|
"grad_norm": 0.12398727462550883,
|
||
|
|
"learning_rate": 2.3372190331703556e-05,
|
||
|
|
"loss": 0.4189,
|
||
|
|
"step": 693
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.368652417765651,
|
||
|
|
"grad_norm": 0.11869463547338541,
|
||
|
|
"learning_rate": 2.324899410313161e-05,
|
||
|
|
"loss": 0.4125,
|
||
|
|
"step": 694
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.373503107473094,
|
||
|
|
"grad_norm": 0.12377753865053726,
|
||
|
|
"learning_rate": 2.3125990263788118e-05,
|
||
|
|
"loss": 0.4186,
|
||
|
|
"step": 695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3783537971805364,
|
||
|
|
"grad_norm": 0.13141561076446406,
|
||
|
|
"learning_rate": 2.3003180226400986e-05,
|
||
|
|
"loss": 0.4123,
|
||
|
|
"step": 696
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3832044868879794,
|
||
|
|
"grad_norm": 0.11816327910326484,
|
||
|
|
"learning_rate": 2.288056540147229e-05,
|
||
|
|
"loss": 0.4129,
|
||
|
|
"step": 697
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3880551765954223,
|
||
|
|
"grad_norm": 0.12260669143866527,
|
||
|
|
"learning_rate": 2.275814719726201e-05,
|
||
|
|
"loss": 0.4133,
|
||
|
|
"step": 698
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.392905866302865,
|
||
|
|
"grad_norm": 0.133084483131333,
|
||
|
|
"learning_rate": 2.263592701977193e-05,
|
||
|
|
"loss": 0.4219,
|
||
|
|
"step": 699
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.397756556010308,
|
||
|
|
"grad_norm": 0.11848073628628028,
|
||
|
|
"learning_rate": 2.2513906272729397e-05,
|
||
|
|
"loss": 0.4143,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4026072457177503,
|
||
|
|
"grad_norm": 0.12858950370510128,
|
||
|
|
"learning_rate": 2.239208635757133e-05,
|
||
|
|
"loss": 0.4166,
|
||
|
|
"step": 701
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4074579354251933,
|
||
|
|
"grad_norm": 0.11722692816596028,
|
||
|
|
"learning_rate": 2.2270468673428004e-05,
|
||
|
|
"loss": 0.4259,
|
||
|
|
"step": 702
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.412308625132636,
|
||
|
|
"grad_norm": 0.11830608786302087,
|
||
|
|
"learning_rate": 2.2149054617106974e-05,
|
||
|
|
"loss": 0.407,
|
||
|
|
"step": 703
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4171593148400787,
|
||
|
|
"grad_norm": 0.12285165179780408,
|
||
|
|
"learning_rate": 2.2027845583077175e-05,
|
||
|
|
"loss": 0.4231,
|
||
|
|
"step": 704
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4220100045475217,
|
||
|
|
"grad_norm": 0.12884220459818324,
|
||
|
|
"learning_rate": 2.1906842963452757e-05,
|
||
|
|
"loss": 0.4069,
|
||
|
|
"step": 705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.426860694254964,
|
||
|
|
"grad_norm": 0.10700415399515635,
|
||
|
|
"learning_rate": 2.178604814797715e-05,
|
||
|
|
"loss": 0.4149,
|
||
|
|
"step": 706
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.431711383962407,
|
||
|
|
"grad_norm": 0.11853515304365536,
|
||
|
|
"learning_rate": 2.1665462524007162e-05,
|
||
|
|
"loss": 0.4125,
|
||
|
|
"step": 707
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.43656207366985,
|
||
|
|
"grad_norm": 0.10772813217204756,
|
||
|
|
"learning_rate": 2.1545087476496903e-05,
|
||
|
|
"loss": 0.4216,
|
||
|
|
"step": 708
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4414127633772926,
|
||
|
|
"grad_norm": 0.12946073793938545,
|
||
|
|
"learning_rate": 2.1424924387981996e-05,
|
||
|
|
"loss": 0.4227,
|
||
|
|
"step": 709
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4462634530847356,
|
||
|
|
"grad_norm": 0.11256888467579416,
|
||
|
|
"learning_rate": 2.1304974638563715e-05,
|
||
|
|
"loss": 0.4116,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.451114142792178,
|
||
|
|
"grad_norm": 0.12334394272902899,
|
||
|
|
"learning_rate": 2.1185239605893013e-05,
|
||
|
|
"loss": 0.4217,
|
||
|
|
"step": 711
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.455964832499621,
|
||
|
|
"grad_norm": 0.11467056354627854,
|
||
|
|
"learning_rate": 2.106572066515482e-05,
|
||
|
|
"loss": 0.4193,
|
||
|
|
"step": 712
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.460815522207064,
|
||
|
|
"grad_norm": 0.11248104529388973,
|
||
|
|
"learning_rate": 2.0946419189052162e-05,
|
||
|
|
"loss": 0.4147,
|
||
|
|
"step": 713
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4656662119145065,
|
||
|
|
"grad_norm": 0.11839620237634582,
|
||
|
|
"learning_rate": 2.0827336547790452e-05,
|
||
|
|
"loss": 0.4214,
|
||
|
|
"step": 714
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4705169016219495,
|
||
|
|
"grad_norm": 0.11350680266976185,
|
||
|
|
"learning_rate": 2.0708474109061752e-05,
|
||
|
|
"loss": 0.416,
|
||
|
|
"step": 715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.475367591329392,
|
||
|
|
"grad_norm": 0.11310394993568851,
|
||
|
|
"learning_rate": 2.0589833238029032e-05,
|
||
|
|
"loss": 0.4001,
|
||
|
|
"step": 716
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.480218281036835,
|
||
|
|
"grad_norm": 0.1142115014102716,
|
||
|
|
"learning_rate": 2.0471415297310455e-05,
|
||
|
|
"loss": 0.4158,
|
||
|
|
"step": 717
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.485068970744278,
|
||
|
|
"grad_norm": 0.12114855144347077,
|
||
|
|
"learning_rate": 2.0353221646963864e-05,
|
||
|
|
"loss": 0.41,
|
||
|
|
"step": 718
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4899196604517204,
|
||
|
|
"grad_norm": 0.11155618948058961,
|
||
|
|
"learning_rate": 2.0235253644471012e-05,
|
||
|
|
"loss": 0.4226,
|
||
|
|
"step": 719
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4947703501591634,
|
||
|
|
"grad_norm": 0.12346483161916652,
|
||
|
|
"learning_rate": 2.011751264472206e-05,
|
||
|
|
"loss": 0.4163,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.499621039866606,
|
||
|
|
"grad_norm": 0.1314504891806893,
|
||
|
|
"learning_rate": 2.0000000000000012e-05,
|
||
|
|
"loss": 0.4185,
|
||
|
|
"step": 721
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.504471729574049,
|
||
|
|
"grad_norm": 0.11587490363769114,
|
||
|
|
"learning_rate": 1.9882717059965086e-05,
|
||
|
|
"loss": 0.4243,
|
||
|
|
"step": 722
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5093224192814914,
|
||
|
|
"grad_norm": 0.147492515392888,
|
||
|
|
"learning_rate": 1.9765665171639345e-05,
|
||
|
|
"loss": 0.4139,
|
||
|
|
"step": 723
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5141731089889343,
|
||
|
|
"grad_norm": 0.13161876983358525,
|
||
|
|
"learning_rate": 1.964884567939118e-05,
|
||
|
|
"loss": 0.4023,
|
||
|
|
"step": 724
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5190237986963773,
|
||
|
|
"grad_norm": 0.1288012773595963,
|
||
|
|
"learning_rate": 1.9532259924919823e-05,
|
||
|
|
"loss": 0.416,
|
||
|
|
"step": 725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.52387448840382,
|
||
|
|
"grad_norm": 0.10979902623938809,
|
||
|
|
"learning_rate": 1.9415909247239996e-05,
|
||
|
|
"loss": 0.4198,
|
||
|
|
"step": 726
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5287251781112627,
|
||
|
|
"grad_norm": 0.12273321372911124,
|
||
|
|
"learning_rate": 1.9299794982666485e-05,
|
||
|
|
"loss": 0.4084,
|
||
|
|
"step": 727
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5335758678187057,
|
||
|
|
"grad_norm": 0.11485189001172974,
|
||
|
|
"learning_rate": 1.9183918464798837e-05,
|
||
|
|
"loss": 0.408,
|
||
|
|
"step": 728
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.538426557526148,
|
||
|
|
"grad_norm": 0.1142263107632135,
|
||
|
|
"learning_rate": 1.906828102450601e-05,
|
||
|
|
"loss": 0.4131,
|
||
|
|
"step": 729
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.543277247233591,
|
||
|
|
"grad_norm": 0.11235720950088848,
|
||
|
|
"learning_rate": 1.895288398991114e-05,
|
||
|
|
"loss": 0.4102,
|
||
|
|
"step": 730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5481279369410337,
|
||
|
|
"grad_norm": 0.11476446551397697,
|
||
|
|
"learning_rate": 1.8837728686376158e-05,
|
||
|
|
"loss": 0.4176,
|
||
|
|
"step": 731
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5529786266484766,
|
||
|
|
"grad_norm": 0.12252689326129268,
|
||
|
|
"learning_rate": 1.8722816436486754e-05,
|
||
|
|
"loss": 0.4238,
|
||
|
|
"step": 732
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.557829316355919,
|
||
|
|
"grad_norm": 0.10742367387279236,
|
||
|
|
"learning_rate": 1.8608148560037036e-05,
|
||
|
|
"loss": 0.4248,
|
||
|
|
"step": 733
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.562680006063362,
|
||
|
|
"grad_norm": 0.11202485484827082,
|
||
|
|
"learning_rate": 1.8493726374014442e-05,
|
||
|
|
"loss": 0.4047,
|
||
|
|
"step": 734
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.567530695770805,
|
||
|
|
"grad_norm": 0.10619927645163178,
|
||
|
|
"learning_rate": 1.8379551192584588e-05,
|
||
|
|
"loss": 0.4105,
|
||
|
|
"step": 735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5723813854782476,
|
||
|
|
"grad_norm": 0.1104031551739802,
|
||
|
|
"learning_rate": 1.826562432707619e-05,
|
||
|
|
"loss": 0.4158,
|
||
|
|
"step": 736
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5772320751856905,
|
||
|
|
"grad_norm": 0.10920848831138874,
|
||
|
|
"learning_rate": 1.8151947085965994e-05,
|
||
|
|
"loss": 0.4157,
|
||
|
|
"step": 737
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5820827648931335,
|
||
|
|
"grad_norm": 0.10703865180962287,
|
||
|
|
"learning_rate": 1.803852077486377e-05,
|
||
|
|
"loss": 0.4144,
|
||
|
|
"step": 738
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.586933454600576,
|
||
|
|
"grad_norm": 0.1043548569854331,
|
||
|
|
"learning_rate": 1.7925346696497295e-05,
|
||
|
|
"loss": 0.4082,
|
||
|
|
"step": 739
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.591784144308019,
|
||
|
|
"grad_norm": 0.11138455070453787,
|
||
|
|
"learning_rate": 1.781242615069733e-05,
|
||
|
|
"loss": 0.4137,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5966348340154615,
|
||
|
|
"grad_norm": 0.11820322046978973,
|
||
|
|
"learning_rate": 1.7699760434382853e-05,
|
||
|
|
"loss": 0.4108,
|
||
|
|
"step": 741
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6014855237229044,
|
||
|
|
"grad_norm": 0.11278678223744326,
|
||
|
|
"learning_rate": 1.758735084154601e-05,
|
||
|
|
"loss": 0.4189,
|
||
|
|
"step": 742
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.606336213430347,
|
||
|
|
"grad_norm": 0.11323272041806605,
|
||
|
|
"learning_rate": 1.7475198663237297e-05,
|
||
|
|
"loss": 0.4123,
|
||
|
|
"step": 743
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.61118690313779,
|
||
|
|
"grad_norm": 0.10904989882061365,
|
||
|
|
"learning_rate": 1.736330518755082e-05,
|
||
|
|
"loss": 0.4158,
|
||
|
|
"step": 744
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.616037592845233,
|
||
|
|
"grad_norm": 0.11229772775053595,
|
||
|
|
"learning_rate": 1.7251671699609313e-05,
|
||
|
|
"loss": 0.4182,
|
||
|
|
"step": 745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6208882825526754,
|
||
|
|
"grad_norm": 0.10951334908001022,
|
||
|
|
"learning_rate": 1.7140299481549557e-05,
|
||
|
|
"loss": 0.4213,
|
||
|
|
"step": 746
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6257389722601183,
|
||
|
|
"grad_norm": 0.11434493342191,
|
||
|
|
"learning_rate": 1.7029189812507603e-05,
|
||
|
|
"loss": 0.4224,
|
||
|
|
"step": 747
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6305896619675613,
|
||
|
|
"grad_norm": 0.10344385368037658,
|
||
|
|
"learning_rate": 1.6918343968604027e-05,
|
||
|
|
"loss": 0.4106,
|
||
|
|
"step": 748
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.635440351675004,
|
||
|
|
"grad_norm": 0.11168549168378746,
|
||
|
|
"learning_rate": 1.6807763222929315e-05,
|
||
|
|
"loss": 0.408,
|
||
|
|
"step": 749
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6402910413824463,
|
||
|
|
"grad_norm": 0.11594040802990377,
|
||
|
|
"learning_rate": 1.669744884552926e-05,
|
||
|
|
"loss": 0.4169,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6451417310898893,
|
||
|
|
"grad_norm": 0.11452998735643415,
|
||
|
|
"learning_rate": 1.6587402103390314e-05,
|
||
|
|
"loss": 0.4162,
|
||
|
|
"step": 751
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.649992420797332,
|
||
|
|
"grad_norm": 0.11033578052460767,
|
||
|
|
"learning_rate": 1.6477624260425137e-05,
|
||
|
|
"loss": 0.4179,
|
||
|
|
"step": 752
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6548431105047747,
|
||
|
|
"grad_norm": 0.1198668006175803,
|
||
|
|
"learning_rate": 1.6368116577457973e-05,
|
||
|
|
"loss": 0.4124,
|
||
|
|
"step": 753
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6596938002122177,
|
||
|
|
"grad_norm": 0.10884921001547737,
|
||
|
|
"learning_rate": 1.6258880312210195e-05,
|
||
|
|
"loss": 0.4152,
|
||
|
|
"step": 754
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6645444899196606,
|
||
|
|
"grad_norm": 0.12009675101549108,
|
||
|
|
"learning_rate": 1.6149916719285942e-05,
|
||
|
|
"loss": 0.4147,
|
||
|
|
"step": 755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.669395179627103,
|
||
|
|
"grad_norm": 0.11742205741912104,
|
||
|
|
"learning_rate": 1.6041227050157607e-05,
|
||
|
|
"loss": 0.4096,
|
||
|
|
"step": 756
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.674245869334546,
|
||
|
|
"grad_norm": 0.1123251375365744,
|
||
|
|
"learning_rate": 1.5932812553151506e-05,
|
||
|
|
"loss": 0.4128,
|
||
|
|
"step": 757
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6790965590419886,
|
||
|
|
"grad_norm": 0.12009728599563303,
|
||
|
|
"learning_rate": 1.582467447343355e-05,
|
||
|
|
"loss": 0.4179,
|
||
|
|
"step": 758
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6839472487494316,
|
||
|
|
"grad_norm": 0.10679356522304362,
|
||
|
|
"learning_rate": 1.5716814052994928e-05,
|
||
|
|
"loss": 0.4153,
|
||
|
|
"step": 759
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.688797938456874,
|
||
|
|
"grad_norm": 0.11287526417521035,
|
||
|
|
"learning_rate": 1.5609232530637827e-05,
|
||
|
|
"loss": 0.4065,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.693648628164317,
|
||
|
|
"grad_norm": 0.10604174975295146,
|
||
|
|
"learning_rate": 1.5501931141961278e-05,
|
||
|
|
"loss": 0.4135,
|
||
|
|
"step": 761
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.69849931787176,
|
||
|
|
"grad_norm": 0.10895447717123838,
|
||
|
|
"learning_rate": 1.539491111934686e-05,
|
||
|
|
"loss": 0.4102,
|
||
|
|
"step": 762
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7033500075792025,
|
||
|
|
"grad_norm": 0.11522106117016057,
|
||
|
|
"learning_rate": 1.5288173691944613e-05,
|
||
|
|
"loss": 0.4193,
|
||
|
|
"step": 763
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7082006972866455,
|
||
|
|
"grad_norm": 0.11668799730823959,
|
||
|
|
"learning_rate": 1.5181720085658906e-05,
|
||
|
|
"loss": 0.4131,
|
||
|
|
"step": 764
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7130513869940884,
|
||
|
|
"grad_norm": 0.1179757070897769,
|
||
|
|
"learning_rate": 1.5075551523134358e-05,
|
||
|
|
"loss": 0.4107,
|
||
|
|
"step": 765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.717902076701531,
|
||
|
|
"grad_norm": 0.11352197320512793,
|
||
|
|
"learning_rate": 1.4969669223741771e-05,
|
||
|
|
"loss": 0.4093,
|
||
|
|
"step": 766
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.722752766408974,
|
||
|
|
"grad_norm": 0.11945491424386492,
|
||
|
|
"learning_rate": 1.4864074403564216e-05,
|
||
|
|
"loss": 0.4142,
|
||
|
|
"step": 767
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7276034561164164,
|
||
|
|
"grad_norm": 0.10892285815783607,
|
||
|
|
"learning_rate": 1.4758768275382887e-05,
|
||
|
|
"loss": 0.4205,
|
||
|
|
"step": 768
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7324541458238594,
|
||
|
|
"grad_norm": 0.12765157069597566,
|
||
|
|
"learning_rate": 1.4653752048663394e-05,
|
||
|
|
"loss": 0.412,
|
||
|
|
"step": 769
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.737304835531302,
|
||
|
|
"grad_norm": 0.10956587187939422,
|
||
|
|
"learning_rate": 1.4549026929541693e-05,
|
||
|
|
"loss": 0.4148,
|
||
|
|
"step": 770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.742155525238745,
|
||
|
|
"grad_norm": 0.12138544117757244,
|
||
|
|
"learning_rate": 1.4444594120810326e-05,
|
||
|
|
"loss": 0.4115,
|
||
|
|
"step": 771
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.747006214946188,
|
||
|
|
"grad_norm": 0.10697996518645103,
|
||
|
|
"learning_rate": 1.4340454821904573e-05,
|
||
|
|
"loss": 0.4194,
|
||
|
|
"step": 772
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7518569046536303,
|
||
|
|
"grad_norm": 0.10998535306961314,
|
||
|
|
"learning_rate": 1.4236610228888683e-05,
|
||
|
|
"loss": 0.4143,
|
||
|
|
"step": 773
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7567075943610733,
|
||
|
|
"grad_norm": 0.10998459629638649,
|
||
|
|
"learning_rate": 1.4133061534442133e-05,
|
||
|
|
"loss": 0.4121,
|
||
|
|
"step": 774
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7615582840685162,
|
||
|
|
"grad_norm": 0.1056241184630642,
|
||
|
|
"learning_rate": 1.4029809927845981e-05,
|
||
|
|
"loss": 0.4146,
|
||
|
|
"step": 775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7664089737759587,
|
||
|
|
"grad_norm": 0.11617390368347923,
|
||
|
|
"learning_rate": 1.3926856594969115e-05,
|
||
|
|
"loss": 0.4125,
|
||
|
|
"step": 776
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7712596634834017,
|
||
|
|
"grad_norm": 0.10105741678809248,
|
||
|
|
"learning_rate": 1.3824202718254655e-05,
|
||
|
|
"loss": 0.4081,
|
||
|
|
"step": 777
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.776110353190844,
|
||
|
|
"grad_norm": 0.12303653861990478,
|
||
|
|
"learning_rate": 1.3721849476706477e-05,
|
||
|
|
"loss": 0.413,
|
||
|
|
"step": 778
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.780961042898287,
|
||
|
|
"grad_norm": 0.10128855312982177,
|
||
|
|
"learning_rate": 1.3619798045875529e-05,
|
||
|
|
"loss": 0.4117,
|
||
|
|
"step": 779
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7858117326057297,
|
||
|
|
"grad_norm": 0.1118697000457396,
|
||
|
|
"learning_rate": 1.3518049597846412e-05,
|
||
|
|
"loss": 0.4097,
|
||
|
|
"step": 780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7906624223131726,
|
||
|
|
"grad_norm": 0.10689143155371568,
|
||
|
|
"learning_rate": 1.3416605301223893e-05,
|
||
|
|
"loss": 0.4146,
|
||
|
|
"step": 781
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7955131120206156,
|
||
|
|
"grad_norm": 0.09942573069367382,
|
||
|
|
"learning_rate": 1.3315466321119486e-05,
|
||
|
|
"loss": 0.415,
|
||
|
|
"step": 782
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.800363801728058,
|
||
|
|
"grad_norm": 0.112223020708749,
|
||
|
|
"learning_rate": 1.3214633819138105e-05,
|
||
|
|
"loss": 0.4187,
|
||
|
|
"step": 783
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.805214491435501,
|
||
|
|
"grad_norm": 0.09509376603334437,
|
||
|
|
"learning_rate": 1.3114108953364655e-05,
|
||
|
|
"loss": 0.4083,
|
||
|
|
"step": 784
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.810065181142944,
|
||
|
|
"grad_norm": 0.1039694903593983,
|
||
|
|
"learning_rate": 1.3013892878350771e-05,
|
||
|
|
"loss": 0.415,
|
||
|
|
"step": 785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8149158708503865,
|
||
|
|
"grad_norm": 0.09799457517015436,
|
||
|
|
"learning_rate": 1.2913986745101567e-05,
|
||
|
|
"loss": 0.4082,
|
||
|
|
"step": 786
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8197665605578295,
|
||
|
|
"grad_norm": 0.0994744466895985,
|
||
|
|
"learning_rate": 1.2814391701062392e-05,
|
||
|
|
"loss": 0.416,
|
||
|
|
"step": 787
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.824617250265272,
|
||
|
|
"grad_norm": 0.09937882084294014,
|
||
|
|
"learning_rate": 1.2715108890105663e-05,
|
||
|
|
"loss": 0.4118,
|
||
|
|
"step": 788
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.829467939972715,
|
||
|
|
"grad_norm": 0.09366936781008306,
|
||
|
|
"learning_rate": 1.2616139452517748e-05,
|
||
|
|
"loss": 0.4202,
|
||
|
|
"step": 789
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8343186296801575,
|
||
|
|
"grad_norm": 0.10103247526201467,
|
||
|
|
"learning_rate": 1.2517484524985836e-05,
|
||
|
|
"loss": 0.414,
|
||
|
|
"step": 790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8391693193876004,
|
||
|
|
"grad_norm": 0.10154281976161658,
|
||
|
|
"learning_rate": 1.2419145240584856e-05,
|
||
|
|
"loss": 0.4169,
|
||
|
|
"step": 791
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8440200090950434,
|
||
|
|
"grad_norm": 0.09917419975099584,
|
||
|
|
"learning_rate": 1.2321122728764566e-05,
|
||
|
|
"loss": 0.4121,
|
||
|
|
"step": 792
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.848870698802486,
|
||
|
|
"grad_norm": 0.10067017561953691,
|
||
|
|
"learning_rate": 1.222341811533648e-05,
|
||
|
|
"loss": 0.4177,
|
||
|
|
"step": 793
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.853721388509929,
|
||
|
|
"grad_norm": 0.10318548830281854,
|
||
|
|
"learning_rate": 1.2126032522460975e-05,
|
||
|
|
"loss": 0.4211,
|
||
|
|
"step": 794
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.858572078217372,
|
||
|
|
"grad_norm": 0.09616344349182201,
|
||
|
|
"learning_rate": 1.2028967068634417e-05,
|
||
|
|
"loss": 0.4204,
|
||
|
|
"step": 795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8634227679248143,
|
||
|
|
"grad_norm": 0.10489574786705688,
|
||
|
|
"learning_rate": 1.193222286867628e-05,
|
||
|
|
"loss": 0.4119,
|
||
|
|
"step": 796
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.868273457632257,
|
||
|
|
"grad_norm": 0.1008569871750799,
|
||
|
|
"learning_rate": 1.1835801033716372e-05,
|
||
|
|
"loss": 0.4086,
|
||
|
|
"step": 797
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8731241473397,
|
||
|
|
"grad_norm": 0.09122317803788128,
|
||
|
|
"learning_rate": 1.1739702671182083e-05,
|
||
|
|
"loss": 0.4214,
|
||
|
|
"step": 798
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8779748370471427,
|
||
|
|
"grad_norm": 0.0991904412555539,
|
||
|
|
"learning_rate": 1.1643928884785618e-05,
|
||
|
|
"loss": 0.4095,
|
||
|
|
"step": 799
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8828255267545853,
|
||
|
|
"grad_norm": 0.0959491624589887,
|
||
|
|
"learning_rate": 1.1548480774511353e-05,
|
||
|
|
"loss": 0.4218,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.887676216462028,
|
||
|
|
"grad_norm": 0.09541302821779325,
|
||
|
|
"learning_rate": 1.1453359436603213e-05,
|
||
|
|
"loss": 0.4218,
|
||
|
|
"step": 801
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.892526906169471,
|
||
|
|
"grad_norm": 0.09054297930670004,
|
||
|
|
"learning_rate": 1.1358565963552039e-05,
|
||
|
|
"loss": 0.421,
|
||
|
|
"step": 802
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8973775958769137,
|
||
|
|
"grad_norm": 0.09175434223796135,
|
||
|
|
"learning_rate": 1.126410144408312e-05,
|
||
|
|
"loss": 0.4088,
|
||
|
|
"step": 803
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9022282855843566,
|
||
|
|
"grad_norm": 0.0933968969265613,
|
||
|
|
"learning_rate": 1.1169966963143568e-05,
|
||
|
|
"loss": 0.4105,
|
||
|
|
"step": 804
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.907078975291799,
|
||
|
|
"grad_norm": 0.09827723734859882,
|
||
|
|
"learning_rate": 1.1076163601889953e-05,
|
||
|
|
"loss": 0.4114,
|
||
|
|
"step": 805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.911929664999242,
|
||
|
|
"grad_norm": 0.09185288445531155,
|
||
|
|
"learning_rate": 1.098269243767589e-05,
|
||
|
|
"loss": 0.4099,
|
||
|
|
"step": 806
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9167803547066846,
|
||
|
|
"grad_norm": 0.10344811490009215,
|
||
|
|
"learning_rate": 1.0889554544039593e-05,
|
||
|
|
"loss": 0.417,
|
||
|
|
"step": 807
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9216310444141276,
|
||
|
|
"grad_norm": 0.09584223413813131,
|
||
|
|
"learning_rate": 1.0796750990691596e-05,
|
||
|
|
"loss": 0.4092,
|
||
|
|
"step": 808
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9264817341215705,
|
||
|
|
"grad_norm": 0.10225202685968936,
|
||
|
|
"learning_rate": 1.0704282843502459e-05,
|
||
|
|
"loss": 0.4156,
|
||
|
|
"step": 809
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.931332423829013,
|
||
|
|
"grad_norm": 0.10087206295646214,
|
||
|
|
"learning_rate": 1.0612151164490525e-05,
|
||
|
|
"loss": 0.4209,
|
||
|
|
"step": 810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.936183113536456,
|
||
|
|
"grad_norm": 0.09495839512641314,
|
||
|
|
"learning_rate": 1.0520357011809707e-05,
|
||
|
|
"loss": 0.4193,
|
||
|
|
"step": 811
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.941033803243899,
|
||
|
|
"grad_norm": 0.10115287872844174,
|
||
|
|
"learning_rate": 1.0428901439737387e-05,
|
||
|
|
"loss": 0.415,
|
||
|
|
"step": 812
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9458844929513415,
|
||
|
|
"grad_norm": 0.09538434619439141,
|
||
|
|
"learning_rate": 1.0337785498662223e-05,
|
||
|
|
"loss": 0.4152,
|
||
|
|
"step": 813
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9507351826587844,
|
||
|
|
"grad_norm": 0.0955931932252973,
|
||
|
|
"learning_rate": 1.024701023507216e-05,
|
||
|
|
"loss": 0.4153,
|
||
|
|
"step": 814
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.955585872366227,
|
||
|
|
"grad_norm": 0.10045702643945939,
|
||
|
|
"learning_rate": 1.015657669154237e-05,
|
||
|
|
"loss": 0.4156,
|
||
|
|
"step": 815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.96043656207367,
|
||
|
|
"grad_norm": 0.09335473216661304,
|
||
|
|
"learning_rate": 1.00664859067233e-05,
|
||
|
|
"loss": 0.4109,
|
||
|
|
"step": 816
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9652872517811124,
|
||
|
|
"grad_norm": 0.09645722262367523,
|
||
|
|
"learning_rate": 9.976738915328719e-06,
|
||
|
|
"loss": 0.4107,
|
||
|
|
"step": 817
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9701379414885554,
|
||
|
|
"grad_norm": 0.09446841411423582,
|
||
|
|
"learning_rate": 9.887336748123864e-06,
|
||
|
|
"loss": 0.4178,
|
||
|
|
"step": 818
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9749886311959983,
|
||
|
|
"grad_norm": 0.0903235693705833,
|
||
|
|
"learning_rate": 9.798280431913558e-06,
|
||
|
|
"loss": 0.4202,
|
||
|
|
"step": 819
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.979839320903441,
|
||
|
|
"grad_norm": 0.0948091272844266,
|
||
|
|
"learning_rate": 9.709570989530493e-06,
|
||
|
|
"loss": 0.4123,
|
||
|
|
"step": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.984690010610884,
|
||
|
|
"grad_norm": 0.10103242493534337,
|
||
|
|
"learning_rate": 9.621209439823388e-06,
|
||
|
|
"loss": 0.4132,
|
||
|
|
"step": 821
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9895407003183267,
|
||
|
|
"grad_norm": 0.09199808648298305,
|
||
|
|
"learning_rate": 9.533196797645354e-06,
|
||
|
|
"loss": 0.4101,
|
||
|
|
"step": 822
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9943913900257693,
|
||
|
|
"grad_norm": 0.09866422487619428,
|
||
|
|
"learning_rate": 9.44553407384221e-06,
|
||
|
|
"loss": 0.412,
|
||
|
|
"step": 823
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.999242079733212,
|
||
|
|
"grad_norm": 0.09950215009808663,
|
||
|
|
"learning_rate": 9.358222275240884e-06,
|
||
|
|
"loss": 0.4113,
|
||
|
|
"step": 824
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.0048506897074425,
|
||
|
|
"grad_norm": 0.16227668213643984,
|
||
|
|
"learning_rate": 9.271262404637835e-06,
|
||
|
|
"loss": 0.4032,
|
||
|
|
"step": 825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.009701379414886,
|
||
|
|
"grad_norm": 0.11430363283166092,
|
||
|
|
"learning_rate": 9.184655460787591e-06,
|
||
|
|
"loss": 0.3988,
|
||
|
|
"step": 826
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.014552069122328,
|
||
|
|
"grad_norm": 0.11751760908320912,
|
||
|
|
"learning_rate": 9.098402438391161e-06,
|
||
|
|
"loss": 0.3943,
|
||
|
|
"step": 827
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.019402758829771,
|
||
|
|
"grad_norm": 0.13225951402360853,
|
||
|
|
"learning_rate": 9.012504328084724e-06,
|
||
|
|
"loss": 0.4024,
|
||
|
|
"step": 828
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.0242534485372135,
|
||
|
|
"grad_norm": 0.1269725744791709,
|
||
|
|
"learning_rate": 8.926962116428228e-06,
|
||
|
|
"loss": 0.4,
|
||
|
|
"step": 829
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.029104138244657,
|
||
|
|
"grad_norm": 0.12436207204015083,
|
||
|
|
"learning_rate": 8.841776785894014e-06,
|
||
|
|
"loss": 0.3994,
|
||
|
|
"step": 830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.033954827952099,
|
||
|
|
"grad_norm": 0.12882961013897004,
|
||
|
|
"learning_rate": 8.756949314855565e-06,
|
||
|
|
"loss": 0.3977,
|
||
|
|
"step": 831
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.038805517659542,
|
||
|
|
"grad_norm": 0.11685196489455994,
|
||
|
|
"learning_rate": 8.672480677576267e-06,
|
||
|
|
"loss": 0.3906,
|
||
|
|
"step": 832
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.043656207366985,
|
||
|
|
"grad_norm": 0.106568471380297,
|
||
|
|
"learning_rate": 8.58837184419821e-06,
|
||
|
|
"loss": 0.388,
|
||
|
|
"step": 833
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.048506897074428,
|
||
|
|
"grad_norm": 0.1134401036269532,
|
||
|
|
"learning_rate": 8.504623780731056e-06,
|
||
|
|
"loss": 0.3918,
|
||
|
|
"step": 834
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.05335758678187,
|
||
|
|
"grad_norm": 0.12093411095557142,
|
||
|
|
"learning_rate": 8.421237449040962e-06,
|
||
|
|
"loss": 0.397,
|
||
|
|
"step": 835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.058208276489314,
|
||
|
|
"grad_norm": 0.11674197768223235,
|
||
|
|
"learning_rate": 8.338213806839453e-06,
|
||
|
|
"loss": 0.393,
|
||
|
|
"step": 836
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.063058966196756,
|
||
|
|
"grad_norm": 0.10082248897884766,
|
||
|
|
"learning_rate": 8.255553807672547e-06,
|
||
|
|
"loss": 0.3897,
|
||
|
|
"step": 837
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.067909655904199,
|
||
|
|
"grad_norm": 0.11069126863357161,
|
||
|
|
"learning_rate": 8.1732584009097e-06,
|
||
|
|
"loss": 0.3882,
|
||
|
|
"step": 838
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.072760345611641,
|
||
|
|
"grad_norm": 0.11738423041846735,
|
||
|
|
"learning_rate": 8.091328531732925e-06,
|
||
|
|
"loss": 0.3959,
|
||
|
|
"step": 839
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.077611035319085,
|
||
|
|
"grad_norm": 0.10387395387726432,
|
||
|
|
"learning_rate": 8.009765141126014e-06,
|
||
|
|
"loss": 0.3891,
|
||
|
|
"step": 840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.082461725026527,
|
||
|
|
"grad_norm": 0.10003982048853301,
|
||
|
|
"learning_rate": 7.928569165863584e-06,
|
||
|
|
"loss": 0.3909,
|
||
|
|
"step": 841
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.08731241473397,
|
||
|
|
"grad_norm": 0.10549262492820612,
|
||
|
|
"learning_rate": 7.847741538500439e-06,
|
||
|
|
"loss": 0.3875,
|
||
|
|
"step": 842
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.092163104441413,
|
||
|
|
"grad_norm": 0.10986857014715488,
|
||
|
|
"learning_rate": 7.767283187360846e-06,
|
||
|
|
"loss": 0.3929,
|
||
|
|
"step": 843
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.097013794148856,
|
||
|
|
"grad_norm": 0.09674503227632227,
|
||
|
|
"learning_rate": 7.687195036527813e-06,
|
||
|
|
"loss": 0.3928,
|
||
|
|
"step": 844
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.101864483856298,
|
||
|
|
"grad_norm": 0.10305617355184506,
|
||
|
|
"learning_rate": 7.60747800583252e-06,
|
||
|
|
"loss": 0.4002,
|
||
|
|
"step": 845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.1067151735637415,
|
||
|
|
"grad_norm": 0.09835145173574618,
|
||
|
|
"learning_rate": 7.52813301084375e-06,
|
||
|
|
"loss": 0.3961,
|
||
|
|
"step": 846
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.111565863271184,
|
||
|
|
"grad_norm": 0.09426867245890704,
|
||
|
|
"learning_rate": 7.449160962857358e-06,
|
||
|
|
"loss": 0.3946,
|
||
|
|
"step": 847
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.1164165529786265,
|
||
|
|
"grad_norm": 0.0993123791236732,
|
||
|
|
"learning_rate": 7.370562768885823e-06,
|
||
|
|
"loss": 0.3937,
|
||
|
|
"step": 848
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.121267242686069,
|
||
|
|
"grad_norm": 0.09692548506896993,
|
||
|
|
"learning_rate": 7.292339331647848e-06,
|
||
|
|
"loss": 0.3957,
|
||
|
|
"step": 849
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.126117932393512,
|
||
|
|
"grad_norm": 0.0977639150148905,
|
||
|
|
"learning_rate": 7.214491549557898e-06,
|
||
|
|
"loss": 0.3969,
|
||
|
|
"step": 850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.130968622100955,
|
||
|
|
"grad_norm": 0.10100358672328608,
|
||
|
|
"learning_rate": 7.1370203167160326e-06,
|
||
|
|
"loss": 0.3937,
|
||
|
|
"step": 851
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.1358193118083975,
|
||
|
|
"grad_norm": 0.0961508995875077,
|
||
|
|
"learning_rate": 7.0599265228975e-06,
|
||
|
|
"loss": 0.3965,
|
||
|
|
"step": 852
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.140670001515841,
|
||
|
|
"grad_norm": 0.09450992963252156,
|
||
|
|
"learning_rate": 6.983211053542591e-06,
|
||
|
|
"loss": 0.4008,
|
||
|
|
"step": 853
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.145520691223283,
|
||
|
|
"grad_norm": 0.10411768031595499,
|
||
|
|
"learning_rate": 6.9068747897464535e-06,
|
||
|
|
"loss": 0.4032,
|
||
|
|
"step": 854
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.150371380930726,
|
||
|
|
"grad_norm": 0.09981896937364662,
|
||
|
|
"learning_rate": 6.830918608248964e-06,
|
||
|
|
"loss": 0.4002,
|
||
|
|
"step": 855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.155222070638169,
|
||
|
|
"grad_norm": 0.08754517232013051,
|
||
|
|
"learning_rate": 6.755343381424659e-06,
|
||
|
|
"loss": 0.3976,
|
||
|
|
"step": 856
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.160072760345612,
|
||
|
|
"grad_norm": 0.09520671649907132,
|
||
|
|
"learning_rate": 6.68014997727275e-06,
|
||
|
|
"loss": 0.3921,
|
||
|
|
"step": 857
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.164923450053054,
|
||
|
|
"grad_norm": 0.09719121632502135,
|
||
|
|
"learning_rate": 6.605339259407104e-06,
|
||
|
|
"loss": 0.3852,
|
||
|
|
"step": 858
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.169774139760497,
|
||
|
|
"grad_norm": 0.08870928687089744,
|
||
|
|
"learning_rate": 6.530912087046317e-06,
|
||
|
|
"loss": 0.395,
|
||
|
|
"step": 859
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.17462482946794,
|
||
|
|
"grad_norm": 0.08685694037296439,
|
||
|
|
"learning_rate": 6.456869315003946e-06,
|
||
|
|
"loss": 0.3941,
|
||
|
|
"step": 860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.179475519175383,
|
||
|
|
"grad_norm": 0.08904997313439429,
|
||
|
|
"learning_rate": 6.3832117936785564e-06,
|
||
|
|
"loss": 0.3997,
|
||
|
|
"step": 861
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.184326208882825,
|
||
|
|
"grad_norm": 0.0910482544729166,
|
||
|
|
"learning_rate": 6.309940369044047e-06,
|
||
|
|
"loss": 0.3945,
|
||
|
|
"step": 862
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.189176898590269,
|
||
|
|
"grad_norm": 0.0887134263309384,
|
||
|
|
"learning_rate": 6.23705588263992e-06,
|
||
|
|
"loss": 0.3881,
|
||
|
|
"step": 863
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.194027588297711,
|
||
|
|
"grad_norm": 0.08977043998387933,
|
||
|
|
"learning_rate": 6.164559171561553e-06,
|
||
|
|
"loss": 0.3957,
|
||
|
|
"step": 864
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.198878278005154,
|
||
|
|
"grad_norm": 0.09216637353138708,
|
||
|
|
"learning_rate": 6.092451068450671e-06,
|
||
|
|
"loss": 0.3969,
|
||
|
|
"step": 865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.203728967712596,
|
||
|
|
"grad_norm": 0.08464526503746563,
|
||
|
|
"learning_rate": 6.020732401485751e-06,
|
||
|
|
"loss": 0.3896,
|
||
|
|
"step": 866
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.20857965742004,
|
||
|
|
"grad_norm": 0.08983668958241023,
|
||
|
|
"learning_rate": 5.9494039943724845e-06,
|
||
|
|
"loss": 0.4025,
|
||
|
|
"step": 867
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.213430347127482,
|
||
|
|
"grad_norm": 0.08753368121579265,
|
||
|
|
"learning_rate": 5.878466666334341e-06,
|
||
|
|
"loss": 0.3954,
|
||
|
|
"step": 868
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.218281036834925,
|
||
|
|
"grad_norm": 0.08687937990016806,
|
||
|
|
"learning_rate": 5.80792123210316e-06,
|
||
|
|
"loss": 0.391,
|
||
|
|
"step": 869
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.223131726542368,
|
||
|
|
"grad_norm": 0.08701962255264487,
|
||
|
|
"learning_rate": 5.737768501909773e-06,
|
||
|
|
"loss": 0.3965,
|
||
|
|
"step": 870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.2279824162498105,
|
||
|
|
"grad_norm": 0.08811499746627446,
|
||
|
|
"learning_rate": 5.668009281474751e-06,
|
||
|
|
"loss": 0.3846,
|
||
|
|
"step": 871
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.232833105957253,
|
||
|
|
"grad_norm": 0.08561633996061273,
|
||
|
|
"learning_rate": 5.598644371999085e-06,
|
||
|
|
"loss": 0.3919,
|
||
|
|
"step": 872
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.237683795664696,
|
||
|
|
"grad_norm": 0.08321419945310787,
|
||
|
|
"learning_rate": 5.5296745701549906e-06,
|
||
|
|
"loss": 0.394,
|
||
|
|
"step": 873
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.242534485372139,
|
||
|
|
"grad_norm": 0.08724643182123827,
|
||
|
|
"learning_rate": 5.4611006680768305e-06,
|
||
|
|
"loss": 0.3994,
|
||
|
|
"step": 874
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.2473851750795815,
|
||
|
|
"grad_norm": 0.08361301460174729,
|
||
|
|
"learning_rate": 5.3929234533519345e-06,
|
||
|
|
"loss": 0.388,
|
||
|
|
"step": 875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.252235864787025,
|
||
|
|
"grad_norm": 0.0862576560444418,
|
||
|
|
"learning_rate": 5.325143709011587e-06,
|
||
|
|
"loss": 0.3942,
|
||
|
|
"step": 876
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.257086554494467,
|
||
|
|
"grad_norm": 0.082606577377262,
|
||
|
|
"learning_rate": 5.257762213522055e-06,
|
||
|
|
"loss": 0.3907,
|
||
|
|
"step": 877
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.26193724420191,
|
||
|
|
"grad_norm": 0.08601520095290771,
|
||
|
|
"learning_rate": 5.19077974077558e-06,
|
||
|
|
"loss": 0.3927,
|
||
|
|
"step": 878
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.266787933909352,
|
||
|
|
"grad_norm": 0.08144847498379731,
|
||
|
|
"learning_rate": 5.124197060081564e-06,
|
||
|
|
"loss": 0.3981,
|
||
|
|
"step": 879
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.271638623616796,
|
||
|
|
"grad_norm": 0.08369648223405692,
|
||
|
|
"learning_rate": 5.058014936157714e-06,
|
||
|
|
"loss": 0.3987,
|
||
|
|
"step": 880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.276489313324238,
|
||
|
|
"grad_norm": 0.08487450387282452,
|
||
|
|
"learning_rate": 4.992234129121225e-06,
|
||
|
|
"loss": 0.3993,
|
||
|
|
"step": 881
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.281340003031681,
|
||
|
|
"grad_norm": 0.0835928002976722,
|
||
|
|
"learning_rate": 4.926855394480079e-06,
|
||
|
|
"loss": 0.394,
|
||
|
|
"step": 882
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.286190692739124,
|
||
|
|
"grad_norm": 0.0877663910781713,
|
||
|
|
"learning_rate": 4.861879483124372e-06,
|
||
|
|
"loss": 0.3888,
|
||
|
|
"step": 883
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.291041382446567,
|
||
|
|
"grad_norm": 0.08201902566316537,
|
||
|
|
"learning_rate": 4.797307141317666e-06,
|
||
|
|
"loss": 0.3978,
|
||
|
|
"step": 884
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.295892072154009,
|
||
|
|
"grad_norm": 0.08252391090570044,
|
||
|
|
"learning_rate": 4.7331391106884364e-06,
|
||
|
|
"loss": 0.3949,
|
||
|
|
"step": 885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.300742761861452,
|
||
|
|
"grad_norm": 0.08121494056766716,
|
||
|
|
"learning_rate": 4.6693761282215766e-06,
|
||
|
|
"loss": 0.3922,
|
||
|
|
"step": 886
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.305593451568895,
|
||
|
|
"grad_norm": 0.08414265476669228,
|
||
|
|
"learning_rate": 4.606018926249851e-06,
|
||
|
|
"loss": 0.389,
|
||
|
|
"step": 887
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.310444141276338,
|
||
|
|
"grad_norm": 0.08528892500629966,
|
||
|
|
"learning_rate": 4.543068232445596e-06,
|
||
|
|
"loss": 0.3956,
|
||
|
|
"step": 888
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.31529483098378,
|
||
|
|
"grad_norm": 0.08280199210655267,
|
||
|
|
"learning_rate": 4.480524769812276e-06,
|
||
|
|
"loss": 0.3938,
|
||
|
|
"step": 889
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.320145520691224,
|
||
|
|
"grad_norm": 0.08884076809121294,
|
||
|
|
"learning_rate": 4.418389256676206e-06,
|
||
|
|
"loss": 0.3947,
|
||
|
|
"step": 890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.324996210398666,
|
||
|
|
"grad_norm": 0.08902762031211295,
|
||
|
|
"learning_rate": 4.35666240667834e-06,
|
||
|
|
"loss": 0.3907,
|
||
|
|
"step": 891
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.329846900106109,
|
||
|
|
"grad_norm": 0.08464410835355753,
|
||
|
|
"learning_rate": 4.295344928765999e-06,
|
||
|
|
"loss": 0.3939,
|
||
|
|
"step": 892
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.334697589813552,
|
||
|
|
"grad_norm": 0.08347421805827179,
|
||
|
|
"learning_rate": 4.234437527184785e-06,
|
||
|
|
"loss": 0.3985,
|
||
|
|
"step": 893
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.3395482795209945,
|
||
|
|
"grad_norm": 0.08770395898608876,
|
||
|
|
"learning_rate": 4.173940901470488e-06,
|
||
|
|
"loss": 0.395,
|
||
|
|
"step": 894
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.344398969228437,
|
||
|
|
"grad_norm": 0.08139356257791996,
|
||
|
|
"learning_rate": 4.11385574644104e-06,
|
||
|
|
"loss": 0.39,
|
||
|
|
"step": 895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.34924965893588,
|
||
|
|
"grad_norm": 0.0818047074224124,
|
||
|
|
"learning_rate": 4.054182752188501e-06,
|
||
|
|
"loss": 0.3948,
|
||
|
|
"step": 896
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.354100348643323,
|
||
|
|
"grad_norm": 0.08370905382784635,
|
||
|
|
"learning_rate": 3.994922604071217e-06,
|
||
|
|
"loss": 0.3941,
|
||
|
|
"step": 897
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.3589510383507655,
|
||
|
|
"grad_norm": 0.08543140680473625,
|
||
|
|
"learning_rate": 3.936075982705871e-06,
|
||
|
|
"loss": 0.3982,
|
||
|
|
"step": 898
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.363801728058208,
|
||
|
|
"grad_norm": 0.08330546263387466,
|
||
|
|
"learning_rate": 3.877643563959694e-06,
|
||
|
|
"loss": 0.3986,
|
||
|
|
"step": 899
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.368652417765651,
|
||
|
|
"grad_norm": 0.08181359063571672,
|
||
|
|
"learning_rate": 3.819626018942732e-06,
|
||
|
|
"loss": 0.3962,
|
||
|
|
"step": 900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.373503107473094,
|
||
|
|
"grad_norm": 0.08458157848919085,
|
||
|
|
"learning_rate": 3.762024014000054e-06,
|
||
|
|
"loss": 0.3974,
|
||
|
|
"step": 901
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.378353797180536,
|
||
|
|
"grad_norm": 0.0797245693813596,
|
||
|
|
"learning_rate": 3.7048382107042113e-06,
|
||
|
|
"loss": 0.3849,
|
||
|
|
"step": 902
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.38320448688798,
|
||
|
|
"grad_norm": 0.08552325047075819,
|
||
|
|
"learning_rate": 3.6480692658475446e-06,
|
||
|
|
"loss": 0.3908,
|
||
|
|
"step": 903
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.388055176595422,
|
||
|
|
"grad_norm": 0.08191614537028945,
|
||
|
|
"learning_rate": 3.5917178314346955e-06,
|
||
|
|
"loss": 0.398,
|
||
|
|
"step": 904
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.392905866302865,
|
||
|
|
"grad_norm": 0.08324472646577967,
|
||
|
|
"learning_rate": 3.535784554675088e-06,
|
||
|
|
"loss": 0.3941,
|
||
|
|
"step": 905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.397756556010307,
|
||
|
|
"grad_norm": 0.08386476178169076,
|
||
|
|
"learning_rate": 3.480270077975525e-06,
|
||
|
|
"loss": 0.395,
|
||
|
|
"step": 906
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.402607245717751,
|
||
|
|
"grad_norm": 0.08759720428686872,
|
||
|
|
"learning_rate": 3.42517503893276e-06,
|
||
|
|
"loss": 0.3879,
|
||
|
|
"step": 907
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.407457935425193,
|
||
|
|
"grad_norm": 0.0834695533465509,
|
||
|
|
"learning_rate": 3.370500070326257e-06,
|
||
|
|
"loss": 0.3832,
|
||
|
|
"step": 908
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.412308625132636,
|
||
|
|
"grad_norm": 0.080547419306159,
|
||
|
|
"learning_rate": 3.3162458001108332e-06,
|
||
|
|
"loss": 0.3858,
|
||
|
|
"step": 909
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.417159314840079,
|
||
|
|
"grad_norm": 0.08406690651169581,
|
||
|
|
"learning_rate": 3.2624128514094778e-06,
|
||
|
|
"loss": 0.3923,
|
||
|
|
"step": 910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.422010004547522,
|
||
|
|
"grad_norm": 0.08417275645743631,
|
||
|
|
"learning_rate": 3.20900184250625e-06,
|
||
|
|
"loss": 0.3933,
|
||
|
|
"step": 911
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.426860694254964,
|
||
|
|
"grad_norm": 0.08104238775596306,
|
||
|
|
"learning_rate": 3.1560133868390895e-06,
|
||
|
|
"loss": 0.4023,
|
||
|
|
"step": 912
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.431711383962407,
|
||
|
|
"grad_norm": 0.08376361425082632,
|
||
|
|
"learning_rate": 3.1034480929928333e-06,
|
||
|
|
"loss": 0.399,
|
||
|
|
"step": 913
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.43656207366985,
|
||
|
|
"grad_norm": 0.08058350379685782,
|
||
|
|
"learning_rate": 3.0513065646921957e-06,
|
||
|
|
"loss": 0.3946,
|
||
|
|
"step": 914
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.441412763377293,
|
||
|
|
"grad_norm": 0.08013760556674378,
|
||
|
|
"learning_rate": 2.999589400794851e-06,
|
||
|
|
"loss": 0.392,
|
||
|
|
"step": 915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.446263453084735,
|
||
|
|
"grad_norm": 0.08082216615145961,
|
||
|
|
"learning_rate": 2.948297195284546e-06,
|
||
|
|
"loss": 0.3916,
|
||
|
|
"step": 916
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.4511141427921785,
|
||
|
|
"grad_norm": 0.09007717527915819,
|
||
|
|
"learning_rate": 2.897430537264283e-06,
|
||
|
|
"loss": 0.3947,
|
||
|
|
"step": 917
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.455964832499621,
|
||
|
|
"grad_norm": 0.0866390993026153,
|
||
|
|
"learning_rate": 2.8469900109495553e-06,
|
||
|
|
"loss": 0.3942,
|
||
|
|
"step": 918
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.460815522207064,
|
||
|
|
"grad_norm": 0.08034663815912857,
|
||
|
|
"learning_rate": 2.79697619566162e-06,
|
||
|
|
"loss": 0.3888,
|
||
|
|
"step": 919
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.465666211914507,
|
||
|
|
"grad_norm": 0.08423465438674188,
|
||
|
|
"learning_rate": 2.7473896658208743e-06,
|
||
|
|
"loss": 0.391,
|
||
|
|
"step": 920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.4705169016219495,
|
||
|
|
"grad_norm": 0.08769412223500794,
|
||
|
|
"learning_rate": 2.6982309909402293e-06,
|
||
|
|
"loss": 0.3936,
|
||
|
|
"step": 921
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.475367591329392,
|
||
|
|
"grad_norm": 0.0794092004599977,
|
||
|
|
"learning_rate": 2.649500735618582e-06,
|
||
|
|
"loss": 0.3993,
|
||
|
|
"step": 922
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.480218281036835,
|
||
|
|
"grad_norm": 0.08321318361659834,
|
||
|
|
"learning_rate": 2.6011994595343516e-06,
|
||
|
|
"loss": 0.3965,
|
||
|
|
"step": 923
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.485068970744278,
|
||
|
|
"grad_norm": 0.08400144661884966,
|
||
|
|
"learning_rate": 2.5533277174389916e-06,
|
||
|
|
"loss": 0.3927,
|
||
|
|
"step": 924
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.48991966045172,
|
||
|
|
"grad_norm": 0.08039773750467258,
|
||
|
|
"learning_rate": 2.5058860591506973e-06,
|
||
|
|
"loss": 0.3927,
|
||
|
|
"step": 925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.494770350159163,
|
||
|
|
"grad_norm": 0.08132175209524008,
|
||
|
|
"learning_rate": 2.4588750295480246e-06,
|
||
|
|
"loss": 0.3888,
|
||
|
|
"step": 926
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.499621039866606,
|
||
|
|
"grad_norm": 0.07920562392368859,
|
||
|
|
"learning_rate": 2.4122951685636674e-06,
|
||
|
|
"loss": 0.3896,
|
||
|
|
"step": 927
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.504471729574049,
|
||
|
|
"grad_norm": 0.07833571502263627,
|
||
|
|
"learning_rate": 2.366147011178246e-06,
|
||
|
|
"loss": 0.398,
|
||
|
|
"step": 928
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.509322419281491,
|
||
|
|
"grad_norm": 0.08232920047142565,
|
||
|
|
"learning_rate": 2.320431087414159e-06,
|
||
|
|
"loss": 0.3838,
|
||
|
|
"step": 929
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.514173108988935,
|
||
|
|
"grad_norm": 0.08136833742803433,
|
||
|
|
"learning_rate": 2.275147922329506e-06,
|
||
|
|
"loss": 0.3935,
|
||
|
|
"step": 930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.519023798696377,
|
||
|
|
"grad_norm": 0.08383396678636225,
|
||
|
|
"learning_rate": 2.230298036012055e-06,
|
||
|
|
"loss": 0.3913,
|
||
|
|
"step": 931
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.52387448840382,
|
||
|
|
"grad_norm": 0.08000391439211,
|
||
|
|
"learning_rate": 2.1858819435732583e-06,
|
||
|
|
"loss": 0.395,
|
||
|
|
"step": 932
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.528725178111262,
|
||
|
|
"grad_norm": 0.080147749223563,
|
||
|
|
"learning_rate": 2.141900155142351e-06,
|
||
|
|
"loss": 0.3951,
|
||
|
|
"step": 933
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.533575867818706,
|
||
|
|
"grad_norm": 0.07978655829531454,
|
||
|
|
"learning_rate": 2.0983531758604726e-06,
|
||
|
|
"loss": 0.4011,
|
||
|
|
"step": 934
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.538426557526148,
|
||
|
|
"grad_norm": 0.08386421814730027,
|
||
|
|
"learning_rate": 2.055241505874892e-06,
|
||
|
|
"loss": 0.3999,
|
||
|
|
"step": 935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.543277247233591,
|
||
|
|
"grad_norm": 0.08174904946083562,
|
||
|
|
"learning_rate": 2.0125656403332396e-06,
|
||
|
|
"loss": 0.3968,
|
||
|
|
"step": 936
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.548127936941034,
|
||
|
|
"grad_norm": 0.07927025601942647,
|
||
|
|
"learning_rate": 1.970326069377828e-06,
|
||
|
|
"loss": 0.4001,
|
||
|
|
"step": 937
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.552978626648477,
|
||
|
|
"grad_norm": 0.0783638905343862,
|
||
|
|
"learning_rate": 1.928523278140033e-06,
|
||
|
|
"loss": 0.3943,
|
||
|
|
"step": 938
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.557829316355919,
|
||
|
|
"grad_norm": 0.08012821453349714,
|
||
|
|
"learning_rate": 1.887157746734718e-06,
|
||
|
|
"loss": 0.3972,
|
||
|
|
"step": 939
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.562680006063362,
|
||
|
|
"grad_norm": 0.08022389886437123,
|
||
|
|
"learning_rate": 1.846229950254692e-06,
|
||
|
|
"loss": 0.3888,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.567530695770805,
|
||
|
|
"grad_norm": 0.07938938767749412,
|
||
|
|
"learning_rate": 1.8057403587652977e-06,
|
||
|
|
"loss": 0.3892,
|
||
|
|
"step": 941
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.572381385478248,
|
||
|
|
"grad_norm": 0.07979264917062728,
|
||
|
|
"learning_rate": 1.7656894372989785e-06,
|
||
|
|
"loss": 0.3991,
|
||
|
|
"step": 942
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.57723207518569,
|
||
|
|
"grad_norm": 0.07900416574598519,
|
||
|
|
"learning_rate": 1.726077645849955e-06,
|
||
|
|
"loss": 0.4033,
|
||
|
|
"step": 943
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.5820827648931335,
|
||
|
|
"grad_norm": 0.0816244017270053,
|
||
|
|
"learning_rate": 1.6869054393689265e-06,
|
||
|
|
"loss": 0.394,
|
||
|
|
"step": 944
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.586933454600576,
|
||
|
|
"grad_norm": 0.07678354405433441,
|
||
|
|
"learning_rate": 1.6481732677578798e-06,
|
||
|
|
"loss": 0.4026,
|
||
|
|
"step": 945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.5917841443080185,
|
||
|
|
"grad_norm": 0.07913436605637802,
|
||
|
|
"learning_rate": 1.60988157586488e-06,
|
||
|
|
"loss": 0.378,
|
||
|
|
"step": 946
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.596634834015462,
|
||
|
|
"grad_norm": 0.0793297266554538,
|
||
|
|
"learning_rate": 1.5720308034789721e-06,
|
||
|
|
"loss": 0.391,
|
||
|
|
"step": 947
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.601485523722904,
|
||
|
|
"grad_norm": 0.07767980761029898,
|
||
|
|
"learning_rate": 1.5346213853251546e-06,
|
||
|
|
"loss": 0.3978,
|
||
|
|
"step": 948
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.606336213430347,
|
||
|
|
"grad_norm": 0.08494507410525762,
|
||
|
|
"learning_rate": 1.4976537510593646e-06,
|
||
|
|
"loss": 0.3995,
|
||
|
|
"step": 949
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.61118690313779,
|
||
|
|
"grad_norm": 0.08237577571806884,
|
||
|
|
"learning_rate": 1.4611283252635412e-06,
|
||
|
|
"loss": 0.4038,
|
||
|
|
"step": 950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.616037592845233,
|
||
|
|
"grad_norm": 0.07590194142960284,
|
||
|
|
"learning_rate": 1.425045527440756e-06,
|
||
|
|
"loss": 0.3956,
|
||
|
|
"step": 951
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.620888282552675,
|
||
|
|
"grad_norm": 0.07954982455169565,
|
||
|
|
"learning_rate": 1.3894057720104104e-06,
|
||
|
|
"loss": 0.399,
|
||
|
|
"step": 952
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.625738972260118,
|
||
|
|
"grad_norm": 0.07731849776688487,
|
||
|
|
"learning_rate": 1.354209468303429e-06,
|
||
|
|
"loss": 0.3828,
|
||
|
|
"step": 953
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.630589661967561,
|
||
|
|
"grad_norm": 0.07970299041030604,
|
||
|
|
"learning_rate": 1.3194570205576284e-06,
|
||
|
|
"loss": 0.3954,
|
||
|
|
"step": 954
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.635440351675004,
|
||
|
|
"grad_norm": 0.07874124868840192,
|
||
|
|
"learning_rate": 1.2851488279130053e-06,
|
||
|
|
"loss": 0.3876,
|
||
|
|
"step": 955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.640291041382446,
|
||
|
|
"grad_norm": 0.07691612719760402,
|
||
|
|
"learning_rate": 1.2512852844071933e-06,
|
||
|
|
"loss": 0.3949,
|
||
|
|
"step": 956
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.64514173108989,
|
||
|
|
"grad_norm": 0.08126016720695953,
|
||
|
|
"learning_rate": 1.2178667789709287e-06,
|
||
|
|
"loss": 0.3919,
|
||
|
|
"step": 957
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.649992420797332,
|
||
|
|
"grad_norm": 0.08002643782323322,
|
||
|
|
"learning_rate": 1.1848936954235702e-06,
|
||
|
|
"loss": 0.395,
|
||
|
|
"step": 958
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.654843110504775,
|
||
|
|
"grad_norm": 0.08026194265591635,
|
||
|
|
"learning_rate": 1.1523664124687284e-06,
|
||
|
|
"loss": 0.3997,
|
||
|
|
"step": 959
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.659693800212217,
|
||
|
|
"grad_norm": 0.07660352848559206,
|
||
|
|
"learning_rate": 1.1202853036898476e-06,
|
||
|
|
"loss": 0.3974,
|
||
|
|
"step": 960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.664544489919661,
|
||
|
|
"grad_norm": 0.08113236962163348,
|
||
|
|
"learning_rate": 1.0886507375459908e-06,
|
||
|
|
"loss": 0.3981,
|
||
|
|
"step": 961
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.669395179627103,
|
||
|
|
"grad_norm": 0.07701309756204706,
|
||
|
|
"learning_rate": 1.0574630773675687e-06,
|
||
|
|
"loss": 0.3839,
|
||
|
|
"step": 962
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.674245869334546,
|
||
|
|
"grad_norm": 0.07891978078549244,
|
||
|
|
"learning_rate": 1.0267226813521635e-06,
|
||
|
|
"loss": 0.3877,
|
||
|
|
"step": 963
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.679096559041989,
|
||
|
|
"grad_norm": 0.07852321868608765,
|
||
|
|
"learning_rate": 9.964299025604274e-07,
|
||
|
|
"loss": 0.3921,
|
||
|
|
"step": 964
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.683947248749432,
|
||
|
|
"grad_norm": 0.07818107376466682,
|
||
|
|
"learning_rate": 9.66585088912022e-07,
|
||
|
|
"loss": 0.3967,
|
||
|
|
"step": 965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.688797938456874,
|
||
|
|
"grad_norm": 0.076600770628096,
|
||
|
|
"learning_rate": 9.371885831816319e-07,
|
||
|
|
"loss": 0.395,
|
||
|
|
"step": 966
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.6936486281643175,
|
||
|
|
"grad_norm": 0.07627250485843899,
|
||
|
|
"learning_rate": 9.082407229950018e-07,
|
||
|
|
"loss": 0.3976,
|
||
|
|
"step": 967
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.69849931787176,
|
||
|
|
"grad_norm": 0.07905168555654667,
|
||
|
|
"learning_rate": 8.797418408251101e-07,
|
||
|
|
"loss": 0.3918,
|
||
|
|
"step": 968
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.7033500075792025,
|
||
|
|
"grad_norm": 0.07625038396328788,
|
||
|
|
"learning_rate": 8.516922639882819e-07,
|
||
|
|
"loss": 0.3897,
|
||
|
|
"step": 969
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.708200697286646,
|
||
|
|
"grad_norm": 0.08055085830124646,
|
||
|
|
"learning_rate": 8.2409231464049e-07,
|
||
|
|
"loss": 0.3901,
|
||
|
|
"step": 970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.713051386994088,
|
||
|
|
"grad_norm": 0.07599467665769233,
|
||
|
|
"learning_rate": 7.969423097736162e-07,
|
||
|
|
"loss": 0.3931,
|
||
|
|
"step": 971
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.717902076701531,
|
||
|
|
"grad_norm": 0.08049982038815076,
|
||
|
|
"learning_rate": 7.702425612118269e-07,
|
||
|
|
"loss": 0.3962,
|
||
|
|
"step": 972
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.7227527664089735,
|
||
|
|
"grad_norm": 0.07817439712988589,
|
||
|
|
"learning_rate": 7.439933756079942e-07,
|
||
|
|
"loss": 0.3913,
|
||
|
|
"step": 973
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.727603456116417,
|
||
|
|
"grad_norm": 0.08001209525722262,
|
||
|
|
"learning_rate": 7.181950544401695e-07,
|
||
|
|
"loss": 0.3935,
|
||
|
|
"step": 974
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.732454145823859,
|
||
|
|
"grad_norm": 0.07544329449164154,
|
||
|
|
"learning_rate": 6.928478940081107e-07,
|
||
|
|
"loss": 0.3984,
|
||
|
|
"step": 975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.737304835531302,
|
||
|
|
"grad_norm": 0.07792056514995602,
|
||
|
|
"learning_rate": 6.679521854299032e-07,
|
||
|
|
"loss": 0.3914,
|
||
|
|
"step": 976
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.742155525238745,
|
||
|
|
"grad_norm": 0.07777159770613211,
|
||
|
|
"learning_rate": 6.435082146385885e-07,
|
||
|
|
"loss": 0.3901,
|
||
|
|
"step": 977
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.747006214946188,
|
||
|
|
"grad_norm": 0.07684966968716175,
|
||
|
|
"learning_rate": 6.195162623789052e-07,
|
||
|
|
"loss": 0.393,
|
||
|
|
"step": 978
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.75185690465363,
|
||
|
|
"grad_norm": 0.07663626226452194,
|
||
|
|
"learning_rate": 5.959766042040426e-07,
|
||
|
|
"loss": 0.3879,
|
||
|
|
"step": 979
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.756707594361073,
|
||
|
|
"grad_norm": 0.07653692258850123,
|
||
|
|
"learning_rate": 5.728895104724963e-07,
|
||
|
|
"loss": 0.3959,
|
||
|
|
"step": 980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.761558284068516,
|
||
|
|
"grad_norm": 0.07731590848782262,
|
||
|
|
"learning_rate": 5.502552463449418e-07,
|
||
|
|
"loss": 0.3984,
|
||
|
|
"step": 981
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.766408973775959,
|
||
|
|
"grad_norm": 0.07715742201671594,
|
||
|
|
"learning_rate": 5.280740717812149e-07,
|
||
|
|
"loss": 0.3977,
|
||
|
|
"step": 982
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.771259663483401,
|
||
|
|
"grad_norm": 0.07733908081214975,
|
||
|
|
"learning_rate": 5.063462415372967e-07,
|
||
|
|
"loss": 0.3948,
|
||
|
|
"step": 983
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.776110353190845,
|
||
|
|
"grad_norm": 0.07797117300936077,
|
||
|
|
"learning_rate": 4.850720051624124e-07,
|
||
|
|
"loss": 0.3914,
|
||
|
|
"step": 984
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.780961042898287,
|
||
|
|
"grad_norm": 0.07721734718928677,
|
||
|
|
"learning_rate": 4.642516069961556e-07,
|
||
|
|
"loss": 0.392,
|
||
|
|
"step": 985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.78581173260573,
|
||
|
|
"grad_norm": 0.07648551851063208,
|
||
|
|
"learning_rate": 4.438852861656751e-07,
|
||
|
|
"loss": 0.3951,
|
||
|
|
"step": 986
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.790662422313172,
|
||
|
|
"grad_norm": 0.07521119037010907,
|
||
|
|
"learning_rate": 4.2397327658294076e-07,
|
||
|
|
"loss": 0.3899,
|
||
|
|
"step": 987
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.795513112020616,
|
||
|
|
"grad_norm": 0.07655879860272995,
|
||
|
|
"learning_rate": 4.045158069420474e-07,
|
||
|
|
"loss": 0.3963,
|
||
|
|
"step": 988
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.800363801728058,
|
||
|
|
"grad_norm": 0.07622760847546149,
|
||
|
|
"learning_rate": 3.8551310071659023e-07,
|
||
|
|
"loss": 0.3975,
|
||
|
|
"step": 989
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.805214491435501,
|
||
|
|
"grad_norm": 0.0750428400819878,
|
||
|
|
"learning_rate": 3.6696537615711124e-07,
|
||
|
|
"loss": 0.3968,
|
||
|
|
"step": 990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.810065181142944,
|
||
|
|
"grad_norm": 0.07852243689578567,
|
||
|
|
"learning_rate": 3.4887284628857266e-07,
|
||
|
|
"loss": 0.3932,
|
||
|
|
"step": 991
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.8149158708503865,
|
||
|
|
"grad_norm": 0.0771348195411035,
|
||
|
|
"learning_rate": 3.3123571890791405e-07,
|
||
|
|
"loss": 0.3887,
|
||
|
|
"step": 992
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.819766560557829,
|
||
|
|
"grad_norm": 0.07614331045752255,
|
||
|
|
"learning_rate": 3.1405419658168125e-07,
|
||
|
|
"loss": 0.394,
|
||
|
|
"step": 993
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.824617250265272,
|
||
|
|
"grad_norm": 0.07563944264490313,
|
||
|
|
"learning_rate": 2.973284766436857e-07,
|
||
|
|
"loss": 0.3917,
|
||
|
|
"step": 994
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.829467939972715,
|
||
|
|
"grad_norm": 0.07503164664083585,
|
||
|
|
"learning_rate": 2.810587511927354e-07,
|
||
|
|
"loss": 0.3901,
|
||
|
|
"step": 995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.8343186296801575,
|
||
|
|
"grad_norm": 0.07663595734772072,
|
||
|
|
"learning_rate": 2.652452070904499e-07,
|
||
|
|
"loss": 0.3923,
|
||
|
|
"step": 996
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.839169319387601,
|
||
|
|
"grad_norm": 0.07606961706547127,
|
||
|
|
"learning_rate": 2.498880259590797e-07,
|
||
|
|
"loss": 0.3944,
|
||
|
|
"step": 997
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.844020009095043,
|
||
|
|
"grad_norm": 0.0761182148197196,
|
||
|
|
"learning_rate": 2.3498738417945034e-07,
|
||
|
|
"loss": 0.3975,
|
||
|
|
"step": 998
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.848870698802486,
|
||
|
|
"grad_norm": 0.08039104985634041,
|
||
|
|
"learning_rate": 2.205434528889283e-07,
|
||
|
|
"loss": 0.3971,
|
||
|
|
"step": 999
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.853721388509928,
|
||
|
|
"grad_norm": 0.07775606427920397,
|
||
|
|
"learning_rate": 2.0655639797944937e-07,
|
||
|
|
"loss": 0.3903,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.858572078217372,
|
||
|
|
"grad_norm": 0.07610940999590161,
|
||
|
|
"learning_rate": 1.9302638009561782e-07,
|
||
|
|
"loss": 0.396,
|
||
|
|
"step": 1001
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.863422767924814,
|
||
|
|
"grad_norm": 0.07569450912275462,
|
||
|
|
"learning_rate": 1.7995355463285457e-07,
|
||
|
|
"loss": 0.3965,
|
||
|
|
"step": 1002
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.868273457632257,
|
||
|
|
"grad_norm": 0.07643720956773195,
|
||
|
|
"learning_rate": 1.6733807173562988e-07,
|
||
|
|
"loss": 0.3913,
|
||
|
|
"step": 1003
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.8731241473397,
|
||
|
|
"grad_norm": 0.07571887859535044,
|
||
|
|
"learning_rate": 1.5518007629571342e-07,
|
||
|
|
"loss": 0.3995,
|
||
|
|
"step": 1004
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.877974837047143,
|
||
|
|
"grad_norm": 0.07657090064859913,
|
||
|
|
"learning_rate": 1.4347970795054456e-07,
|
||
|
|
"loss": 0.3967,
|
||
|
|
"step": 1005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.882825526754585,
|
||
|
|
"grad_norm": 0.07475277823538463,
|
||
|
|
"learning_rate": 1.3223710108158483e-07,
|
||
|
|
"loss": 0.3965,
|
||
|
|
"step": 1006
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.887676216462028,
|
||
|
|
"grad_norm": 0.07465351831333361,
|
||
|
|
"learning_rate": 1.214523848128124e-07,
|
||
|
|
"loss": 0.3971,
|
||
|
|
"step": 1007
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.892526906169471,
|
||
|
|
"grad_norm": 0.07637388169909817,
|
||
|
|
"learning_rate": 1.111256830092211e-07,
|
||
|
|
"loss": 0.3996,
|
||
|
|
"step": 1008
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.897377595876914,
|
||
|
|
"grad_norm": 0.07639665552172381,
|
||
|
|
"learning_rate": 1.0125711427540374e-07,
|
||
|
|
"loss": 0.3949,
|
||
|
|
"step": 1009
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.902228285584356,
|
||
|
|
"grad_norm": 0.07373092012101537,
|
||
|
|
"learning_rate": 9.184679195417989e-08,
|
||
|
|
"loss": 0.3889,
|
||
|
|
"step": 1010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.9070789752918,
|
||
|
|
"grad_norm": 0.07583698163385665,
|
||
|
|
"learning_rate": 8.289482412531246e-08,
|
||
|
|
"loss": 0.3984,
|
||
|
|
"step": 1011
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.911929664999242,
|
||
|
|
"grad_norm": 0.07647787343235872,
|
||
|
|
"learning_rate": 7.440131360424652e-08,
|
||
|
|
"loss": 0.3887,
|
||
|
|
"step": 1012
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.916780354706685,
|
||
|
|
"grad_norm": 0.07751142464425709,
|
||
|
|
"learning_rate": 6.636635794094126e-08,
|
||
|
|
"loss": 0.3908,
|
||
|
|
"step": 1013
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.921631044414128,
|
||
|
|
"grad_norm": 0.07884353160777417,
|
||
|
|
"learning_rate": 5.879004941874655e-08,
|
||
|
|
"loss": 0.3951,
|
||
|
|
"step": 1014
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.9264817341215705,
|
||
|
|
"grad_norm": 0.07661585400143268,
|
||
|
|
"learning_rate": 5.16724750533415e-08,
|
||
|
|
"loss": 0.398,
|
||
|
|
"step": 1015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.931332423829013,
|
||
|
|
"grad_norm": 0.07506936516416932,
|
||
|
|
"learning_rate": 4.5013716591730815e-08,
|
||
|
|
"loss": 0.3944,
|
||
|
|
"step": 1016
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.9361831135364564,
|
||
|
|
"grad_norm": 0.07625120541429038,
|
||
|
|
"learning_rate": 3.881385051132114e-08,
|
||
|
|
"loss": 0.3997,
|
||
|
|
"step": 1017
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.941033803243899,
|
||
|
|
"grad_norm": 0.0754548388472603,
|
||
|
|
"learning_rate": 3.307294801902838e-08,
|
||
|
|
"loss": 0.3968,
|
||
|
|
"step": 1018
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.9458844929513415,
|
||
|
|
"grad_norm": 0.07563871272528049,
|
||
|
|
"learning_rate": 2.7791075050460636e-08,
|
||
|
|
"loss": 0.3977,
|
||
|
|
"step": 1019
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.950735182658784,
|
||
|
|
"grad_norm": 0.0760975342061979,
|
||
|
|
"learning_rate": 2.2968292269167637e-08,
|
||
|
|
"loss": 0.3978,
|
||
|
|
"step": 1020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.955585872366227,
|
||
|
|
"grad_norm": 0.07484229156933196,
|
||
|
|
"learning_rate": 1.8604655065939116e-08,
|
||
|
|
"loss": 0.3888,
|
||
|
|
"step": 1021
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.96043656207367,
|
||
|
|
"grad_norm": 0.07489129192172529,
|
||
|
|
"learning_rate": 1.470021355816975e-08,
|
||
|
|
"loss": 0.3952,
|
||
|
|
"step": 1022
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.965287251781112,
|
||
|
|
"grad_norm": 0.07561234415729748,
|
||
|
|
"learning_rate": 1.1255012589286297e-08,
|
||
|
|
"loss": 0.3988,
|
||
|
|
"step": 1023
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.970137941488556,
|
||
|
|
"grad_norm": 0.07589712959511802,
|
||
|
|
"learning_rate": 8.269091728232426e-09,
|
||
|
|
"loss": 0.3858,
|
||
|
|
"step": 1024
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.974988631195998,
|
||
|
|
"grad_norm": 0.07684966203710471,
|
||
|
|
"learning_rate": 5.742485269006892e-09,
|
||
|
|
"loss": 0.3976,
|
||
|
|
"step": 1025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.979839320903441,
|
||
|
|
"grad_norm": 0.07450707523151534,
|
||
|
|
"learning_rate": 3.6752222302727238e-09,
|
||
|
|
"loss": 0.3922,
|
||
|
|
"step": 1026
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.984690010610883,
|
||
|
|
"grad_norm": 0.0754634711281639,
|
||
|
|
"learning_rate": 2.06732635503748e-09,
|
||
|
|
"loss": 0.3894,
|
||
|
|
"step": 1027
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.989540700318327,
|
||
|
|
"grad_norm": 0.07460842756020492,
|
||
|
|
"learning_rate": 9.188161103557136e-10,
|
||
|
|
"loss": 0.3893,
|
||
|
|
"step": 1028
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.994391390025769,
|
||
|
|
"grad_norm": 0.07411367814575705,
|
||
|
|
"learning_rate": 2.2970468714245132e-10,
|
||
|
|
"loss": 0.3934,
|
||
|
|
"step": 1029
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.999242079733212,
|
||
|
|
"grad_norm": 0.07533391242414708,
|
||
|
|
"learning_rate": 0.0,
|
||
|
|
"loss": 0.3877,
|
||
|
|
"step": 1030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.999242079733212,
|
||
|
|
"step": 1030,
|
||
|
|
"total_flos": 2.739131934768418e+19,
|
||
|
|
"train_loss": 0.07882811409755817,
|
||
|
|
"train_runtime": 48155.7887,
|
||
|
|
"train_samples_per_second": 10.958,
|
||
|
|
"train_steps_per_second": 0.021
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 1,
|
||
|
|
"max_steps": 1030,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 5,
|
||
|
|
"save_steps": 500,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 2.739131934768418e+19,
|
||
|
|
"train_batch_size": 1,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|